SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GUIFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <shogun/ui/GUIFeatures.h>
13 #include <shogun/ui/SGInterface.h>
14 
15 #include <shogun/lib/config.h>
16 #include <shogun/io/SGIO.h>
17 #include <shogun/io/AsciiFile.h>
18 
19 using namespace shogun;
20 
21 CGUIFeatures::CGUIFeatures(CSGInterface* ui_)
22 : CSGObject(), ui(ui_), train_features(NULL), test_features(NULL),
23  ref_features(NULL)
24 {
25 }
26 
28 {
32 }
33 
35 {
36  CKernel *k = ui->ui_kernel->get_kernel();
37  if (k)
38  k->remove_lhs();
39 }
40 
42 {
43  CKernel *k = ui->ui_kernel->get_kernel();
44  if (k)
45  k->remove_rhs();
46 }
47 
49  char* filename, char* fclass, char* type, char* target, int32_t size,
50  int32_t comp_features)
51 {
52  bool result=false;
53  CFeatures** f_ptr=NULL;
54 
55  if (strncmp(target, "TRAIN", 5)==0)
56  {
57  f_ptr=&train_features;
59  }
60  else if (strncmp(target, "TEST", 4)==0)
61  {
62  f_ptr=&test_features;
64  }
65  else
66  SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
67 
68  SG_UNREF(*f_ptr);
69  *f_ptr=NULL;
70 
71  CAsciiFile* file=new CAsciiFile(filename);
72  if (strncmp(fclass, "SIMPLE", 6)==0)
73  {
74  if (strncmp(type, "REAL", 4)==0)
75  {
76  *f_ptr=new CDenseFeatures<float64_t>(file);
77  }
78  else if (strncmp(type, "BYTE", 4)==0)
79  {
81  *f_ptr=new CDenseFeatures<uint8_t>(file);
82  }
83  else if (strncmp(type, "CHAR", 4)==0)
84  {
86  *f_ptr=new CDenseFeatures<char>(file);
87  }
88  else if (strncmp(type, "SHORT", 5)==0)
89  {
90  *f_ptr=new CDenseFeatures<int16_t>(file);
91  }
92  else
93  {
94  SG_ERROR("Unknown type.\n");
95  return false;
96  }
97  }
98  else if (strncmp(fclass, "SPARSE", 6)==0)
99  {
101  }
102  else if (strncmp(fclass, "STRING", 6)==0)
103  {
104  if (strncmp(type, "REAL", 4)==0)
105  {
106  *f_ptr=new CStringFeatures<float64_t>(file);
107  }
108  else if (strncmp(type, "BYTE", 4)==0)
109  {
111  *f_ptr=new CStringFeatures<uint8_t>(file, DNA);
112  }
113  else if (strncmp(type, "CHAR", 4)==0)
114  {
116  *f_ptr=new CStringFeatures<char>(file, DNA);
117  }
118  else if (strncmp(type, "SHORT", 5)==0)
119  {
120  *f_ptr=new CStringFeatures<int16_t>(file);
121  }
122  else if (strncmp(type, "WORD", 4)==0)
123  {
124  *f_ptr=new CStringFeatures<uint16_t>(file);
125  }
126  else if (strncmp(type, "ULONG", 5)==0)
127  {
128  *f_ptr=new CStringFeatures<uint64_t>(file);
129  }
130  else
131  {
132  SG_ERROR("Unknown type.\n");
133  return false;
134  }
135  }
136  SG_UNREF(file);
137 
138  return result;
139 }
140 
141 bool CGUIFeatures::save(char* filename, char* type, char* target)
142 {
143  bool result=false;
144 
145  CFeatures** f_ptr=NULL;
146 
147  if (strncmp(target, "TRAIN", 5)==0)
148  {
149  f_ptr=&train_features;
150  }
151  else if (strncmp(target, "TEST", 4)==0)
152  {
153  f_ptr=&test_features;
154  }
155  else
156  SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
157 
158  if (*f_ptr)
159  {
160  try
161  {
162  CAsciiFile* file=new CAsciiFile(filename, 'w');
163  if (strncmp(type, "REAL", 4)==0)
164  {
165  ((CDenseFeatures<float64_t>*) (*f_ptr))->save(file);
166  }
167  else if (strncmp(type, "BYTE", 4)==0)
168  {
169  ((CDenseFeatures<uint8_t>*) (*f_ptr))->save(file);
170  }
171  else if (strncmp(type, "CHAR", 4)==0)
172  {
173  ((CDenseFeatures<char>*) (*f_ptr))->save(file);
174  }
175  else if (strncmp(type, "SHORT", 5)==0)
176  {
177  ((CDenseFeatures<int16_t>*) (*f_ptr))->save(file);
178  }
179  else if (strncmp(type, "WORD", 4)==0)
180  {
181  ((CDenseFeatures<uint16_t>*) (*f_ptr))->save(file);
182  }
183  else
184  {
185  SG_ERROR("Unknown type.\n");
186  return false;
187  }
188  SG_UNREF(file);
189  }
190  catch (...)
191  {
192  SG_ERROR("Writing to file %s failed!\n", filename);
193  }
194 
195  SG_INFO( "Successfully written features into \"%s\" !\n", filename);
196  result=true;
197 
198  } else
199  SG_ERROR("Set features first.\n");
200 
201  return result;
202 }
203 
204 bool CGUIFeatures::clean(char* target)
205 {
206  if (strncmp(target, "TRAIN", 5)==0)
207  set_train_features(NULL);
208  else if (strncmp(target, "TEST", 4)==0)
209  set_test_features(NULL);
210  else
211  SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
212 
213  return true;
214 }
215 
216 bool CGUIFeatures::reshape(char* target, int32_t num_feat, int32_t num_vec)
217 {
218  CFeatures** f_ptr=NULL;
219 
220  if (strncmp(target, "TRAIN", 5)==0)
221  {
222  f_ptr=&train_features;
224  }
225  else if (strncmp(target, "TEST", 4)==0)
226  {
227  f_ptr=&test_features;
228  invalidate_test();
229  }
230  else
231  {
232  SG_ERROR("Invalid target %s\n", target);
233  return false;
234  }
235 
236  bool result=false;
237  if (f_ptr)
238  {
239  SG_INFO( "reshape data to %d x %d\n", num_feat, num_vec);
240  result=(*f_ptr)->reshape(num_feat, num_vec);
241 
242  if (!result)
243  SG_ERROR("Reshaping failed.\n");
244  }
245 
246  return result;
247 }
248 
250 {
251  CFeatures* features;
252 
253  if (strncmp(target, "TEST", 4)==0)
254  features=get_test_features();
255  else if (strncmp(target, "TRAIN", 5)==0)
256  features=get_train_features();
257  else
258  return NULL;
259 
260  if (features->get_feature_class()==C_COMBINED)
261  features=((CCombinedFeatures*) features)->get_last_feature_obj();
262 
263  return features;
264 }
265 
266 bool CGUIFeatures::set_convert_features(CFeatures* features, char* target)
267 {
268  CFeatures* features_prev;
269 
270  if (strncmp(target, "TEST", 4)==0)
271  features_prev=get_test_features();
272  else if (strncmp(target, "TRAIN", 5)==0)
273  features_prev=get_train_features();
274  else
275  return false;
276 
277  // in case of combined features delete current (==last) feature obj
278  // pointer from list (feature object got deleted already above)
279  // and append *f_ptr which holds the newly created feature object
280  if (features_prev->get_feature_class()==C_COMBINED)
281  {
282  CCombinedFeatures* combined=(CCombinedFeatures*) features_prev;
283  combined->delete_feature_obj();
284  combined->append_feature_obj(features);
285  combined->list_feature_objs();
286  }
287  else // set features to new test/train features
288  {
289  if (strncmp(target, "TEST", 4)==0)
290  set_test_features(features);
291  else
292  set_train_features(features);
293  }
294 
295  return true;
296 }
297 
300 {
301  if (src &&
302  src->get_feature_class()==C_DENSE &&
303  src->get_feature_type()==F_DREAL)
304  {
305  //create sparse features with 0 cache
306  SG_INFO("Attempting to convert dense feature matrix to a sparse one.\n");
308  int32_t num_f=0;
309  int32_t num_v=0;
310  float64_t* feats=src->get_feature_matrix(num_f, num_v);
311  if (target->set_full_feature_matrix(SGMatrix<float64_t>(feats, num_f, num_v)))
312  return target;
313 
314  SG_UNREF(target);
315  }
316  else
317  SG_ERROR("No SIMPLE DREAL features available.\n");
318 
319  return NULL;
320 }
321 
324 {
325  if (src && src->get_feature_class()==C_DENSE)
326  {
327  int32_t num_vec=src->get_num_vectors();
328  SGString<char>* strings=SG_MALLOC(SGString<char>, num_vec);
329  int32_t max_len=-1;
330 
331  for (int32_t i=0; i<num_vec; i++)
332  {
333  bool to_free=false;
334  int32_t len=0;
335  char* str=src->get_feature_vector(i, len, to_free);
336  strings[i].slen=len ;
337  for (int32_t j=0; j<len; j++)
338  if (str[j]==0)
339  {
340  strings[i].slen=j ;
341  break ;
342  } ;
343  strings[i].string=SG_MALLOC(char, strings[i].slen);
344 
345  for (int32_t j=0; j<strings[i].slen; j++)
346  strings[i].string[j]=str[j];
347 
348  if (strings[i].slen> max_len)
349  max_len=strings[i].slen;
350 
351  src->free_feature_vector(str, i, to_free);
352  }
353 
355  target->set_features(strings, num_vec, max_len);
356  return target;
357  }
358  else
359  SG_ERROR("No features of class/type SIMPLE/CHAR available.\n");
360 
361  return NULL;
362 }
363 
366 {
367  CPluginEstimate* pie=ui->ui_pluginestimate->get_estimator();
368 
369  if (src &&
370  src->get_feature_type()==F_WORD &&
371  src->get_feature_class()==C_DENSE &&
372  pie)
373  {
375  int32_t num_feat=src->get_num_features();
376  int32_t num_vec=src->get_num_vectors();
377  float64_t* fm=SG_MALLOC(float64_t, num_vec*num_feat);
378 
379  if (fm)
380  {
381  for (int32_t i=0; i<num_vec; i++)
382  {
383  int32_t len=0;
384  bool to_free=false;
385  uint16_t* vec = src->get_feature_vector(i, len, to_free);
386  ASSERT(num_feat==len);
387 
388  for (int32_t j=0; j<num_feat; j++)
389  fm[i*num_feat+j]=
390  pie->get_parameterwise_log_odds(vec[j], j);
391 
392  src->free_feature_vector(vec, i, to_free);
393  }
394  target->set_feature_matrix(SGMatrix<float64_t>(fm, num_feat, num_vec));
395 
396  }
397  return target;
398  }
399  else
400  SG_ERROR("No SIMPLE WORD features or PluginEstimator available.\n");
401 
402  return NULL;
403 }
404 
405 
408 {
409  CTOPFeatures* tf=NULL;
410 
411  if (src &&
412  src->get_feature_class()==C_DENSE &&
413  src->get_feature_type()==F_WORD)
414  {
415  SG_INFO("Converting to TOP features.\n");
416 
417  if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg())
418  {
419  ui->ui_hmm->get_pos()->set_observations(src);
420  ui->ui_hmm->get_neg()->set_observations(src);
421 
422  bool neglinear=false;
423  bool poslinear=false;
424 
425  tf=new CTOPFeatures(
426  0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg(),
427  neglinear, poslinear);
428  ASSERT(tf->set_feature_matrix());
429  }
430  else
431  SG_ERROR("HMMs not correctly assigned!\n");
432  }
433  else
434  SG_ERROR("No SIMPLE WORD features available.\n");
435 
436  return tf;
437 }
438 
441 {
442  CFKFeatures* fkf=NULL;
443 
444  SG_INFO("Converting to FK features.\n");
445 
446  if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg())
447  {
448  CStringFeatures<uint16_t>* old_obs_pos=
449  ui->ui_hmm->get_pos()->get_observations();
450  CStringFeatures<uint16_t>* old_obs_neg=
451  ui->ui_hmm->get_neg()->get_observations();
452 
453  CStringFeatures<uint16_t>* string_feat=src;
454  ui->ui_hmm->get_pos()->set_observations(string_feat);
455  ui->ui_hmm->get_neg()->set_observations(string_feat);
456 
457  fkf=new CFKFeatures(
458  0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg());
459  //, neglinear, poslinear);
460  if (train_features)
461  fkf->set_opt_a(((CFKFeatures*) train_features)->get_weight_a());
462  else
463  SG_ERROR("Need train features to set optimal a.\n");
464 
465  ASSERT(fkf->set_feature_matrix());
466 
467  ui->ui_hmm->get_pos()->set_observations(old_obs_pos);
468  ui->ui_hmm->get_neg()->set_observations(old_obs_neg);
469  }
470  else
471  SG_ERROR("HMMs not correctly assigned!\n");
472 
473  return fkf;
474 }
475 
476 
479 {
480  if (src &&
481  src->get_feature_class()==C_SPARSE &&
482  src->get_feature_type() == F_DREAL)
483  {
484  //create dense features with 0 cache
485  SG_INFO("Attempting to convert sparse feature matrix to a dense one.\n");
487  if (rf)
488  {
490  rf->set_feature_matrix(feats);
491  return rf;
492  }
493  }
494  else
495  SG_ERROR("No SPARSE REAL features available.\n");
496 
497  return NULL;
498 }
499 
501  CStringFeatures<uint16_t>* src, bool use_norm)
502 {
503  return new CExplicitSpecFeatures(src, use_norm);
504 }
505 
507  CDenseFeatures<char>* src, float64_t gap_cost)
508 {
509  if (src &&
510  src->get_feature_class()==C_DENSE &&
511  src->get_feature_type()==F_CHAR)
512  {
513  //create dense features with 0 cache
514  SG_INFO("Converting CHAR features to REAL ones.\n");
515 
517  if (rf)
518  {
519  SG_INFO("Start aligment with gapCost=%1.2f.\n", gap_cost);
520  /*rf->Align_char_features(
521  src, (CDenseFeatures<char>*) ref_features, gap_cost);*/
522  SG_INFO("Conversion was successful.\n");
523  return rf;
524  }
525  }
526  else
527  SG_ERROR("No SIMPLE CHAR features available.\n");
528 
529  SG_ERROR("Conversion failed.\n");
530  return NULL;
531 }
532 
534 {
535  if (strncmp(target, "TRAIN", 5)==0)
536  {
539  train_features=NULL;
541  return true;
542  }
543  else if (strncmp(target, "TEST", 4)==0)
544  {
547  test_features=NULL;
548  invalidate_test();
549  return true;
550  }
551 
552  return false;
553 }
554 
556 {
557  ASSERT(f);
559 
560  if (!train_features)
561  {
564  }
565 
567  {
568  CFeatures* first_elem=train_features;
571  ((CCombinedFeatures*) train_features)->append_feature_obj(first_elem);
572  ((CCombinedFeatures*) train_features)->list_feature_objs();
573  SG_UNREF(first_elem);
574  }
575 
576  bool result=((CCombinedFeatures*) train_features)->append_feature_obj(f);
577  if (result)
578  ((CCombinedFeatures*) train_features)->list_feature_objs();
579  else
580  SG_ERROR("appending feature object failed\n");
581 }
582 
584 {
585  ASSERT(f);
586  SG_PRINT("DOTFVEC %d\n", f->get_num_vectors());
588 
589  if (!train_features)
590  {
593  }
594 
596  {
598  SG_ERROR("Trainfeatures not based on DotFeatures.\n");
599 
603  ((CCombinedDotFeatures*) train_features)->append_feature_obj(first_elem);
604  ((CCombinedDotFeatures*) train_features)->list_feature_objs();
605  SG_UNREF(first_elem);
606  }
607 
608  bool result=((CCombinedDotFeatures*) train_features)->append_feature_obj(f);
609  if (result)
610  ((CCombinedDotFeatures*) train_features)->list_feature_objs();
611  else
612  SG_ERROR("appending dot feature object failed\n");
613 }
614 
616 {
617  ASSERT(f);
618  invalidate_test();
619 
620  if (!test_features)
621  {
624  }
625 
627  {
629  SG_ERROR("Trainfeatures not based on DotFeatures.\n");
630 
631  CDotFeatures* first_elem=(CDotFeatures*) test_features;
634  ((CCombinedDotFeatures*) test_features)->append_feature_obj(first_elem);
635  ((CCombinedDotFeatures*) test_features)->list_feature_objs();
636  SG_UNREF(first_elem);
637  }
638 
639  bool result=((CCombinedDotFeatures*) test_features)->append_feature_obj(f);
640  if (result)
641  ((CCombinedDotFeatures*) test_features)->list_feature_objs();
642  else
643  SG_ERROR("Appending feature object failed.\n");
644 }
645 
647 {
648  ASSERT(f);
649  invalidate_test();
650 
651  if (!test_features)
652  {
655  }
656 
658  {
659  CFeatures* first_elem=test_features;
662  ((CCombinedFeatures*) test_features)->append_feature_obj(first_elem);
663  ((CCombinedFeatures*) test_features)->list_feature_objs();
664  SG_UNREF(first_elem);
665  }
666 
667  bool result=((CCombinedFeatures*) test_features)->append_feature_obj(f);
668  if (result)
669  ((CCombinedFeatures*) test_features)->list_feature_objs();
670  else
671  SG_ERROR("Appending feature object failed.\n");
672 }
673 
675 {
676  CCombinedFeatures* cf=NULL;
677  if (strncmp(target, "TRAIN", 5)==0)
678  {
679  if (!train_features)
680  SG_ERROR("No train features available.\n");
682  SG_ERROR("Train features are not combined features.\n");
683 
685  }
686  else if (strncmp(target, "TEST", 4)==0)
687  {
688  if (!test_features)
689  SG_ERROR("No test features available.\n");
691  SG_ERROR("Test features are not combined features.\n");
692 
694  }
695  else
696  SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
697 
698  if (!cf->delete_feature_obj())
699  SG_ERROR("No features available to delete.\n");
700 
701  return false;
702 }

SHOGUN Machine Learning Toolbox - Documentation