00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/ui/GUIFeatures.h>
00013 #include <shogun/ui/SGInterface.h>
00014
00015 #include <shogun/lib/config.h>
00016 #include <shogun/io/SGIO.h>
00017 #include <shogun/io/AsciiFile.h>
00018
00019 using namespace shogun;
00020
00021 CGUIFeatures::CGUIFeatures(CSGInterface* ui_)
00022 : CSGObject(), ui(ui_), train_features(NULL), test_features(NULL),
00023 ref_features(NULL)
00024 {
00025 }
00026
00027 CGUIFeatures::~CGUIFeatures()
00028 {
00029 SG_UNREF(train_features);
00030 SG_UNREF(test_features);
00031 SG_UNREF(ref_features);
00032 }
00033
00034 void CGUIFeatures::invalidate_train()
00035 {
00036 CKernel *k = ui->ui_kernel->get_kernel();
00037 if (k)
00038 k->remove_lhs();
00039 }
00040
00041 void CGUIFeatures::invalidate_test()
00042 {
00043 CKernel *k = ui->ui_kernel->get_kernel();
00044 if (k)
00045 k->remove_rhs();
00046 }
00047
00048 bool CGUIFeatures::load(
00049 char* filename, char* fclass, char* type, char* target, int32_t size,
00050 int32_t comp_features)
00051 {
00052 bool result=false;
00053 CFeatures** f_ptr=NULL;
00054
00055 if (strncmp(target, "TRAIN", 5)==0)
00056 {
00057 f_ptr=&train_features;
00058 invalidate_train();
00059 }
00060 else if (strncmp(target, "TEST", 4)==0)
00061 {
00062 f_ptr=&test_features;
00063 invalidate_test();
00064 }
00065 else
00066 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
00067
00068 SG_UNREF(*f_ptr);
00069 *f_ptr=NULL;
00070
00071 CAsciiFile* file=new CAsciiFile(filename);
00072 if (strncmp(fclass, "SIMPLE", 6)==0)
00073 {
00074 if (strncmp(type, "REAL", 4)==0)
00075 {
00076 *f_ptr=new CDenseFeatures<float64_t>(file);
00077 }
00078 else if (strncmp(type, "BYTE", 4)==0)
00079 {
00081 *f_ptr=new CDenseFeatures<uint8_t>(file);
00082 }
00083 else if (strncmp(type, "CHAR", 4)==0)
00084 {
00086 *f_ptr=new CDenseFeatures<char>(file);
00087 }
00088 else if (strncmp(type, "SHORT", 5)==0)
00089 {
00090 *f_ptr=new CDenseFeatures<int16_t>(file);
00091 }
00092 else
00093 {
00094 SG_ERROR("Unknown type.\n");
00095 return false;
00096 }
00097 }
00098 else if (strncmp(fclass, "SPARSE", 6)==0)
00099 {
00100 SG_NOTIMPLEMENTED;
00101 }
00102 else if (strncmp(fclass, "STRING", 6)==0)
00103 {
00104 if (strncmp(type, "REAL", 4)==0)
00105 {
00106 *f_ptr=new CStringFeatures<float64_t>(file);
00107 }
00108 else if (strncmp(type, "BYTE", 4)==0)
00109 {
00111 *f_ptr=new CStringFeatures<uint8_t>(file, DNA);
00112 }
00113 else if (strncmp(type, "CHAR", 4)==0)
00114 {
00116 *f_ptr=new CStringFeatures<char>(file, DNA);
00117 }
00118 else if (strncmp(type, "SHORT", 5)==0)
00119 {
00120 *f_ptr=new CStringFeatures<int16_t>(file);
00121 }
00122 else if (strncmp(type, "WORD", 4)==0)
00123 {
00124 *f_ptr=new CStringFeatures<uint16_t>(file);
00125 }
00126 else if (strncmp(type, "ULONG", 5)==0)
00127 {
00128 *f_ptr=new CStringFeatures<uint64_t>(file);
00129 }
00130 else
00131 {
00132 SG_ERROR("Unknown type.\n");
00133 return false;
00134 }
00135 }
00136 SG_UNREF(file);
00137
00138 return result;
00139 }
00140
00141 bool CGUIFeatures::save(char* filename, char* type, char* target)
00142 {
00143 bool result=false;
00144
00145 CFeatures** f_ptr=NULL;
00146
00147 if (strncmp(target, "TRAIN", 5)==0)
00148 {
00149 f_ptr=&train_features;
00150 }
00151 else if (strncmp(target, "TEST", 4)==0)
00152 {
00153 f_ptr=&test_features;
00154 }
00155 else
00156 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
00157
00158 if (*f_ptr)
00159 {
00160 try
00161 {
00162 CAsciiFile* file=new CAsciiFile(filename, 'w');
00163 if (strncmp(type, "REAL", 4)==0)
00164 {
00165 ((CDenseFeatures<float64_t>*) (*f_ptr))->save(file);
00166 }
00167 else if (strncmp(type, "BYTE", 4)==0)
00168 {
00169 ((CDenseFeatures<uint8_t>*) (*f_ptr))->save(file);
00170 }
00171 else if (strncmp(type, "CHAR", 4)==0)
00172 {
00173 ((CDenseFeatures<char>*) (*f_ptr))->save(file);
00174 }
00175 else if (strncmp(type, "SHORT", 5)==0)
00176 {
00177 ((CDenseFeatures<int16_t>*) (*f_ptr))->save(file);
00178 }
00179 else if (strncmp(type, "WORD", 4)==0)
00180 {
00181 ((CDenseFeatures<uint16_t>*) (*f_ptr))->save(file);
00182 }
00183 else
00184 {
00185 SG_ERROR("Unknown type.\n");
00186 return false;
00187 }
00188 SG_UNREF(file);
00189 }
00190 catch (...)
00191 {
00192 SG_ERROR("Writing to file %s failed!\n", filename);
00193 }
00194
00195 SG_INFO( "Successfully written features into \"%s\" !\n", filename);
00196 result=true;
00197
00198 } else
00199 SG_ERROR("Set features first.\n");
00200
00201 return result;
00202 }
00203
00204 bool CGUIFeatures::clean(char* target)
00205 {
00206 if (strncmp(target, "TRAIN", 5)==0)
00207 set_train_features(NULL);
00208 else if (strncmp(target, "TEST", 4)==0)
00209 set_test_features(NULL);
00210 else
00211 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
00212
00213 return true;
00214 }
00215
00216 bool CGUIFeatures::reshape(char* target, int32_t num_feat, int32_t num_vec)
00217 {
00218 CFeatures** f_ptr=NULL;
00219
00220 if (strncmp(target, "TRAIN", 5)==0)
00221 {
00222 f_ptr=&train_features;
00223 invalidate_train();
00224 }
00225 else if (strncmp(target, "TEST", 4)==0)
00226 {
00227 f_ptr=&test_features;
00228 invalidate_test();
00229 }
00230 else
00231 {
00232 SG_ERROR("Invalid target %s\n", target);
00233 return false;
00234 }
00235
00236 bool result=false;
00237 if (f_ptr)
00238 {
00239 SG_INFO( "reshape data to %d x %d\n", num_feat, num_vec);
00240 result=(*f_ptr)->reshape(num_feat, num_vec);
00241
00242 if (!result)
00243 SG_ERROR("Reshaping failed.\n");
00244 }
00245
00246 return result;
00247 }
00248
00249 CFeatures* CGUIFeatures::get_convert_features(char* target)
00250 {
00251 CFeatures* features;
00252
00253 if (strncmp(target, "TEST", 4)==0)
00254 features=get_test_features();
00255 else if (strncmp(target, "TRAIN", 5)==0)
00256 features=get_train_features();
00257 else
00258 return NULL;
00259
00260 if (features->get_feature_class()==C_COMBINED)
00261 features=((CCombinedFeatures*) features)->get_last_feature_obj();
00262
00263 return features;
00264 }
00265
00266 bool CGUIFeatures::set_convert_features(CFeatures* features, char* target)
00267 {
00268 CFeatures* features_prev;
00269
00270 if (strncmp(target, "TEST", 4)==0)
00271 features_prev=get_test_features();
00272 else if (strncmp(target, "TRAIN", 5)==0)
00273 features_prev=get_train_features();
00274 else
00275 return false;
00276
00277
00278
00279
00280 if (features_prev->get_feature_class()==C_COMBINED)
00281 {
00282 CCombinedFeatures* combined=(CCombinedFeatures*) features_prev;
00283 combined->delete_feature_obj();
00284 combined->append_feature_obj(features);
00285 combined->list_feature_objs();
00286 }
00287 else
00288 {
00289 if (strncmp(target, "TEST", 4)==0)
00290 set_test_features(features);
00291 else
00292 set_train_features(features);
00293 }
00294
00295 return true;
00296 }
00297
00298 CSparseFeatures<float64_t>* CGUIFeatures::convert_simple_real_to_sparse_real(
00299 CDenseFeatures<float64_t>* src)
00300 {
00301 if (src &&
00302 src->get_feature_class()==C_DENSE &&
00303 src->get_feature_type()==F_DREAL)
00304 {
00305
00306 SG_INFO("Attempting to convert dense feature matrix to a sparse one.\n");
00307 CSparseFeatures<float64_t>* target=new CSparseFeatures<float64_t>(0);
00308 int32_t num_f=0;
00309 int32_t num_v=0;
00310 float64_t* feats=src->get_feature_matrix(num_f, num_v);
00311 if (target->set_full_feature_matrix(SGMatrix<float64_t>(feats, num_f, num_v)))
00312 return target;
00313
00314 SG_UNREF(target);
00315 }
00316 else
00317 SG_ERROR("No SIMPLE DREAL features available.\n");
00318
00319 return NULL;
00320 }
00321
00322 CStringFeatures<char>* CGUIFeatures::convert_simple_char_to_string_char(
00323 CDenseFeatures<char>* src)
00324 {
00325 if (src && src->get_feature_class()==C_DENSE)
00326 {
00327 int32_t num_vec=src->get_num_vectors();
00328 SGString<char>* strings=SG_MALLOC(SGString<char>, num_vec);
00329 int32_t max_len=-1;
00330
00331 for (int32_t i=0; i<num_vec; i++)
00332 {
00333 bool to_free=false;
00334 int32_t len=0;
00335 char* str=src->get_feature_vector(i, len, to_free);
00336 strings[i].slen=len ;
00337 for (int32_t j=0; j<len; j++)
00338 if (str[j]==0)
00339 {
00340 strings[i].slen=j ;
00341 break ;
00342 } ;
00343 strings[i].string=SG_MALLOC(char, strings[i].slen);
00344
00345 for (int32_t j=0; j<strings[i].slen; j++)
00346 strings[i].string[j]=str[j];
00347
00348 if (strings[i].slen> max_len)
00349 max_len=strings[i].slen;
00350
00351 src->free_feature_vector(str, i, to_free);
00352 }
00353
00354 CStringFeatures<char>* target=new CStringFeatures<char>(new CAlphabet(DNA));
00355 target->set_features(strings, num_vec, max_len);
00356 return target;
00357 }
00358 else
00359 SG_ERROR("No features of class/type SIMPLE/CHAR available.\n");
00360
00361 return NULL;
00362 }
00363
00364 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_word_to_simple_salzberg(
00365 CDenseFeatures<uint16_t>* src)
00366 {
00367 CPluginEstimate* pie=ui->ui_pluginestimate->get_estimator();
00368
00369 if (src &&
00370 src->get_feature_type()==F_WORD &&
00371 src->get_feature_class()==C_DENSE &&
00372 pie)
00373 {
00374 CDenseFeatures<float64_t>* target=new CDenseFeatures<float64_t>(0);
00375 int32_t num_feat=src->get_num_features();
00376 int32_t num_vec=src->get_num_vectors();
00377 float64_t* fm=SG_MALLOC(float64_t, num_vec*num_feat);
00378
00379 if (fm)
00380 {
00381 for (int32_t i=0; i<num_vec; i++)
00382 {
00383 int32_t len=0;
00384 bool to_free=false;
00385 uint16_t* vec = src->get_feature_vector(i, len, to_free);
00386 ASSERT(num_feat==len);
00387
00388 for (int32_t j=0; j<num_feat; j++)
00389 fm[i*num_feat+j]=
00390 pie->get_parameterwise_log_odds(vec[j], j);
00391
00392 src->free_feature_vector(vec, i, to_free);
00393 }
00394 target->set_feature_matrix(SGMatrix<float64_t>(fm, num_feat, num_vec));
00395
00396 }
00397 return target;
00398 }
00399 else
00400 SG_ERROR("No SIMPLE WORD features or PluginEstimator available.\n");
00401
00402 return NULL;
00403 }
00404
00405
00406 CTOPFeatures* CGUIFeatures::convert_string_word_to_simple_top(
00407 CStringFeatures<uint16_t>* src)
00408 {
00409 CTOPFeatures* tf=NULL;
00410
00411 if (src &&
00412 src->get_feature_class()==C_DENSE &&
00413 src->get_feature_type()==F_WORD)
00414 {
00415 SG_INFO("Converting to TOP features.\n");
00416
00417 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg())
00418 {
00419 ui->ui_hmm->get_pos()->set_observations(src);
00420 ui->ui_hmm->get_neg()->set_observations(src);
00421
00422 bool neglinear=false;
00423 bool poslinear=false;
00424
00425 tf=new CTOPFeatures(
00426 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg(),
00427 neglinear, poslinear);
00428 ASSERT(tf->set_feature_matrix());
00429 }
00430 else
00431 SG_ERROR("HMMs not correctly assigned!\n");
00432 }
00433 else
00434 SG_ERROR("No SIMPLE WORD features available.\n");
00435
00436 return tf;
00437 }
00438
00439 CFKFeatures* CGUIFeatures::convert_string_word_to_simple_fk(
00440 CStringFeatures<uint16_t>* src)
00441 {
00442 CFKFeatures* fkf=NULL;
00443
00444 SG_INFO("Converting to FK features.\n");
00445
00446 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg())
00447 {
00448 CStringFeatures<uint16_t>* old_obs_pos=
00449 ui->ui_hmm->get_pos()->get_observations();
00450 CStringFeatures<uint16_t>* old_obs_neg=
00451 ui->ui_hmm->get_neg()->get_observations();
00452
00453 CStringFeatures<uint16_t>* string_feat=src;
00454 ui->ui_hmm->get_pos()->set_observations(string_feat);
00455 ui->ui_hmm->get_neg()->set_observations(string_feat);
00456
00457 fkf=new CFKFeatures(
00458 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg());
00459
00460 if (train_features)
00461 fkf->set_opt_a(((CFKFeatures*) train_features)->get_weight_a());
00462 else
00463 SG_ERROR("Need train features to set optimal a.\n");
00464
00465 ASSERT(fkf->set_feature_matrix());
00466
00467 ui->ui_hmm->get_pos()->set_observations(old_obs_pos);
00468 ui->ui_hmm->get_neg()->set_observations(old_obs_neg);
00469 }
00470 else
00471 SG_ERROR("HMMs not correctly assigned!\n");
00472
00473 return fkf;
00474 }
00475
00476
00477 CDenseFeatures<float64_t>* CGUIFeatures::convert_sparse_real_to_simple_real(
00478 CSparseFeatures<float64_t>* src)
00479 {
00480 if (src &&
00481 src->get_feature_class()==C_SPARSE &&
00482 src->get_feature_type() == F_DREAL)
00483 {
00484
00485 SG_INFO("Attempting to convert sparse feature matrix to a dense one.\n");
00486 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0);
00487 if (rf)
00488 {
00489 SGMatrix<float64_t> feats=src->get_full_feature_matrix();
00490 rf->set_feature_matrix(feats);
00491 return rf;
00492 }
00493 }
00494 else
00495 SG_ERROR("No SPARSE REAL features available.\n");
00496
00497 return NULL;
00498 }
00499
00500 CExplicitSpecFeatures* CGUIFeatures::convert_string_byte_to_spec_word(
00501 CStringFeatures<uint16_t>* src, bool use_norm)
00502 {
00503 return new CExplicitSpecFeatures(src, use_norm);
00504 }
00505
00506 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_char_to_simple_align(
00507 CDenseFeatures<char>* src, float64_t gap_cost)
00508 {
00509 if (src &&
00510 src->get_feature_class()==C_DENSE &&
00511 src->get_feature_type()==F_CHAR)
00512 {
00513
00514 SG_INFO("Converting CHAR features to REAL ones.\n");
00515
00516 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0);
00517 if (rf)
00518 {
00519 SG_INFO("Start aligment with gapCost=%1.2f.\n", gap_cost);
00520
00521
00522 SG_INFO("Conversion was successful.\n");
00523 return rf;
00524 }
00525 }
00526 else
00527 SG_ERROR("No SIMPLE CHAR features available.\n");
00528
00529 SG_ERROR("Conversion failed.\n");
00530 return NULL;
00531 }
00532
00533 bool CGUIFeatures::set_reference_features(char* target)
00534 {
00535 if (strncmp(target, "TRAIN", 5)==0)
00536 {
00537 SG_UNREF(ref_features);
00538 ref_features=train_features;
00539 train_features=NULL;
00540 invalidate_train();
00541 return true;
00542 }
00543 else if (strncmp(target, "TEST", 4)==0)
00544 {
00545 SG_UNREF(ref_features);
00546 ref_features=test_features;
00547 test_features=NULL;
00548 invalidate_test();
00549 return true;
00550 }
00551
00552 return false;
00553 }
00554
00555 void CGUIFeatures::add_train_features(CFeatures* f)
00556 {
00557 ASSERT(f);
00558 invalidate_train();
00559
00560 if (!train_features)
00561 {
00562 train_features=new CCombinedFeatures();
00563 SG_REF(train_features);
00564 }
00565
00566 if (train_features->get_feature_class()!=C_COMBINED)
00567 {
00568 CFeatures* first_elem=train_features;
00569 train_features=new CCombinedFeatures();
00570 SG_REF(train_features);
00571 ((CCombinedFeatures*) train_features)->append_feature_obj(first_elem);
00572 ((CCombinedFeatures*) train_features)->list_feature_objs();
00573 SG_UNREF(first_elem);
00574 }
00575
00576 bool result=((CCombinedFeatures*) train_features)->append_feature_obj(f);
00577 if (result)
00578 ((CCombinedFeatures*) train_features)->list_feature_objs();
00579 else
00580 SG_ERROR("appending feature object failed\n");
00581 }
00582
00583 void CGUIFeatures::add_train_dotfeatures(CDotFeatures* f)
00584 {
00585 ASSERT(f);
00586 SG_PRINT("DOTFVEC %d\n", f->get_num_vectors());
00587 invalidate_train();
00588
00589 if (!train_features)
00590 {
00591 train_features=new CCombinedDotFeatures();
00592 SG_REF(train_features);
00593 }
00594
00595 if (train_features->get_feature_class()!=C_COMBINED_DOT)
00596 {
00597 if (!train_features->has_property(FP_DOT))
00598 SG_ERROR("Trainfeatures not based on DotFeatures.\n");
00599
00600 CDotFeatures* first_elem=(CDotFeatures*) train_features;
00601 train_features=new CCombinedDotFeatures();
00602 SG_REF(train_features);
00603 ((CCombinedDotFeatures*) train_features)->append_feature_obj(first_elem);
00604 ((CCombinedDotFeatures*) train_features)->list_feature_objs();
00605 SG_UNREF(first_elem);
00606 }
00607
00608 bool result=((CCombinedDotFeatures*) train_features)->append_feature_obj(f);
00609 if (result)
00610 ((CCombinedDotFeatures*) train_features)->list_feature_objs();
00611 else
00612 SG_ERROR("appending dot feature object failed\n");
00613 }
00614
00615 void CGUIFeatures::add_test_dotfeatures(CDotFeatures* f)
00616 {
00617 ASSERT(f);
00618 invalidate_test();
00619
00620 if (!test_features)
00621 {
00622 test_features=new CCombinedDotFeatures();
00623 SG_REF(test_features);
00624 }
00625
00626 if (test_features->get_feature_class()!=C_COMBINED_DOT)
00627 {
00628 if (!test_features->has_property(FP_DOT))
00629 SG_ERROR("Trainfeatures not based on DotFeatures.\n");
00630
00631 CDotFeatures* first_elem=(CDotFeatures*) test_features;
00632 test_features=new CCombinedDotFeatures();
00633 SG_REF(test_features);
00634 ((CCombinedDotFeatures*) test_features)->append_feature_obj(first_elem);
00635 ((CCombinedDotFeatures*) test_features)->list_feature_objs();
00636 SG_UNREF(first_elem);
00637 }
00638
00639 bool result=((CCombinedDotFeatures*) test_features)->append_feature_obj(f);
00640 if (result)
00641 ((CCombinedDotFeatures*) test_features)->list_feature_objs();
00642 else
00643 SG_ERROR("Appending feature object failed.\n");
00644 }
00645
00646 void CGUIFeatures::add_test_features(CFeatures* f)
00647 {
00648 ASSERT(f);
00649 invalidate_test();
00650
00651 if (!test_features)
00652 {
00653 test_features=new CCombinedFeatures();
00654 SG_REF(test_features);
00655 }
00656
00657 if (test_features->get_feature_class()!=C_COMBINED)
00658 {
00659 CFeatures* first_elem=test_features;
00660 test_features=new CCombinedFeatures();
00661 SG_REF(test_features);
00662 ((CCombinedFeatures*) test_features)->append_feature_obj(first_elem);
00663 ((CCombinedFeatures*) test_features)->list_feature_objs();
00664 SG_UNREF(first_elem);
00665 }
00666
00667 bool result=((CCombinedFeatures*) test_features)->append_feature_obj(f);
00668 if (result)
00669 ((CCombinedFeatures*) test_features)->list_feature_objs();
00670 else
00671 SG_ERROR("Appending feature object failed.\n");
00672 }
00673
00674 bool CGUIFeatures::del_last_feature_obj(char* target)
00675 {
00676 CCombinedFeatures* cf=NULL;
00677 if (strncmp(target, "TRAIN", 5)==0)
00678 {
00679 if (!train_features)
00680 SG_ERROR("No train features available.\n");
00681 if (train_features->get_feature_class()!=C_COMBINED)
00682 SG_ERROR("Train features are not combined features.\n");
00683
00684 cf=(CCombinedFeatures*) train_features;
00685 }
00686 else if (strncmp(target, "TEST", 4)==0)
00687 {
00688 if (!test_features)
00689 SG_ERROR("No test features available.\n");
00690 if (test_features->get_feature_class()!=C_COMBINED)
00691 SG_ERROR("Test features are not combined features.\n");
00692
00693 cf=(CCombinedFeatures*) test_features;
00694 }
00695 else
00696 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target);
00697
00698 if (!cf->delete_feature_obj())
00699 SG_ERROR("No features available to delete.\n");
00700
00701 return false;
00702 }