StringFileFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Berlin Institute of Technology
00009  */
00010 
00011 #ifndef _CSTRINGFILEFEATURES__H__
00012 #define _CSTRINGFILEFEATURES__H__
00013 
00014 #include <shogun/features/StringFeatures.h>
00015 #include <shogun/features/Alphabet.h>
00016 #include <shogun/io/MemoryMappedFile.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/io/SGIO.h>
00019 
00020 namespace shogun
00021 {
00022 class CAlphabet;
00023 template <class T> class CMemoryMappedFile;
00024 
00034 template <class ST> class CStringFileFeatures : public CStringFeatures<ST>
00035 {
00036     public:
00037 
00041     CStringFileFeatures() : CStringFeatures<ST>(), file(NULL)
00042     {
00043     }
00044 
00050     CStringFileFeatures(const char* fname, EAlphabet alpha)
00051     : CStringFeatures<ST>(alpha)
00052     {
00053         file = new CMemoryMappedFile<ST>(fname);
00054         fetch_meta_info_from_file();
00055     }
00056 
00060     virtual ~CStringFileFeatures()
00061     {
00062         SG_UNREF(file);
00063         CStringFileFeatures<ST>::cleanup();
00064     }
00065 
00066     protected:
00081     ST* get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
00082     {
00083         ST* s = file->get_map();
00084         for (uint64_t i=offs; i<file_length; i++)
00085         {
00086             ST c=s[i];
00087 
00088             if (c == '\n')
00089             {
00090                 ST* line=&s[offs];
00091                 len=i-offs;
00092                 offs=i+1;
00093                 line_nr++;
00094                 return line;
00095             }
00096             else
00097             {
00098                 if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c))
00099                 {
00100                     CStringFileFeatures<ST>::cleanup();
00101                     CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr);
00102                 }
00103             }
00104         }
00105 
00106         len=0;
00107         offs=file_length;
00108         return NULL;
00109     }
00110 
00112     virtual void cleanup()
00113     {
00114         CStringFeatures<ST>::num_vectors=0;
00115         SG_FREE(CStringFeatures<ST>::features);
00116         SG_FREE(CStringFeatures<ST>::symbol_mask_table);
00117         CStringFeatures<ST>::features=NULL;
00118         CStringFeatures<ST>::symbol_mask_table=NULL;
00119 
00120         /* start with a fresh alphabet, but instead of emptying the histogram
00121          * create a new object (to leave the alphabet object alone if it is used
00122          * by others)
00123          */
00124         CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
00125         SG_UNREF(CStringFeatures<ST>::alphabet);
00126         CStringFeatures<ST>::alphabet=alpha;
00127         SG_REF(CStringFeatures<ST>::alphabet);
00128     }
00129 
00131     virtual void cleanup_feature_vector(int32_t num)
00132     {
00133         CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not"
00134                 "supported by StringFileFeatures\n");
00135     }
00136 
00141     void fetch_meta_info_from_file(int32_t granularity=1048576)
00142     {
00143         CStringFileFeatures<ST>::cleanup();
00144         uint64_t file_size=file->get_size();
00145         ASSERT(granularity>=1);
00146         ASSERT(CStringFeatures<ST>::alphabet);
00147 
00148         uint64_t buffer_size=granularity;
00149         CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);
00150 
00151         uint64_t offs=0;
00152         uint64_t len=0;
00153         CStringFeatures<ST>::max_string_length=0;
00154         CStringFeatures<ST>::num_vectors=0;
00155 
00156         while (true)
00157         {
00158             ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);
00159 
00160             if (line)
00161             {
00162                 if (CStringFeatures<ST>::num_vectors>buffer_size)
00163                 {
00164                     CMath::resize(CStringFeatures<ST>::features, buffer_size, buffer_size+granularity);
00165                     buffer_size+=granularity;
00166                 }
00167 
00168                 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
00169                 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
00170                 CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
00171             }
00172             else
00173                 break;
00174         }
00175 
00176         CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors);
00177         CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length);
00178         CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram());
00179         CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram());
00180 
00181         if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
00182             CStringFileFeatures<ST>::cleanup();
00183 
00184         CMath::resize(CStringFeatures<ST>::features, buffer_size, CStringFeatures<ST>::num_vectors);
00185     }
00186 
00187 
00188     protected:
00190     CMemoryMappedFile<ST>* file;
00191 };
00192 }
00193 #endif // _CSTRINGFILEFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation