Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CSTRINGFILEFEATURES__H__
00012 #define _CSTRINGFILEFEATURES__H__
00013
00014 #include <shogun/features/StringFeatures.h>
00015 #include <shogun/features/Alphabet.h>
00016 #include <shogun/io/MemoryMappedFile.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/io/SGIO.h>
00019
00020 namespace shogun
00021 {
00022 class CAlphabet;
00023 template <class T> class CMemoryMappedFile;
00024
00034 template <class ST> class CStringFileFeatures : public CStringFeatures<ST>
00035 {
00036 public:
00037
00041 CStringFileFeatures() : CStringFeatures<ST>(), file(NULL)
00042 {
00043 }
00044
00050 CStringFileFeatures(const char* fname, EAlphabet alpha)
00051 : CStringFeatures<ST>(alpha)
00052 {
00053 file = new CMemoryMappedFile<ST>(fname);
00054 fetch_meta_info_from_file();
00055 }
00056
00060 virtual ~CStringFileFeatures()
00061 {
00062 SG_UNREF(file);
00063 CStringFileFeatures<ST>::cleanup();
00064 }
00065
00066 protected:
00081 ST* get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
00082 {
00083 ST* s = file->get_map();
00084 for (uint64_t i=offs; i<file_length; i++)
00085 {
00086 ST c=s[i];
00087
00088 if (c == '\n')
00089 {
00090 ST* line=&s[offs];
00091 len=i-offs;
00092 offs=i+1;
00093 line_nr++;
00094 return line;
00095 }
00096 else
00097 {
00098 if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c))
00099 {
00100 CStringFileFeatures<ST>::cleanup();
00101 CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr);
00102 }
00103 }
00104 }
00105
00106 len=0;
00107 offs=file_length;
00108 return NULL;
00109 }
00110
00112 virtual void cleanup()
00113 {
00114 CStringFeatures<ST>::num_vectors=0;
00115 SG_FREE(CStringFeatures<ST>::features);
00116 SG_FREE(CStringFeatures<ST>::symbol_mask_table);
00117 CStringFeatures<ST>::features=NULL;
00118 CStringFeatures<ST>::symbol_mask_table=NULL;
00119
00120
00121
00122
00123
00124 CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
00125 SG_UNREF(CStringFeatures<ST>::alphabet);
00126 CStringFeatures<ST>::alphabet=alpha;
00127 SG_REF(CStringFeatures<ST>::alphabet);
00128 }
00129
00131 virtual void cleanup_feature_vector(int32_t num)
00132 {
00133 CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not"
00134 "supported by StringFileFeatures\n");
00135 }
00136
00141 void fetch_meta_info_from_file(int32_t granularity=1048576)
00142 {
00143 CStringFileFeatures<ST>::cleanup();
00144 uint64_t file_size=file->get_size();
00145 ASSERT(granularity>=1);
00146 ASSERT(CStringFeatures<ST>::alphabet);
00147
00148 uint64_t buffer_size=granularity;
00149 CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);
00150
00151 uint64_t offs=0;
00152 uint64_t len=0;
00153 CStringFeatures<ST>::max_string_length=0;
00154 CStringFeatures<ST>::num_vectors=0;
00155
00156 while (true)
00157 {
00158 ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);
00159
00160 if (line)
00161 {
00162 if (CStringFeatures<ST>::num_vectors>buffer_size)
00163 {
00164 CMath::resize(CStringFeatures<ST>::features, buffer_size, buffer_size+granularity);
00165 buffer_size+=granularity;
00166 }
00167
00168 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
00169 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
00170 CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
00171 }
00172 else
00173 break;
00174 }
00175
00176 CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors);
00177 CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length);
00178 CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram());
00179 CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram());
00180
00181 if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
00182 CStringFileFeatures<ST>::cleanup();
00183
00184 CMath::resize(CStringFeatures<ST>::features, buffer_size, CStringFeatures<ST>::num_vectors);
00185 }
00186
00187
00188 protected:
00190 CMemoryMappedFile<ST>* file;
00191 };
00192 }
00193 #endif // _CSTRINGFILEFEATURES__H__