#include <shogun/features/StringFileFeatures.h>
00002
00003 namespace shogun
00004 {
00005
00006 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures() : CStringFeatures<ST>(), file(NULL)
00007 {
00008 }
00009
00010 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha)
00011 : CStringFeatures<ST>(alpha)
00012 {
00013 file = new CMemoryMappedFile<ST>(fname);
00014 fetch_meta_info_from_file();
00015 }
00016
00017 template <class ST> CStringFileFeatures<ST>::~CStringFileFeatures()
00018 {
00019 SG_UNREF(file);
00020 CStringFileFeatures<ST>::cleanup();
00021 }
00022
00023 template <class ST> ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
00024 {
00025 ST* s = file->get_map();
00026 for (uint64_t i=offs; i<file_length; i++)
00027 {
00028 ST c=s[i];
00029
00030 if (c == '\n')
00031 {
00032 ST* line=&s[offs];
00033 len=i-offs;
00034 offs=i+1;
00035 line_nr++;
00036 return line;
00037 }
00038 else
00039 {
00040 if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c))
00041 {
00042 CStringFileFeatures<ST>::cleanup();
00043 CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr);
00044 }
00045 }
00046 }
00047
00048 len=0;
00049 offs=file_length;
00050 return NULL;
00051 }
00052
00053 template <class ST> void CStringFileFeatures<ST>::cleanup()
00054 {
00055 CStringFeatures<ST>::num_vectors=0;
00056 SG_FREE(CStringFeatures<ST>::features);
00057 SG_FREE(CStringFeatures<ST>::symbol_mask_table);
00058 CStringFeatures<ST>::features=NULL;
00059 CStringFeatures<ST>::symbol_mask_table=NULL;
00060
00061
00062
00063
00064
00065 CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
00066 SG_UNREF(CStringFeatures<ST>::alphabet);
00067 CStringFeatures<ST>::alphabet=alpha;
00068 SG_REF(CStringFeatures<ST>::alphabet);
00069 }
00070
00071 template <class ST> void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num)
00072 {
00073 CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not"
00074 "supported by StringFileFeatures\n");
00075 }
00076
00077 template <class ST> void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity)
00078 {
00079 CStringFileFeatures<ST>::cleanup();
00080 uint64_t file_size=file->get_size();
00081 ASSERT(granularity>=1);
00082 ASSERT(CStringFeatures<ST>::alphabet);
00083
00084 int64_t buffer_size=granularity;
00085 CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);
00086
00087 uint64_t offs=0;
00088 uint64_t len=0;
00089 CStringFeatures<ST>::max_string_length=0;
00090 CStringFeatures<ST>::num_vectors=0;
00091
00092 while (true)
00093 {
00094 ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);
00095
00096 if (line)
00097 {
00098 if (CStringFeatures<ST>::num_vectors > buffer_size)
00099 {
00100 CStringFeatures<ST>::features = SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size+granularity);
00101 buffer_size+=granularity;
00102 }
00103
00104 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
00105 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
00106 CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
00107 }
00108 else
00109 break;
00110 }
00111
00112 CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors);
00113 CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length);
00114 CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram());
00115 CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram());
00116
00117 if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
00118 CStringFileFeatures<ST>::cleanup();
00119
00120 CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, CStringFeatures<ST>::num_vectors);
00121 }
00122
00123 template class CStringFileFeatures<bool>;
00124 template class CStringFileFeatures<char>;
00125 template class CStringFileFeatures<int8_t>;
00126 template class CStringFileFeatures<uint8_t>;
00127 template class CStringFileFeatures<int16_t>;
00128 template class CStringFileFeatures<uint16_t>;
00129 template class CStringFileFeatures<int32_t>;
00130 template class CStringFileFeatures<uint32_t>;
00131 template class CStringFileFeatures<int64_t>;
00132 template class CStringFileFeatures<uint64_t>;
00133 template class CStringFileFeatures<float32_t>;
00134 template class CStringFileFeatures<float64_t>;
00135 template class CStringFileFeatures<floatmax_t>;
00136 }