StringFileFeatures.cpp

Go to the documentation of this file.
00001 #include <shogun/features/StringFileFeatures.h>
00002 
00003 namespace shogun
00004 {
00005 
00006 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures() : CStringFeatures<ST>(), file(NULL)
00007 {
00008 }
00009 
00010 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha)
00011 : CStringFeatures<ST>(alpha)
00012 {
00013     file = new CMemoryMappedFile<ST>(fname);
00014     fetch_meta_info_from_file();
00015 }
00016 
00017 template <class ST> CStringFileFeatures<ST>::~CStringFileFeatures()
00018 {
00019     SG_UNREF(file);
00020     CStringFileFeatures<ST>::cleanup();
00021 }
00022 
00023 template <class ST> ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
00024 {
00025     ST* s = file->get_map();
00026     for (uint64_t i=offs; i<file_length; i++)
00027     {
00028         ST c=s[i];
00029 
00030         if (c == '\n')
00031         {
00032             ST* line=&s[offs];
00033             len=i-offs;
00034             offs=i+1;
00035             line_nr++;
00036             return line;
00037         }
00038         else
00039         {
00040             if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c))
00041             {
00042                 CStringFileFeatures<ST>::cleanup();
00043                 CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr);
00044             }
00045         }
00046     }
00047 
00048     len=0;
00049     offs=file_length;
00050     return NULL;
00051 }
00052 
00053 template <class ST> void CStringFileFeatures<ST>::cleanup()
00054 {
00055     CStringFeatures<ST>::num_vectors=0;
00056     SG_FREE(CStringFeatures<ST>::features);
00057     SG_FREE(CStringFeatures<ST>::symbol_mask_table);
00058     CStringFeatures<ST>::features=NULL;
00059     CStringFeatures<ST>::symbol_mask_table=NULL;
00060 
00061     /* start with a fresh alphabet, but instead of emptying the histogram
00062      * create a new object (to leave the alphabet object alone if it is used
00063      * by others)
00064      */
00065     CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
00066     SG_UNREF(CStringFeatures<ST>::alphabet);
00067     CStringFeatures<ST>::alphabet=alpha;
00068     SG_REF(CStringFeatures<ST>::alphabet);
00069 }
00070 
00071 template <class ST> void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num)
00072 {
00073     CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not"
00074             "supported by StringFileFeatures\n");
00075 }
00076 
00077 template <class ST> void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity)
00078 {
00079     CStringFileFeatures<ST>::cleanup();
00080     uint64_t file_size=file->get_size();
00081     ASSERT(granularity>=1);
00082     ASSERT(CStringFeatures<ST>::alphabet);
00083 
00084     int64_t buffer_size=granularity;
00085     CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);
00086 
00087     uint64_t offs=0;
00088     uint64_t len=0;
00089     CStringFeatures<ST>::max_string_length=0;
00090     CStringFeatures<ST>::num_vectors=0;
00091 
00092     while (true)
00093     {
00094         ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);
00095 
00096         if (line)
00097         {
00098             if (CStringFeatures<ST>::num_vectors > buffer_size)
00099             {
00100                 CStringFeatures<ST>::features = SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size+granularity);
00101                 buffer_size+=granularity;
00102             }
00103 
00104             CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
00105             CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
00106             CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
00107         }
00108         else
00109             break;
00110     }
00111 
00112     CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors);
00113     CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length);
00114     CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram());
00115     CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram());
00116 
00117     if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
00118         CStringFileFeatures<ST>::cleanup();
00119 
00120     CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, CStringFeatures<ST>::num_vectors);
00121 }
00122 
00123 template class CStringFileFeatures<bool>;
00124 template class CStringFileFeatures<char>;
00125 template class CStringFileFeatures<int8_t>;
00126 template class CStringFileFeatures<uint8_t>;
00127 template class CStringFileFeatures<int16_t>;
00128 template class CStringFileFeatures<uint16_t>;
00129 template class CStringFileFeatures<int32_t>;
00130 template class CStringFileFeatures<uint32_t>;
00131 template class CStringFileFeatures<int64_t>;
00132 template class CStringFileFeatures<uint64_t>;
00133 template class CStringFileFeatures<float32_t>;
00134 template class CStringFileFeatures<float64_t>;
00135 template class CStringFileFeatures<floatmax_t>;
00136 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation