SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
StringFileFeatures.cpp
Go to the documentation of this file.
2 
3 namespace shogun
4 {
5 
6 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures() : CStringFeatures<ST>(), file(NULL)
7 {
8 }
9 
// Construct file-backed string features.
//
// fname: path handed to the CMemoryMappedFile<ST> constructor.
// alpha: alphabet type forwarded to the CStringFeatures<ST> base constructor.
//
// The newly created CMemoryMappedFile<ST> is stored in the member `file`.
// NOTE(review): listing line 14 is absent from this extract, so any statement
// that followed the allocation is not visible here - confirm against the
// original file.
10 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha)
11 : CStringFeatures<ST>(alpha)
12 {
13  file = new CMemoryMappedFile<ST>(fname);
15 }
16 
// NOTE(review): the signature of this definition (listing line 17) is missing
// from this extract; presumably this is the destructor body - confirm against
// the original file.
18 {
// Hand the member `file` to SG_UNREF.
19  SG_UNREF(file);
// NOTE(review): listing line 20 is also absent from this extract.
21 }
22 
// Locate the next '\n'-terminated line in the mapped file, starting at `offs`.
//
// len:         out - number of elements in the line, excluding the '\n';
//              set to 0 when no line is found.
// offs:        in/out - index at which scanning starts; on success it is
//              advanced to the element just past the '\n', otherwise it is set
//              to file_length.
// line_nr:     in/out - incremented once for every line returned; also printed
//              in the invalid-character error message.
// file_length: upper bound (exclusive) of the indices that are scanned.
//
// Returns a pointer into the buffer obtained from file->get_map() at the start
// of the line, or NULL when no '\n' is found in [offs, file_length). Elements
// after the last '\n' are therefore never returned as a line.
23 template <class ST> ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
24 {
25  ST* s = file->get_map();
26  for (uint64_t i=offs; i<file_length; i++)
27  {
28  ST c=s[i];
29 
30  if (c == '\n')
31  {
// The returned pointer aliases the mapped buffer; nothing is copied.
32  ST* line=&s[offs];
33  len=i-offs;
34  offs=i+1;
35  line_nr++;
36  return line;
37  }
38  else
39  {
// Every non-newline element is checked against the alphabet after being
// cast to uint8_t.
40  if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c))
41  {
// NOTE(review): listing line 42 is absent from this extract; a statement may
// have preceded the error report - confirm against the original file.
43  SG_CLASS_ERROR(CStringFeatures<ST>, "Invalid character (%c) in line %d\n", c, line_nr)
44  }
45  }
46  }
47 
// No newline found before file_length: report "no more lines".
48  len=0;
49  offs=file_length;
50  return NULL;
51 }
52 
// Cleanup routine for the file-backed string features.
//
// NOTE(review): listing lines 55-59 and 66-68 are absent from this extract.
// Only the creation of a replacement alphabet is visible; what happens before
// it, and how `alpha` is used afterwards, cannot be determined from here.
53 template <class ST> void CStringFileFeatures<ST>::cleanup()
54 {
60 
61  /* start with a fresh alphabet, but instead of emptying the histogram
62  * create a new object (to leave the alphabet object alone if it is used
63  * by others)
64  */
// The new CAlphabet is built from the current alphabet's get_alphabet() value.
65  CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
69 }
70 
71 template <class ST> void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num)
72 {
73  SG_CLASS_ERROR(CStringFeatures<ST>, "Cleaning single feature vector not"
74  "supported by StringFileFeatures\n")
75 }
76 
// Walk the mapped file line by line via get_line() and size the `features`
// array to match, growing it in steps of `granularity`.
//
// granularity: number of SGString<ST> slots allocated initially and added on
//              each growth step; must be >= 1 (checked with ASSERT).
//
// NOTE(review): this extract is missing listing lines 79, 82, 89-90, 104-106,
// 112-113, 118 and 120. Among other things, the statements that record each
// returned line and the body of the final alphabet check are not visible -
// confirm against the original file.
77 template <class ST> void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity)
78 {
80  uint64_t file_size=file->get_size();
81  ASSERT(granularity>=1)
83 
// Initial capacity of the features array equals one granularity step.
84  int64_t buffer_size=granularity;
85  CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);
86 
87  uint64_t offs=0;
88  uint64_t len=0;
91 
92  while (true)
93  {
// num_vectors is passed as get_line's line counter, so it is incremented for
// every line that get_line returns.
94  ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);
95 
96  if (line)
97  {
// Grow the array by one granularity step once the line count exceeds the
// current capacity.
98  if (CStringFeatures<ST>::num_vectors > buffer_size)
99  {
100  CStringFeatures<ST>::features = SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size, buffer_size+granularity);
101  buffer_size+=granularity;
102  }
103 
107  }
108  else
// get_line returned NULL: no further newline-terminated line is available.
109  break;
110  }
111 
// Log the alphabet's histogram statistics.
114  SG_CLASS_INFO(CStringFeatures<ST>,"max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram())
115  SG_CLASS_INFO(CStringFeatures<ST>,"num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram())
116 
// NOTE(review): the statement guarded by this condition (listing line 118) is
// missing from this extract.
117  if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
119 
121 }
122 
123 template class CStringFileFeatures<bool>;
124 template class CStringFileFeatures<char>;
125 template class CStringFileFeatures<int8_t>;
126 template class CStringFileFeatures<uint8_t>;
127 template class CStringFileFeatures<int16_t>;
128 template class CStringFileFeatures<uint16_t>;
129 template class CStringFileFeatures<int32_t>;
130 template class CStringFileFeatures<uint32_t>;
131 template class CStringFileFeatures<int64_t>;
132 template class CStringFileFeatures<uint64_t>;
133 template class CStringFileFeatures<float32_t>;
134 template class CStringFileFeatures<float64_t>;
135 template class CStringFileFeatures<floatmax_t>;
136 }
#define SG_CLASS_INFO(c,...)
Definition: SGIO.h:123
Template class StringFeatures implements a list of strings.
Definition: WDSVMOcas.h:25
ST * get_line(uint64_t &len, uint64_t &offs, int32_t &line_nr, uint64_t file_length)
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:23
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
CMemoryMappedFile< ST > * file
#define SG_REF(x)
Definition: SGObject.h:51
#define SG_CLASS_ERROR(c,...)
Definition: SGIO.h:131
#define ASSERT(x)
Definition: SGIO.h:201
void fetch_meta_info_from_file(int32_t granularity=1048576)
static T max(T a, T b)
Definition: Math.h:168
#define SG_UNREF(x)
Definition: SGObject.h:52
File based string features.
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
virtual void cleanup_feature_vector(int32_t num)

SHOGUN Machine Learning Toolbox - Documentation