SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DelimiterTokenizer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
11 #include <shogun/base/Parameter.h>
13 
14 namespace shogun
15 {
16 
17 CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters) : delimiters(256)
18 {
19  last_idx = 0;
20  skip_consecutive_delimiters = skip_delimiters;
21  init();
22 }
23 
// NOTE(review): the signature line (listing 24) and listing line 26 are
// missing from this extract. The body reads from `orig`, so this is
// presumably the copy constructor -- confirm against the real source.
25 {
// Take over the other object's delimiter table.
27  delimiters = orig.delimiters;
// NOTE(review): the visible lines do not copy last_idx or
// skip_consecutive_delimiters from `orig`; verify whether that is intended.
28  init();
29 }
30 
// Passes last_idx and skip_consecutive_delimiters to SG_ADD together with a
// name string and a description string (presumably registering them with the
// framework's parameter system; Parameter.h is included above -- confirm).
//
// NOTE(review): listing line 34 (the end of the first SG_ADD call) and
// listing line 37 are missing from this extract, so the first call below is
// truncated and one trailing statement is not shown.
31 void CDelimiterTokenizer::init()
32 {
33  SG_ADD(&last_idx, "last_idx", "Index of last token",
35  SG_ADD(&skip_consecutive_delimiters, "skip_consecutive_delimiters",
36  "Whether to skip consecutive delimiters or not", MS_NOT_AVAILABLE);
38 }
39 
// NOTE(review): the signature line (listing 40) and listing line 43 are
// missing from this extract; presumably the text setter -- confirm.
41 {
// Restart scanning from the beginning.
42  last_idx = 0;
44 }
45 
46 const char* CDelimiterTokenizer::get_name() const
47 {
48  return "DelimiterTokenizer";
49 }
50 
// NOTE(review): the signature line (listing 51) and the condition line
// (listing 53) are missing from this extract; the `else` below pairs with
// that missing `if`. Presumably this is has_next() and the condition tests
// skip_consecutive_delimiters -- confirm against the real source.
52 {
54  {
// Look for any remaining character, from last_idx on, that is not flagged
// in the delimiter table.
55  for (index_t i=last_idx; i<text.size(); i++)
56  {
57  if (! delimiters[(uint8_t) text[i]])
58  return true;
59  }
// Only delimiters (or nothing) remain.
60  return false;
61  }
62  else
// In this branch any unread position counts, delimiter or not.
63  return last_idx<text.size();
64 }
65 
// NOTE(review): the signature line (listing 66) and listing line 68 are
// missing from this extract; presumably a helper that configures the
// tokenizer for whitespace -- confirm.
67 {
// Flag the space and tab characters as delimiters.
69  delimiters[' '] = 1;
70  delimiters['\t'] = 1;
71 }
72 
74 {
75  memset(delimiters, 0, sizeof (delimiters));
76 }
77 
// NOTE(review): the signature line (listing 78) and listing line 82 are
// missing from this extract. Presumably this is next_token_idx(start), with
// `start` an output parameter, and the braced while-loop below is guarded by
// a skip_consecutive_delimiters test on the missing line -- confirm.
//
// Writes the start index of the next token into `start`, advances last_idx,
// and returns the index at which the token ends.
79 {
// The next token begins where the previous call stopped.
80  start = last_idx;
81 
83  {
// Step over characters flagged as delimiters.
// NOTE(review): no bound check against text.size() is visible here; this
// appears to rely on the caller having checked that a token remains.
84  while(delimiters[(uint8_t) text[start]])
85  start++;
86  }
87 
// If `start` sits on a non-delimiter, move last_idx forward to the next
// delimiter or to text.size(). Otherwise last_idx is left equal to `start`.
88  if (! delimiters[(uint8_t) text[start]])
89  {
90  for (last_idx=start+1; last_idx<text.size(); last_idx++)
91  {
92  if (delimiters[(uint8_t) text[last_idx]])
93  break;
94  }
95  }
96 
// Return the end index, then post-increment so the next call starts one
// position past it.
97  return last_idx++;
98 }
99 
// NOTE(review): the signature line (listing 100) and listing lines 102 and
// 104 are missing from this extract, so the creation of `t` is not shown.
// Presumably this is get_copy() -- confirm.
101 {
// Give the new object this tokenizer's delimiter table.
103  t->delimiters = delimiters;
105  return t;
106 }
107 
// NOTE(review): the signature line (listing 108) is missing from this
// extract; presumably a setter taking `bool skip_delimiters` -- confirm.
109 {
// Store the flag in skip_consecutive_delimiters.
110  skip_consecutive_delimiters = skip_delimiters;
111 }
112 
// NOTE(review): the signature line (listing 113) and the only body line
// (listing 115) are missing from this extract, so the body appears empty.
// Presumably the getter matching the setter above -- confirm.
114 {
116 }
117 }

SHOGUN Machine Learning Toolbox - Documentation