SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DelimiterTokenizer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/SGObject.h>
13 #include <shogun/base/Parameter.h>
14 #include <shogun/lib/SGVector.h>
15 
16 #include <string.h>
17 
18 namespace shogun
19 {
20 
21 CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters) : delimiters(256)
22 {
23  last_idx = 0;
24  skip_consecutive_delimiters = skip_delimiters;
25  init();
26 }
27 
// NOTE(review): the signature (listing line 28) and listing line 30 are not
// present in this extract; only the statements below are visible.
// NOTE(review): neither visible statement assigns last_idx or
// skip_consecutive_delimiters -- confirm they are set on the elided line or
// elsewhere before they are read.
29 {
// Take over the delimiter table of `orig`.
31  delimiters = orig.delimiters;
// Run the same init() routine the other constructor calls.
32  init();
33 }
34 
// Shared setup routine called from the constructors: passes the members
// last_idx and skip_consecutive_delimiters to SG_ADD together with a name and
// a description string.
// NOTE(review): listing lines 38 and 41 are not present in this extract, so
// the first SG_ADD call appears without its closing argument(s) and any
// statement that followed the second SG_ADD is not visible here.
35 void CDelimiterTokenizer::init()
36 {
// SG_ADD for last_idx; the remainder of this call is on the elided line.
37  SG_ADD(&last_idx, "last_idx", "Index of last token",
// SG_ADD for skip_consecutive_delimiters, with MS_NOT_AVAILABLE as the final
// argument.
39  SG_ADD(&skip_consecutive_delimiters, "skip_consecutive_delimiters",
40  "Whether to skip consecutive delimiters or not", MS_NOT_AVAILABLE);
42 }
43 
// NOTE(review): the signature (listing line 44) and listing line 47 are not
// present in this extract.
45 {
// Rewind the scan position to the first character.
46  last_idx = 0;
48 }
49 
50 const char* CDelimiterTokenizer::get_name() const
51 {
52  return "DelimiterTokenizer";
53 }
54 
// NOTE(review): the signature (listing line 55) and the `if` that pairs with
// the `else` below (listing line 57) are not present in this extract.
56 {
58  {
// Look from last_idx to the end of text for any character that is not marked
// in the delimiter table; finding one yields true.
59  for (index_t i=last_idx; i<text.size(); i++)
60  {
61  if (! delimiters[(uint8_t) text[i]])
62  return true;
63  }
// Only delimiter characters (or nothing) remained.
64  return false;
65  }
66  else
// Otherwise the result is simply whether last_idx is still inside text.
67  return last_idx<text.size();
68 }
69 
// NOTE(review): the signature (listing line 70) and listing line 72 are not
// present in this extract.
71 {
// Mark the space and tab characters in the delimiter table.
73  delimiters[' '] = 1;
74  delimiters['\t'] = 1;
75 }
76 
78 {
79  memset(delimiters, 0, sizeof (delimiters));
80 }
81 
// NOTE(review): the signature (listing line 82) is not present in this
// extract; `start` is written below, so it is presumably an output parameter.
83 {
// Begin where the previous call stopped.
84  start = last_idx;
85 
// NOTE(review): listing line 86 is elided, so the condition guarding this
// brace block is not visible here.
87  {
// Advance `start` past characters marked as delimiters. This loop does not
// compare `start` against text.size() -- NOTE(review): confirm callers
// guarantee a non-delimiter character remains.
88  while(delimiters[(uint8_t) text[start]])
89  start++;
90  }
91 
// When `start` sits on a non-delimiter, move last_idx forward until it
// reaches a delimiter or text.size(), whichever comes first.
92  if (! delimiters[(uint8_t) text[start]])
93  {
94  for (last_idx=start+1; last_idx<text.size(); last_idx++)
95  {
96  if (delimiters[(uint8_t) text[last_idx]])
97  break;
98  }
99  }
100 
// Return the current last_idx, then step it by one (post-increment) so the
// next call starts after this position.
101  return last_idx++;
102 }
103 
// NOTE(review): the signature (listing line 104) and listing lines 106 and
// 108 are not present in this extract; `t` is declared on one of them.
105 {
// Give the object `t` this object's delimiter table.
107  t->delimiters = delimiters;
109  return t;
110 }
111 
// NOTE(review): the signature (listing line 112) is not present in this
// extract.
113 {
// Store the supplied flag in skip_consecutive_delimiters.
114  skip_consecutive_delimiters = skip_delimiters;
115 }
116 
// NOTE(review): the signature (listing line 117) and the only statement of
// this body (listing line 119) are not present in this extract.
118 {
120 }
121 }

SHOGUN Machine Learning Toolbox - Documentation