SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
DelimiterTokenizer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/SGObject.h>
13 #include <shogun/base/Parameter.h>
14 #include <shogun/lib/SGVector.h>
15 
16 #include <string.h>
17 
18 namespace shogun
19 {
20 
/**
 * Constructs a tokenizer.
 *
 * `delimiters` is built with the argument 256, which matches the range of
 * the (uint8_t) casts used to index it elsewhere in this file. The scan
 * position `last_idx` starts at 0, and init() runs last.
 *
 * @param skip_delimiters value stored in skip_consecutive_delimiters
 */
21 CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters) : delimiters(256)
22 {
23  last_idx = 0;
24  skip_consecutive_delimiters = skip_delimiters;
25  init();
26 }
27 
29 {
    // NOTE(review): the signature (listing line 28) and listing line 30 are
    // not present in this rendering. The body reads from `orig`, so this is
    // presumably the copy constructor -- confirm against the real source.
    // Visible behaviour: take over orig's delimiter table, then call init().
31  delimiters = orig.delimiters;
32  init();
33 }
34 
/**
 * Shared setup called from the constructors.
 *
 * Passes `last_idx` and `skip_consecutive_delimiters` to SG_ADD together
 * with a name and a description string each.
 *
 * NOTE(review): listing lines 38 and 41 are missing from this rendering, so
 * the closing arguments of the first SG_ADD call and one further statement
 * before the closing brace are not visible here.
 */
35 void CDelimiterTokenizer::init()
36 {
37  SG_ADD(&last_idx, "last_idx", "Index of last token",
39  SG_ADD(&skip_consecutive_delimiters, "skip_consecutive_delimiters",
40  "Whether to skip consecutive delimiters or not", MS_NOT_AVAILABLE);
42 }
43 
45 {
    // NOTE(review): the signature (listing line 44) and listing line 47 are
    // not present in this rendering; presumably this is set_text -- confirm.
    // Visible behaviour: rewind the scan position to the start.
46  last_idx = 0;
48 }
49 
/** @return the string literal "DelimiterTokenizer" */
50 const char* CDelimiterTokenizer::get_name() const
51 {
52  return "DelimiterTokenizer";
53 }
54 
56 {
    // NOTE(review): the signature (listing line 55) and the `if` condition
    // (listing line 57) are not present in this rendering. The condition is
    // presumably skip_consecutive_delimiters -- confirm.
    //
    // First branch: scan from last_idx to the end of text and report true as
    // soon as a character that is not flagged in `delimiters` is found;
    // report false if only delimiters (or nothing) remain.
58  {
59  for (index_t i=last_idx; i<text.size(); i++)
60  {
61  if (! delimiters[(uint8_t) text[i]])
62  return true;
63  }
64  return false;
65  }
    // Other branch: true whenever the scan position is still inside text.
66  else
67  return last_idx<text.size();
68 }
69 
71 {
    // NOTE(review): the signature (listing line 70) and listing line 72 are
    // not present in this rendering.
    // Visible behaviour: flag the space and tab characters as delimiters.
73  delimiters[' '] = 1;
74  delimiters['\t'] = 1;
75 }
76 
78 {
    // NOTE(review): the signature (listing line 77) is not present in this
    // rendering.
    // Zeroes sizeof (delimiters) bytes starting at `delimiters`.
    // NOTE(review): the constructor builds `delimiters` with `delimiters(256)`
    // and other code assigns it as a whole object. If it is a vector-like
    // object rather than a raw array, sizeof yields the size of the object
    // itself, not of its 256 entries -- confirm the member's type.
79  memset(delimiters, 0, sizeof (delimiters));
80 }
81 
83 {
    // NOTE(review): the signature (listing line 82) and the statement guarding
    // the first inner block (listing line 86) are not present in this
    // rendering; presumably this is next_token_idx(index_t& start) -- confirm.
    //
    // The token begins where the previous call stopped.
84  start = last_idx;
85 
    // Step over any run of delimiter characters at the current position.
    // NOTE(review): this loop does not compare `start` against text.size();
    // if every remaining character is a delimiter it indexes past the end.
    // Confirm that callers always check for a remaining token first.
87  {
88  while(delimiters[(uint8_t) text[start]])
89  start++;
90  }
91 
    // If `start` sits on a non-delimiter, advance last_idx to the first
    // delimiter after it, or to text.size() when none is found.
92  if (! delimiters[(uint8_t) text[start]])
93  {
94  for (last_idx=start+1; last_idx<text.size(); last_idx++)
95  {
96  if (delimiters[(uint8_t) text[last_idx]])
97  break;
98  }
99  }
100 
    // Return the current last_idx value, then post-increment the member so
    // the next call begins one position further on.
101  return last_idx++;
102 }
103 
105 {
    // NOTE(review): the signature (listing line 104), the declaration of `t`
    // (listing line 106) and listing line 108 are not present in this
    // rendering; presumably this is get_copy() -- confirm.
    // Visible behaviour: hand this object's delimiter table to `t` and
    // return `t`.
107  t->delimiters = delimiters;
109  return t;
110 }
111 
113 {
    // NOTE(review): the signature (listing line 112) is not present in this
    // rendering; presumably set_skip_delimiters(bool skip_delimiters).
    // Visible behaviour: overwrite the skip flag with the caller's value.
114  skip_consecutive_delimiters = skip_delimiters;
115 }
116 
118 {
    // NOTE(review): the signature (listing line 117) and the only body line
    // (listing line 119) are not present in this rendering; consult the
    // original source file for the contents of this definition.
120 }
121 }
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:221
int32_t index_t
Definition: common.h:62
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
virtual index_t next_token_idx(index_t &start)
SGVector< char > text
Definition: Tokenizer.h:73
int32_t size() const
Definition: SGVector.h:113
void set_skip_delimiters(bool skip_delimiters)
CDelimiterTokenizer * get_copy()
virtual const char * get_name() const
CDelimiterTokenizer(bool skip_delimiters=false)
All classes and functions are contained in the shogun namespace
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual void set_text(SGVector< char > txt)
#define SG_ADD(...)
Definition: SGObject.h:84

SHOGUN Machine Learning Toolbox - Documentation