Nuspell
spellchecker
string_utils.hxx
Go to the documentation of this file.
1 /* Copyright 2018 Dimitrij Mijoski, Sander van Geloven
2  * Copyright 2016-2017 Dimitrij Mijoski
3  *
4  * This file is part of Nuspell.
5  *
6  * Nuspell is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Nuspell is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
18  */
19 
25 #ifndef NUSPELL_STRING_UTILS_HXX
26 #define NUSPELL_STRING_UTILS_HXX
27 
28 #include <algorithm>
29 #include <iterator>
30 #include <locale>
31 #include <stack>
32 #include <string>
33 #include <vector>
34 
35 namespace nuspell {
/**
 * @brief Picks the narrow or the wide spelling of a string literal.
 *
 * Expands to a call of ::nuspell::literal_choose<T> that receives both the
 * literal `x` as written and its `L`-prefixed wide form.
 */
#define LITERAL(T, x) ::nuspell::literal_choose<T>(x, L##x)

/**
 * @brief Selects one of two C strings depending on the character type.
 *
 * Only declared here; the explicit specializations for char and wchar_t
 * below provide the definitions.
 *
 * @param narrow the char version of the string.
 * @param wide the wchar_t version of the string.
 */
template <class CharT>
constexpr auto literal_choose(const char* narrow, const wchar_t* wide);

/** Specialization for char: returns the narrow string. */
template <>
constexpr auto literal_choose<char>(const char* narrow, const wchar_t*)
{
	return narrow;
}

/** Specialization for wchar_t: returns the wide string. */
template <>
constexpr auto literal_choose<wchar_t>(const char*, const wchar_t* wide)
{
	return wide;
}
50 
/**
 * @brief Splits a string on any one of a set of single-character separators.
 *
 * Every separator occurrence ends a token, so consecutive separators and a
 * leading or trailing separator produce empty tokens. At least one token is
 * always written, even for an empty input string.
 *
 * @param s string to split.
 * @param sep separator set; forwarded as-is to basic_string::find_first_of.
 * @param out output iterator that receives the tokens as strings.
 * @return the output iterator positioned after the last written token.
 */
template <class CharT, class SepT, class OutIt>
auto split_on_any_of(const std::basic_string<CharT>& s, const SepT& sep,
                     OutIt out)
{
	using size_type = typename std::basic_string<CharT>::size_type;
	for (size_type token_start = 0;;) {
		const size_type sep_pos = s.find_first_of(sep, token_start);
		// When sep_pos is npos the requested length runs past the end
		// of s; substr() clamps it, which is well defined.
		*out++ = s.substr(token_start, sep_pos - token_start);
		if (sep_pos == s.npos)
			break;
		// Each separator is exactly one character wide.
		token_start = sep_pos + 1;
	}
	return out;
}
79 
/**
 * @brief Splits a string on a single-character separator.
 *
 * Delegates to split_on_any_of() with the one character as separator.
 *
 * @param s string to split.
 * @param sep separator character.
 * @param out output iterator that receives the tokens as strings.
 * @return the output iterator positioned after the last written token.
 */
template <class CharT, class OutIt>
auto split(const std::basic_string<CharT>& s, CharT sep, OutIt out)
{
	auto after_last = split_on_any_of(s, sep, out);
	return after_last;
}
96 
/**
 * @brief Splits a string on a (possibly multi-character) separator string.
 *
 * Every occurrence of sep ends a token, so adjacent separators and a leading
 * or trailing separator produce empty tokens. At least one token is always
 * written.
 *
 * An empty separator cannot advance the search position, so it is handled
 * up front: the whole of s is written as a single token.
 *
 * @param s string to split.
 * @param sep separator string.
 * @param out output iterator that receives the tokens as strings.
 * @return the output iterator positioned after the last written token.
 */
template <class CharT, class OutIt>
auto split(const std::basic_string<CharT>& s,
           const std::basic_string<CharT>& sep, OutIt out)
{
	if (sep.empty()) {
		// find("") matches at every position without consuming
		// anything, which would otherwise loop forever.
		*out++ = s;
		return out;
	}
	using size_type = typename std::basic_string<CharT>::size_type;
	size_type token_start = 0;
	for (;;) {
		const size_type sep_pos = s.find(sep, token_start);
		// With sep_pos == npos the length overshoots; substr() clamps.
		*out++ = s.substr(token_start, sep_pos - token_start);
		if (sep_pos == s.npos)
			break;
		token_start = sep_pos + sep.size();
	}
	return out;
}
120 
/**
 * @brief Splits a string on a separator given as a C string.
 *
 * Builds a basic_string from sep and forwards to the string-separator
 * overload of split().
 *
 * @param s string to split.
 * @param sep separator as a null-terminated character array.
 * @param out output iterator that receives the tokens as strings.
 * @return the output iterator positioned after the last written token.
 */
template <class CharT, class OutIt>
auto split(const std::basic_string<CharT>& s, const CharT* sep, OutIt out)
{
	const std::basic_string<CharT> separator(sep);
	return split(s, separator, out);
}
135 
/**
 * @brief Splits a string on a separator and stores the tokens in a vector.
 *
 * The vector is emptied first, so afterwards it holds only the tokens of s.
 *
 * @param s string to split.
 * @param sep separator; passed on unchanged to split().
 * @param v vector that is cleared and then filled with the tokens.
 */
template <class CharT, class CharOrStr>
auto split_v(const std::basic_string<CharT>& s, const CharOrStr& sep,
             std::vector<std::basic_string<CharT>>& v)
{
	v.clear();
	auto sink = std::back_inserter(v);
	split(s, sep, sink);
}
152 
/**
 * @brief Returns the part of a string that precedes the first separator.
 *
 * When sep does not occur in s, find() yields npos and the whole string is
 * returned.
 *
 * @param s string to take the first token from.
 * @param sep separator; passed on unchanged to basic_string::find.
 * @return copy of s up to, but not including, the first occurrence of sep.
 */
template <class CharT, class CharOrStr>
auto split_first(const std::basic_string<CharT>& s, const CharOrStr& sep)
    -> std::basic_string<CharT>
{
	return s.substr(0, s.find(sep));
}
167 
/**
 * @brief Splits a string into its whitespace-delimited words.
 *
 * Runs of whitespace act as one delimiter and leading or trailing whitespace
 * is ignored, so no empty tokens are written. A character counts as
 * whitespace when the ctype facet of loc classifies it as space.
 *
 * @param s string to split.
 * @param out output iterator that receives the words as strings.
 * @param loc locale whose ctype<CharT> facet decides what whitespace is.
 * @return the output iterator positioned after the last written word.
 */
template <class CharT, class OutIt>
auto split_on_whitespace(const std::basic_string<CharT>& s, OutIt out,
                         const std::locale& loc = std::locale()) -> OutIt
{
	const auto& facet = std::use_facet<std::ctype<CharT>>(loc);
	const auto is_blank = [&facet](const CharT& ch) {
		return facet.is(std::ctype_base::space, ch);
	};
	const auto last = s.end();
	for (auto cursor = s.begin(); cursor != last;) {
		// Skip the whitespace in front of the next word.
		const auto word_first = std::find_if_not(cursor, last, is_blank);
		if (word_first == last)
			break;
		const auto word_last = std::find_if(word_first, last, is_blank);
		*out++ = std::basic_string<CharT>(word_first, word_last);
		cursor = word_last;
	}
	return out;
}
198 
/**
 * @brief Splits a string on whitespace and stores the words in a vector.
 *
 * The vector is emptied first, so afterwards it holds only the words of s.
 *
 * @param s string to split.
 * @param v vector that is cleared and then filled with the words.
 * @param loc locale passed on to split_on_whitespace().
 */
template <class CharT>
auto split_on_whitespace_v(const std::basic_string<CharT>& s,
                           std::vector<std::basic_string<CharT>>& v,
                           const std::locale& loc = std::locale()) -> void
{
	v.clear();
	auto sink = std::back_inserter(v);
	split_on_whitespace(s, sink, loc);
}
216 
/**
 * @brief Removes from a string every character found in a given set.
 *
 * The string is modified in place. An empty set leaves it untouched.
 *
 * @param s string to remove characters from.
 * @param erase_chars set of characters to remove, given as a string.
 * @return reference to s.
 */
template <class CharT>
auto& erase_chars(std::basic_string<CharT>& s,
                  const std::basic_string<CharT>& erase_chars)
{
	if (!erase_chars.empty()) {
		const auto unwanted = [&erase_chars](CharT ch) {
			return erase_chars.find(ch) != erase_chars.npos;
		};
		// Erase-remove: compact the kept characters, then cut the tail.
		s.erase(std::remove_if(s.begin(), s.end(), unwanted), s.end());
	}
	return s;
}
230 
/**
 * @brief Replaces every occurrence of one character with another, in place.
 *
 * @param s string to modify.
 * @param from character to look for.
 * @param to character written in place of each occurrence of from.
 * @return reference to s.
 */
template <class CharT>
auto& replace_char(std::basic_string<CharT>& s, CharT from, CharT to)
{
	std::replace(s.begin(), s.end(), from, to);
	return s;
}
239 
/**
 * @brief Tests whether a word has the shape of a number.
 *
 * Accepted shape: an optional leading '-', followed by one or more groups of
 * the digits '0'..'9', with exactly one of '.', ',' or '-' between
 * neighbouring groups. The word must end with a digit group.
 *
 * @param s word to test.
 * @return true if s has the shape described above, false otherwise
 *         (including for an empty string).
 */
template <class CharT>
auto is_number(const std::basic_string<CharT>& s) -> bool
{
	const auto is_digit = [](CharT ch) { return !(ch < '0' || ch > '9'); };
	const auto last = s.end();
	auto pos = s.begin();
	if (pos == last)
		return false;
	if (*pos == '-')
		++pos; // optional sign
	while (pos != last) {
		const auto group_end = std::find_if_not(pos, last, is_digit);
		if (group_end == pos)
			return false; // a digit group must not be empty
		if (group_end == last)
			return true; // ended on a digit group
		const auto sep = *group_end;
		const bool is_separator = sep == '.' || sep == ',' || sep == '-';
		if (!is_separator)
			return false;
		pos = group_end + 1;
	}
	// Reached only when the word ends right after the sign or a separator.
	return false;
}
272 
/**
 * @brief Matches a whole sequence against a very simple regex-like pattern.
 *
 * A pattern element is one value, optionally followed by a quantifier:
 * '?' makes the preceding element optional, '*' lets it repeat zero or more
 * times, and no quantifier requires exactly one occurrence. The complete
 * data range has to be consumed for a match.
 *
 * Matching is a depth-first search over (data position, pattern position)
 * states kept on an explicit stack. For a quantified element the state that
 * consumes a data value is pushed last and is therefore tried first.
 *
 * Both iterator types only need to be forward iterators; positions are
 * advanced with std::next.
 *
 * @param data_first begin of the data to match.
 * @param data_last end of the data to match.
 * @param pat_first begin of the pattern.
 * @param pat_last end of the pattern.
 * @param eq predicate called as eq(data value, pattern value).
 * @return true if the whole data range matches the whole pattern.
 */
template <class DataIter, class PatternIter, class FuncEq = std::equal_to<>>
auto match_simple_regex(DataIter data_first, DataIter data_last,
                        PatternIter pat_first, PatternIter pat_last,
                        FuncEq eq = FuncEq()) -> bool
{
	using PatternChar =
	    typename std::iterator_traits<PatternIter>::value_type;

	auto pending = std::stack<std::pair<DataIter, PatternIter>>();
	pending.emplace(data_first, pat_first);
	while (!pending.empty()) {
		const auto state = pending.top();
		pending.pop();
		const auto data_it = state.first;
		const auto pat_it = state.second;

		// The first state that exhausts the pattern decides the
		// result; states still on the stack are not examined.
		if (pat_it == pat_last)
			return data_it == data_last;

		// Look one element ahead for a quantifier; 0 means "none".
		const auto pat_next = std::next(pat_it);
		PatternChar quantifier = 0;
		if (pat_next != pat_last)
			quantifier = *pat_next;

		const bool matches_here =
		    data_it != data_last && eq(*data_it, *pat_it);

		switch (quantifier) {
		case '?': {
			const auto after_quantifier = std::next(pat_next);
			// Zero occurrences.
			pending.emplace(data_it, after_quantifier);
			// One occurrence.
			if (matches_here)
				pending.emplace(std::next(data_it),
				                after_quantifier);
			break;
		}
		case '*':
			// Zero (further) occurrences.
			pending.emplace(data_it, std::next(pat_next));
			// One more occurrence; stay on the same element.
			if (matches_here)
				pending.emplace(std::next(data_it), pat_it);
			break;
		default:
			// Plain element: exactly one occurrence.
			if (matches_here)
				pending.emplace(std::next(data_it), pat_next);
			break;
		}
	}
	return false;
}
316 
/**
 * @brief Range convenience overload of match_simple_regex().
 *
 * Obtains begin/end of both ranges and forwards them, together with eq, to
 * the iterator overload.
 *
 * @param data range holding the data to match.
 * @param pattern range holding the pattern.
 * @param eq predicate passed on to the iterator overload.
 * @return result of the iterator overload.
 */
template <class DataRange, class PatternRange, class FuncEq = std::equal_to<>>
auto match_simple_regex(const DataRange& data, const PatternRange& pattern,
                        FuncEq eq = FuncEq())
{
	using std::begin;
	using std::end;
	auto data_first = begin(data);
	auto data_last = end(data);
	auto pat_first = begin(pattern);
	auto pat_last = end(pattern);
	return match_simple_regex(data_first, data_last, pat_first, pat_last,
	                          eq);
}
325 
326 } // namespace nuspell
327 #endif // NUSPELL_STRING_UTILS_HXX
Definition: main.cxx:622
Library main namespace.
Definition: aff_data.cxx:74
auto split_on_whitespace_v(const std::basic_string< CharT > &s, std::vector< std::basic_string< CharT >> &v, const std::locale &loc=std::locale()) -> void
Splits on whitespace, outputs to vector of strings.
Definition: string_utils.hxx:209
auto split_v(const std::basic_string< CharT > &s, const CharOrStr &sep, std::vector< std::basic_string< CharT >> &v)
Splits string on separator, outputs to vector of strings.
Definition: string_utils.hxx:146
auto split(const std::basic_string< CharT > &s, CharT sep, OutIt out)
Splits string on single char separator.
Definition: string_utils.hxx:92
auto split_first(const std::basic_string< CharT > &s, const CharOrStr &sep) -> std::basic_string< CharT >
Gets the first token of a split string.
Definition: string_utils.hxx:161
auto split_on_whitespace(const std::basic_string< CharT > &s, OutIt out, const std::locale &loc=std::locale()) -> OutIt
Splits on whitespace.
Definition: string_utils.hxx:181
auto split_on_any_of(const std::basic_string< CharT > &s, const SepT &sep, OutIt out)
Splits string on set of single char separators.
Definition: string_utils.hxx:63
auto is_number(const std::basic_string< CharT > &s) -> bool
Tests if word is a number.
Definition: string_utils.hxx:248