Nuspell
spellchecker
utils.hxx
Go to the documentation of this file.
1 /* Copyright 2016-2019 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
24 #ifndef NUSPELL_UTILS_HXX
25 #define NUSPELL_UTILS_HXX
26 
27 #include "structures.hxx"
28 
29 #include <locale>
30 #include <clocale>
31 
32 #if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || \
33  (defined(__APPLE__) && defined(__MACH__)))
34 #include <unistd.h>
35 #endif
36 
37 #include <unicode/locid.h>
38 
39 struct UConverter; // unicode/ucnv.h
40 
41 namespace nuspell {
42 
43 auto wide_to_utf8(const std::wstring& in, std::string& out) -> void;
44 auto wide_to_utf8(const std::wstring& in) -> std::string;
45 
46 auto utf8_to_wide(const std::string& in, std::wstring& out) -> bool;
47 auto utf8_to_wide(const std::string& in) -> std::wstring;
48 
49 auto utf8_to_16(const std::string& in) -> std::u16string;
50 auto utf8_to_16(const std::string& in, std::u16string& out) -> bool;
51 
52 auto is_ascii(char c) -> bool;
53 auto is_all_ascii(const std::string& s) -> bool;
54 
55 auto latin1_to_ucs2(const std::string& s) -> std::u16string;
56 auto latin1_to_ucs2(const std::string& s, std::u16string& out) -> void;
57 
58 auto is_all_bmp(const std::u16string& s) -> bool;
59 
60 auto to_wide(const std::string& in, const std::locale& inloc, std::wstring& out)
61  -> bool;
62 auto to_wide(const std::string& in, const std::locale& inloc) -> std::wstring;
63 auto to_narrow(const std::wstring& in, std::string& out,
64  const std::locale& outloc) -> bool;
65 auto to_narrow(const std::wstring& in, const std::locale& outloc)
66  -> std::string;
67 
68 auto to_upper_ascii(std::string& s) -> void;
69 
70 auto is_locale_known_utf8(const std::locale& loc) -> bool;
71 
72 auto wide_to_icu(const std::wstring& in, icu::UnicodeString& out) -> bool;
73 auto icu_to_wide(const icu::UnicodeString& in, std::wstring& out) -> bool;
74 
75 auto to_upper(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
76 auto to_title(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
77 auto to_lower(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
78 
79 auto classify_casing(const std::wstring& s) -> Casing;
80 
81 auto has_uppercase_at_compound_word_boundary(const std::wstring& word, size_t i)
82  -> bool;
83 
85  UConverter* cnv = nullptr;
86 
87  public:
88  Encoding_Converter() = default;
89  explicit Encoding_Converter(const char* enc);
90  explicit Encoding_Converter(const std::string& enc)
91  : Encoding_Converter(enc.c_str())
92  {
93  }
96  Encoding_Converter(Encoding_Converter&& other) noexcept
97  {
98  cnv = other.cnv;
99  cnv = nullptr;
100  }
101  auto operator=(const Encoding_Converter& other) -> Encoding_Converter&;
102  auto operator=(Encoding_Converter&& other) noexcept
104  {
105  std::swap(cnv, other.cnv);
106  return *this;
107  }
108  auto to_wide(const std::string& in, std::wstring& out) -> bool;
109  auto to_wide(const std::string& in) -> std::wstring;
110  auto valid() -> bool { return cnv != nullptr; }
111 };
112 
113 //#if _POSIX_VERSION >= 200809L
114 #ifdef _POSIX_VERSION
116  locale_t old_loc = nullptr;
117 
118  public:
120  : old_loc{uselocale(newlocale(0, "C", nullptr))}
121  {
122  }
124  {
125  auto new_loc = uselocale(old_loc);
126  if (new_loc != old_loc)
127  freelocale(new_loc);
128  }
130 };
131 #else
133  std::string old_name;
134 #ifdef _WIN32
135  int old_per_thread;
136 #endif
137  public:
138  Setlocale_To_C_In_Scope() : old_name(setlocale(LC_ALL, nullptr))
139  {
140 #ifdef _WIN32
141  old_per_thread = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
142 #endif
143  auto x = setlocale(LC_ALL, "C");
144  if (!x)
145  old_name.clear();
146  }
148  {
149 #ifdef _WIN32
150  _configthreadlocale(old_per_thread);
151  if (old_per_thread == _ENABLE_PER_THREAD_LOCALE)
152 #endif
153  {
154  if (!old_name.empty())
155  setlocale(LC_ALL, old_name.c_str());
156  }
157  }
159 };
160 #endif
161 
/**
 * Splits a string on any one of a set of single-character separators.
 *
 * Every separator occurrence ends one token, so consecutive separators and
 * separators at either end of the string yield empty tokens. At least one
 * token is always written (an empty one for an empty string).
 *
 * @param s   string to split.
 * @param sep separator set; anything `basic_string::find_first_of` accepts
 *            as its first argument (a character, a C string, a string).
 * @param out output iterator that receives each token as a string.
 * @return the output iterator positioned after the last written token.
 */
template <class CharT, class SepT, class OutIt>
auto split_on_any_of(const std::basic_string<CharT>& s, const SepT& sep,
                     OutIt out) -> OutIt
{
	using Str = std::basic_string<CharT>;
	typename Str::size_type token_start = 0;
	while (true) {
		const auto sep_pos = s.find_first_of(sep, token_start);
		if (sep_pos == Str::npos) {
			// No further separator: the rest of the string is the
			// final token.
			*out++ = s.substr(token_start);
			break;
		}
		*out++ = s.substr(token_start, sep_pos - token_start);
		// Separators are single characters, so skip exactly one.
		token_start = sep_pos + 1;
	}
	return out;
}
190 
/**
 * Splits a string on a single-character separator.
 *
 * Forwards to split_on_any_of() with `sep` as the only separator.
 *
 * @param s   string to split.
 * @param sep the separator character.
 * @param out output iterator that receives each token as a string.
 * @return the output iterator returned by split_on_any_of().
 */
template <class CharT, class OutIt>
auto split(const std::basic_string<CharT>& s, CharT sep, OutIt out) -> OutIt
{
	return split_on_any_of(s, sep, out);
}
207 
/**
 * Removes from `s`, in place, every character that occurs in `erase_chars`.
 *
 * @param s           string to modify.
 * @param erase_chars set of characters to remove; when empty, `s` is left
 *                    unchanged.
 * @return reference to `s`.
 */
template <class CharT>
auto& erase_chars(std::basic_string<CharT>& s,
                  const std::basic_string<CharT>& erase_chars)
{
	if (erase_chars.empty())
		return s;
	const auto is_erasable = [&erase_chars](CharT c) {
		return erase_chars.find(c) != erase_chars.npos;
	};
	// Qualified std::remove_if and member begin()/end(): this must not
	// depend on argument-dependent lookup through the iterator type.
	s.erase(std::remove_if(s.begin(), s.end(), is_erasable), s.end());
	return s;
}
221 
/**
 * Replaces, in place, every occurrence of the character `from` in `s`
 * with the character `to`.
 *
 * @param s    string to modify.
 * @param from character to look for.
 * @param to   character written in its place.
 * @return reference to `s`.
 */
template <class CharT>
auto& replace_char(std::basic_string<CharT>& s, CharT from, CharT to)
{
	for (auto& ch : s) {
		if (ch == from)
			ch = to;
	}
	return s;
}
230 
/**
 * Tests whether a word is a number.
 *
 * Accepted form: an optional leading '-', then one or more groups of ASCII
 * digits '0'..'9', adjacent groups separated by exactly one of '.', ','
 * or '-'. The word must end with a digit.
 *
 * @param s word to test.
 * @return true if `s` has the form above; false otherwise, including for
 *         the empty string.
 */
template <class CharT>
auto is_number(const std::basic_string<CharT>& s) -> bool
{
	if (s.empty())
		return false;

	const auto is_digit = [](CharT c) { return '0' <= c && c <= '9'; };
	const auto is_separator = [](CharT c) {
		return c == '.' || c == ',' || c == '-';
	};

	// Skip the optional sign.
	typename std::basic_string<CharT>::size_type pos = 0;
	if (s[0] == '-')
		pos = 1;

	// True while the previously examined character was a digit. A
	// separator is only legal right after a digit, and the word is a
	// number only if it ends inside a digit group.
	bool after_digit = false;
	for (; pos != s.size(); ++pos) {
		const auto c = s[pos];
		if (is_digit(c))
			after_digit = true;
		else if (after_digit && is_separator(c))
			after_digit = false;
		else
			return false;
	}
	return after_digit;
}
263 
264 auto count_appereances_of(const std::wstring& haystack,
265  const std::wstring& needles) -> size_t;
266 
267 } // namespace nuspell
268 #endif // NUSPELL_UTILS_HXX
Definition: utils.hxx:84
auto has_uppercase_at_compound_word_boundary(const std::wstring &word, size_t i) -> bool
Check if word[i] or word[i-1] are uppercase.
Definition: utils.cxx:447
auto classify_casing(const std::wstring &s) -> Casing
Determines casing (capitalization) type for a word.
Definition: utils.cxx:404
Data structures, private header.
Library main namespace.
Definition: aff_data.cxx:67
Casing
Casing type enum, ignoring neutral case characters.
Definition: structures.hxx:432
Definition: utils.hxx:132
auto split(const std::basic_string< CharT > &s, CharT sep, OutIt out)
Splits string on single char separator.
Definition: utils.hxx:203
auto split_on_any_of(const std::basic_string< CharT > &s, const SepT &sep, OutIt out)
Splits string on set of single char separators.
Definition: utils.hxx:174
auto is_number(const std::basic_string< CharT > &s) -> bool
Tests if word is a number.
Definition: utils.hxx:239