Nuspell
spellchecker
locale_utils.hxx
Go to the documentation of this file.
1 /* Copyright 2016-2018 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
24 #ifndef NUSPELL_LOCALE_UTILS_HXX
25 #define NUSPELL_LOCALE_UTILS_HXX
26 
27 #include <locale>
28 #include <string>
29 
30 #include <boost/container/small_vector.hpp>
31 #include <unicode/locid.h>
32 
33 struct UConverter; // unicode/ucnv.h
34 
35 namespace nuspell {
36 
/// Declaration only; no definition is visible in this header.
/// Takes the string by const reference and returns a bool.
/// NOTE(review): by its name this reports whether `s` is well-formed UTF-8 -
/// confirm against the definition.
37 auto validate_utf8(const std::string& s) -> bool;
38 
/// wide_to_utf8 overload set (declarations only; defined outside this header).
/// The three overloads differ only in where the result goes:
///  - into a caller-supplied std::string `out`,
///  - returned as a new std::string,
///  - into a caller-supplied boost::container::small_vector_base<char> `out`.
/// None of them returns a status value (void or the string itself).
/// NOTE(review): presumably converts wchar_t text to UTF-8 bytes; how invalid
/// input is handled cannot be seen from here - confirm against the definition.
39 auto wide_to_utf8(const std::wstring& in, std::string& out) -> void;
40 auto wide_to_utf8(const std::wstring& in) -> std::string;
41 auto wide_to_utf8(const std::wstring& in,
42  boost::container::small_vector_base<char>& out) -> void;
43 
/// utf8_to_wide overload set (declarations only; defined outside this header).
/// The out-parameter overload returns bool; the other returns the std::wstring
/// directly and therefore has no separate status channel.
/// NOTE(review): the bool is presumably a "conversion succeeded" flag - confirm
/// against the definition.
44 auto utf8_to_wide(const std::string& in, std::wstring& out) -> bool;
45 auto utf8_to_wide(const std::string& in) -> std::wstring;
46 
/// utf8_to_16 overload set (declarations only; defined outside this header).
/// One overload returns a std::u16string, the other fills `out` and returns
/// bool.
/// NOTE(review): presumably UTF-8 to UTF-16 conversion with the bool signalling
/// success - confirm against the definition.
47 auto utf8_to_16(const std::string& in) -> std::u16string;
48 auto utf8_to_16(const std::string& in, std::u16string& out) -> bool;
49 
/// ASCII predicates (declarations only; defined outside this header).
/// is_ascii takes a single char, is_all_ascii takes a whole string; both
/// return bool.
/// NOTE(review): is_all_ascii presumably holds when every char of `s` passes
/// is_ascii; the result for an empty string is not visible here - confirm.
50 auto is_ascii(char c) -> bool;
51 auto is_all_ascii(const std::string& s) -> bool;
52 
/// latin1_to_ucs2 overload set (declarations only; defined outside this header).
/// One overload returns a new std::u16string, the other writes into `out`.
/// Neither returns a status value.
/// NOTE(review): presumably widens each Latin-1 byte to one 16-bit code unit -
/// confirm against the definition.
53 auto latin1_to_ucs2(const std::string& s) -> std::u16string;
54 auto latin1_to_ucs2(const std::string& s, std::u16string& out) -> void;
55 
/// Declaration only; defined outside this header. Takes a UTF-16 code-unit
/// string and returns bool.
/// NOTE(review): presumably true when `s` holds only Basic Multilingual Plane
/// characters (no surrogate code units) - confirm against the definition.
56 auto is_all_bmp(const std::u16string& s) -> bool;
57 
/// Locale-driven narrow/wide conversions (declarations only; defined outside
/// this header).
/// Each direction has an out-parameter overload returning bool and a
/// value-returning overload.
/// Note the parameter order differs between the two out-parameter overloads:
/// to_wide is (in, inloc, out) while to_narrow is (in, out, outloc).
/// NOTE(review): presumably the conversion uses the given std::locale's
/// character encoding and the bool signals success - confirm against the
/// definition.
58 auto to_wide(const std::string& in, const std::locale& inloc, std::wstring& out)
59  -> bool;
60 auto to_wide(const std::string& in, const std::locale& inloc) -> std::wstring;
61 auto to_narrow(const std::wstring& in, std::string& out,
62  const std::locale& outloc) -> bool;
63 auto to_narrow(const std::wstring& in, const std::locale& outloc)
64  -> std::string;
65 
/// Declaration only; defined outside this header. Takes a std::locale and
/// returns bool.
/// NOTE(review): the name suggests a "known to be UTF-8" check, so false may
/// mean "unknown" rather than "not UTF-8" - confirm against the definition.
66 auto is_locale_known_utf8(const std::locale& loc) -> bool;
67 
/// Conversions between std::wstring and icu::UnicodeString (declarations only;
/// defined outside this header).
/// Both write into a caller-supplied `out` and return bool.
/// NOTE(review): the bool is presumably a success flag - confirm against the
/// definition.
68 auto wide_to_icu(const std::wstring& in, icu::UnicodeString& out) -> bool;
69 auto icu_to_wide(const icu::UnicodeString& in, std::wstring& out) -> bool;
70 
/// Case-mapping helpers (declarations only; defined outside this header).
/// Each takes the input by const reference plus an icu::Locale and returns a
/// new std::wstring; the input is not modified.
/// NOTE(review): presumably locale-sensitive upper/title/lower casing via ICU -
/// confirm against the definition.
71 auto to_upper(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
72 auto to_title(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
73 auto to_lower(const std::wstring& in, const icu::Locale& loc) -> std::wstring;
74 
/// Casing type enum, ignoring neutral case characters.
/// It is the return type of classify_casing().
78 enum class Casing {
/// All lower case or neutral case.
79  SMALL ,
/// Start upper case, rest lower case.
80  INIT_CAPITAL ,
/// All upper case.
81  ALL_CAPITAL ,
/// Camel case: starts lower case.
82  CAMEL ,
/// Pascal case: starts upper case.
83  PASCAL
84 };
85 
/// Determines casing (capitalization) type for a word.
/// Declaration only; defined outside this header. Returns one of the Casing
/// enumerators.
86 auto classify_casing(const std::wstring& s) -> Casing;
87 
/// Check if word[i] or word[i-1] are uppercase.
/// Declaration only; defined outside this header.
/// NOTE(review): since word[i-1] is inspected, callers presumably must pass
/// 0 < i < word.size() - confirm against the definition.
88 auto has_uppercase_at_compound_word_boundary(const std::wstring& word, size_t i)
89  -> bool;
90 
/// Holds the name of a character encoding as a string.
///
/// Every constructor and assignment that stores a new name calls
/// normalize_name() immediately afterwards. normalize_name() is only declared
/// in this class and defined elsewhere.
/// NOTE(review): it presumably canonicalizes the spelling so that is_utf8()'s
/// exact comparison with "UTF-8" works - confirm against the definition.
class Encoding {
    std::string name;

    auto normalize_name() -> void;

  public:
    /// Two-valued classification of the stored name. The enumerators are
    /// deliberately equal to false/true.
    enum Enc_Type { SINGLEBYTE = false, UTF8 = true };

    /// Creates an Encoding with an empty name; normalize_name() is not called.
    Encoding() = default;

    /// Stores a copy of `e`, then normalizes it.
    Encoding(const std::string& e) : name(e) { normalize_name(); }

    /// Takes over the contents of `e`, then normalizes the stored name.
    Encoding(std::string&& e) : name(std::move(e)) { normalize_name(); }

    /// Stores a copy of the C string `e`, then normalizes it.
    /// `e` must not be null (std::string's constructor requires that).
    Encoding(const char* e) : name(e) { normalize_name(); }

    /// Replaces the stored name with a copy of `e` and normalizes it.
    auto& operator=(const std::string& e)
    {
        name = e;
        normalize_name();
        return *this;
    }

    /// Replaces the stored name by moving from `e` and normalizes it.
    auto& operator=(std::string&& e)
    {
        name = std::move(e);
        normalize_name();
        return *this;
    }

    /// Replaces the stored name with a copy of the C string `e` and
    /// normalizes it.
    auto& operator=(const char* e)
    {
        name = e;
        normalize_name();
        return *this;
    }

    /// True when no name is stored.
    auto empty() const { return name.empty(); }

    /// Implicit read-only access to the stored name.
    operator const std::string&() const { return name; }

    /// Read-only access to the stored name.
    auto& value() const { return name; }

    /// True when the stored name is exactly "UTF-8".
    auto is_utf8() const { return name == "UTF-8"; }

    /// Returns the stored name, or "ISO8859-1" when no name is stored.
    /// Declared const because it only reads `name`; this lets it be called on
    /// const objects, like the other accessors.
    auto value_or_default() const -> std::string
    {
        if (name.empty())
            return "ISO8859-1";
        return name;
    }

    /// Implicit conversion: UTF8 when is_utf8() holds, SINGLEBYTE otherwise.
    operator Enc_Type() const { return is_utf8() ? UTF8 : SINGLEBYTE; }
};
134 
/// Converter handle. UConverter is only forward-declared at the top of this
/// file (noted there as coming from unicode/ucnv.h). Starts out null.
/// NOTE(review): the class header line is missing from this listing; this
/// member is presumably private - confirm.
136  UConverter* cnv = nullptr;
137 
138  public:
/// Leaves cnv null.
139  Encoding_Converter() = default;
/// Declared only; defined outside this header.
/// NOTE(review): presumably opens a converter for the encoding named `enc` -
/// confirm against the definition.
140  Encoding_Converter(const char* enc);
/// Delegates to the const char* constructor using enc.c_str().
141  Encoding_Converter(const std::string& enc)
142  : Encoding_Converter(enc.c_str())
143  {
144  }
148  {
149  cnv = other.cnv;
150  cnv = nullptr;
151  };
/// Copy assignment. Declared only; defined outside this header.
/// NOTE(review): how the converter handle is duplicated is not visible here -
/// confirm against the definition.
152  auto operator=(const Encoding_Converter& other) -> Encoding_Converter&;
153  auto operator=(Encoding_Converter&& other) -> Encoding_Converter&
154  {
155  std::swap(cnv, other.cnv);
156  return *this;
157  }
/// to_wide overload set (declarations only; defined outside this header).
/// One overload fills `out` and returns bool, the other returns the
/// std::wstring directly. Neither is const-qualified.
/// NOTE(review): presumably decodes `in` with the converter held in cnv, the
/// bool signalling success - confirm against the definition.
158  auto to_wide(const std::string& in, std::wstring& out) -> bool;
159  auto to_wide(const std::string& in) -> std::wstring;
160 };
161 } // namespace nuspell
162 #endif // NUSPELL_LOCALE_UTILS_HXX
Definition: locale_utils.hxx:135
Definition: locale_utils.hxx:91
auto has_uppercase_at_compound_word_boundary(const std::wstring &word, size_t i) -> bool
Check if word[i] or word[i-1] are uppercase.
Definition: locale_utils.cxx:456
Casing
Casing type enum, ignoring neutral case characters.
Definition: locale_utils.hxx:78
start upper case, rest lower case, e.g. "Initcap"
all lower case or neutral case, e.g. "lowercase" or "123"
auto classify_casing(const std::wstring &s) -> Casing
Determines casing (capitalization) type for a word.
Definition: locale_utils.cxx:413
camel case, start lower case, e.g. "camelCase"
pascal case, start upper case, e.g. "PascalCase"
UTF-8 flag, e.g.
Library main namespace.
Definition: aff_data.cxx:78
all upper case, e.g. "UPPERCASE"