Nuspell
spellchecker
unicode.hxx
1 /* Copyright 2021 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 #ifndef NUSPELL_UNICODE_HXX
19 #define NUSPELL_UNICODE_HXX
20 #include <string>
21 #include <string_view>
22 #include <unicode/utf16.h>
23 #include <unicode/utf8.h>
24 
25 namespace nuspell {
26 inline namespace v5 {
27 
28 // UTF-8, work on malformed
29 
30 inline constexpr auto u8_max_cp_length = U8_MAX_LENGTH;
31 
// Tells whether a code point value denotes a decoding error.
// Returns true exactly when cp is negative.
auto inline u8_is_cp_error(int32_t cp) -> bool
{
	const bool is_negative = cp < 0;
	return is_negative;
}
33 
// Reads the code point that begins at index i of str via ICU's U8_NEXT,
// stores it in cp and moves i past it. The input may be malformed.
//   str - range of UTF-8 code units
//   i   - in/out index into str
//   cp  - out: value produced by U8_NEXT
template <class Range>
auto u8_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// With ICU 60 and older the macro is given a raw pointer and int32_t
	// index/length variables; the index is copied back afterwards.
	auto raw = data(str);
	int32_t pos = i;
	int32_t length = size(str);
	U8_NEXT(raw, pos, length, cp);
	i = pos;
#else
	auto length = size(str);
	U8_NEXT(str, i, length, cp);
#endif
}
49 
// Moves index i forward by one code point in str using ICU's U8_FWD_1,
// without producing the code point value. The input may be malformed.
template <class Range>
auto u8_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	auto length = size(str);
	U8_FWD_1(str, i, length);
}
57 
// Steps index i backwards over one code point of str via ICU's U8_PREV
// (lower bound 0) and stores the value read in cp. The input may be
// malformed.
template <class Range>
auto u8_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::data;
	auto raw = data(str);
	// The macro operates on an int32_t copy of the index, written back after.
	int32_t pos = i;
	U8_PREV(raw, 0, pos, cp);
	i = pos;
}
67 
// Steps index i backwards over one code point of str via ICU's U8_BACK_1
// (lower bound 0), without producing the code point value. The input may be
// malformed.
template <class Range>
auto u8_reverse_index(const Range& str, size_t& i) -> void
{
	using std::data;
	auto raw = data(str);
	// The macro operates on an int32_t copy of the index, written back after.
	int32_t pos = i;
	U8_BACK_1(raw, 0, pos);
	i = pos;
}
77 
// Encodes cp as UTF-8 into buf starting at index i via ICU's U8_APPEND and
// advances i past the written bytes.
//   buf   - destination range; its size is used as the capacity
//   i     - in/out write index
//   cp    - code point to encode
//   error - passed to U8_APPEND as its isError flag. This function never
//           assigns false to it itself, so callers should initialise it.
template <class Range>
auto u8_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// NOTE(review): ptr is computed but the macro below receives buf, unlike
	// u8_advance_cp which hands the raw pointer to ICU - confirm the intent.
	auto ptr = data(buf);
	int32_t pos = i;
	int32_t capacity = size(buf);
	U8_APPEND(buf, pos, capacity, cp, error);
	i = pos;
#else
	auto capacity = size(buf);
	U8_APPEND(buf, i, capacity, cp, error);
#endif
}
94 
95 // UTF-8, valid
96 
// Reads the code point starting at index i of str into cp and moves i past
// it, using ICU's unchecked U8_NEXT_UNSAFE. Intended for valid UTF-8 only.
template <class Range>
auto valid_u8_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_NEXT_UNSAFE(str, i, cp);
}
102 
// Moves index i forward by one code point in str using ICU's unchecked
// U8_FWD_1_UNSAFE. Intended for valid UTF-8 only.
template <class Range>
auto valid_u8_advance_index(const Range& str, size_t& i) -> void
{
	U8_FWD_1_UNSAFE(str, i);
}
108 
// Steps index i backwards over one code point of str and stores it in cp,
// using ICU's unchecked U8_PREV_UNSAFE. Intended for valid UTF-8 only.
template <class Range>
auto valid_u8_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_PREV_UNSAFE(str, i, cp);
}
114 
// Steps index i backwards over one code point of str using ICU's unchecked
// U8_BACK_1_UNSAFE. Intended for valid UTF-8 only.
template <class Range>
auto valid_u8_reverse_index(const Range& str, size_t& i) -> void
{
	U8_BACK_1_UNSAFE(str, i);
}
120 
// Encodes cp as UTF-8 into buf at index i and advances i, using ICU's
// unchecked U8_APPEND_UNSAFE. No capacity is passed to the macro, so the
// caller must provide enough room in buf.
template <class Range>
auto valid_u8_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U8_APPEND_UNSAFE(buf, i, cp);
}
126 
127 // UTF-16, work on malformed
128 
129 inline constexpr auto u16_max_cp_length = U16_MAX_LENGTH;
130 
131 auto inline u16_is_cp_error(int32_t cp) -> bool { return U_IS_SURROGATE(cp); }
132 
// Reads the code point that begins at index i of str via ICU's U16_NEXT,
// stores it in cp and moves i past it. The input may be malformed.
template <class Range>
auto u16_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size;
	auto length = size(str);
	U16_NEXT(str, i, length, cp);
}
140 
// Moves index i forward by one code point in str using ICU's U16_FWD_1,
// without producing the code point value. The input may be malformed.
template <class Range>
auto u16_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	auto length = size(str);
	U16_FWD_1(str, i, length);
}
148 
// Steps index i backwards over one code point of str via ICU's U16_PREV
// (lower bound 0) and stores the value read in cp. The input may be
// malformed.
template <class Range>
auto u16_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	U16_PREV(str, 0, i, cp);
}
154 
// Steps index i backwards over one code point of str via ICU's U16_BACK_1
// (lower bound 0), without producing the code point value. The input may be
// malformed.
template <class Range>
auto u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1(str, 0, i);
}
160 
// Encodes cp as UTF-16 into buf starting at index i via ICU's U16_APPEND and
// advances i past the written code units.
//   buf   - destination range; its size is used as the capacity
//   i     - in/out write index
//   cp    - code point to encode
//   error - passed to U16_APPEND as its isError flag. This function never
//           assigns false to it itself, so callers should initialise it.
template <class Range>
auto u16_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size;
	auto capacity = size(buf);
	U16_APPEND(buf, i, capacity, cp, error);
}
169 
170 // UTF-16, valid
171 
// Reads the code point starting at index i of str into cp and moves i past
// it, using ICU's unchecked U16_NEXT_UNSAFE. Intended for valid UTF-16 only.
template <class Range>
auto valid_u16_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_NEXT_UNSAFE(str, i, cp);
}
177 
// Moves index i forward by one code point in str using ICU's unchecked
// U16_FWD_1_UNSAFE. Intended for valid UTF-16 only.
template <class Range>
auto valid_u16_advance_index(const Range& str, size_t& i) -> void
{
	U16_FWD_1_UNSAFE(str, i);
}
183 
// Steps index i backwards over one code point of str and stores it in cp,
// using ICU's unchecked U16_PREV_UNSAFE. Intended for valid UTF-16 only.
template <class Range>
auto valid_u16_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_PREV_UNSAFE(str, i, cp);
}
189 
// Steps index i backwards over one code point of str using ICU's unchecked
// U16_BACK_1_UNSAFE. Intended for valid UTF-16 only.
template <class Range>
auto valid_u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1_UNSAFE(str, i);
}
195 
// Encodes cp as UTF-16 into buf at index i and advances i, using ICU's
// unchecked U16_APPEND_UNSAFE. No capacity is passed to the macro, so the
// caller must provide enough room in buf.
template <class Range>
auto valid_u16_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U16_APPEND_UNSAFE(buf, i, cp);
}
201 
// Higher-level functions
203 
// Half-open index pair [begin_i, end_i) locating one encoded code point
// inside a UTF-8 string. end_i defaults to begin_i, so a value-initialised or
// single-index object describes an empty span.
struct U8_CP_Pos {
	size_t begin_i{0};
	size_t end_i{begin_i};
};
208 
210  char d[u8_max_cp_length];
211  int sz;
212 
213  public:
214  explicit U8_Encoded_CP(std::string_view str, U8_CP_Pos pos)
215  : sz(pos.end_i - pos.begin_i)
216  {
217  auto i = sz;
218  auto j = pos.end_i;
219  auto max_len = 4;
220  do {
221  d[--i] = str[--j];
222  } while (i && --max_len);
223  }
224  U8_Encoded_CP(char32_t cp)
225  {
226  size_t z = 0;
227  valid_u8_write_cp_and_advance(d, z, cp);
228  sz = z;
229  }
230  auto size() const noexcept -> size_t { return sz; }
231  auto data() const noexcept -> const char* { return d; }
232  operator std::string_view() const noexcept
233  {
234  return std::string_view(data(), size());
235  }
236  auto copy_to(std::string& str, size_t j) const
237  {
238  auto i = sz;
239  j += sz;
240  auto max_len = 4;
241  do {
242  str[--j] = d[--i];
243  } while (i && --max_len);
244  }
245 };
246 
247 auto inline u8_swap_adjacent_cp(std::string& str, size_t i1, size_t i2,
248  size_t i3) -> size_t
249 {
250  auto cp1 = U8_Encoded_CP(str, {i1, i2});
251  auto cp2 = U8_Encoded_CP(str, {i2, i3});
252  auto new_i2 = i1 + std::size(cp2);
253  cp1.copy_to(str, new_i2);
254  cp2.copy_to(str, i1);
255  return new_i2;
256 }
257 
258 auto inline u8_swap_cp(std::string& str, U8_CP_Pos pos1, U8_CP_Pos pos2)
259  -> std::pair<size_t, size_t>
260 {
261  using std::size;
262  auto cp1 = U8_Encoded_CP(str, pos1);
263  auto cp2 = U8_Encoded_CP(str, pos2);
264  auto new_p1_end_i = pos1.begin_i + size(cp2);
265  auto new_p2_begin_i = pos2.end_i - size(cp1);
266  std::char_traits<char>::move(&str[new_p1_end_i], &str[pos1.end_i],
267  pos2.begin_i - pos1.end_i);
268  cp2.copy_to(str, pos1.begin_i);
269  cp1.copy_to(str, new_p2_begin_i);
270  return {new_p1_end_i, new_p2_begin_i};
271 }
272 
// Below are functions without out-parameters
274 
// UTF-8, can be malformed, no out-parameters
276 
// Result of a forward decoding step: the index just past the code point and
// the value read. Returned by u8_next_cp.
struct Idx_And_Next_CP {
	size_t end_i;
	int32_t cp;
};
281 
// Result of a backward decoding step: the index where the code point begins
// and the value read. Returned by u8_prev_cp.
struct Idx_And_Prev_CP {
	size_t begin_i;
	int32_t cp;
};
286 
// Result of an encoding step: the index just past the written bytes and
// whether the write reported an error. Returned by u8_write_cp.
struct Write_CP_Idx_and_Error {
	size_t end_i;
	bool error;
};
291 
292 template <class Range>
293 [[nodiscard]] auto u8_next_cp(const Range& str, size_t i) -> Idx_And_Next_CP
294 {
295  int32_t cp;
296  u8_advance_cp(str, i, cp);
297  return {i, cp};
298 }
299 
// Value-returning form of u8_advance_index: returns the index that follows
// the code point starting at i. The caller's index is not modified.
template <class Range>
[[nodiscard]] auto u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	u8_advance_index(str, next);
	return next;
}
306 
307 template <class Range>
308 [[nodiscard]] auto u8_prev_cp(const Range& str, size_t i) -> Idx_And_Prev_CP
309 {
310  int32_t cp;
311  u8_reverse_cp(str, i, cp);
312  return {i, cp};
313 }
314 
// Value-returning form of u8_reverse_index: returns the begin index of the
// code point that ends at i. The caller's index is not modified.
template <class Range>
[[nodiscard]] auto u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	u8_reverse_index(str, prev);
	return prev;
}
321 
322 template <class Range>
323 [[nodiscard]] auto u8_write_cp(Range& buf, size_t i, int32_t cp)
324  -> Write_CP_Idx_and_Error
325 {
326  bool err;
327  u8_write_cp_and_advance(buf, i, cp, err);
328  return {i, err};
329 }
330 
// UTF-8, valid, no out-parameters
332 
// Result of a forward decoding step on valid UTF-8: the index just past the
// code point and the code point itself. Returned by valid_u8_next_cp.
struct Idx_And_Next_CP_Valid {
	size_t end_i;
	char32_t cp;
};
337 
// Result of a backward decoding step on valid UTF-8: the index where the
// code point begins and the code point itself. Returned by valid_u8_prev_cp.
struct Idx_And_Prev_CP_Valid {
	size_t begin_i;
	char32_t cp;
};
342 
343 template <class Range>
344 [[nodiscard]] auto valid_u8_next_cp(const Range& str, size_t i)
346 {
347  char32_t cp;
348  valid_u8_advance_cp(str, i, cp);
349  return {i, cp};
350 }
351 
// Value-returning form of valid_u8_advance_index: returns the index that
// follows the code point starting at i. Intended for valid UTF-8 only.
template <class Range>
[[nodiscard]] auto valid_u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	valid_u8_advance_index(str, next);
	return next;
}
358 
359 template <class Range>
360 [[nodiscard]] auto valid_u8_prev_cp(const Range& str, size_t i)
361  -> Idx_And_Prev_CP_Valid
362 {
363  char32_t cp;
364  valid_u8_reverse_cp(str, i, cp);
365  return {i, cp};
366 }
367 
// Value-returning form of valid_u8_reverse_index: returns the begin index of
// the code point that ends at i. Intended for valid UTF-8 only.
template <class Range>
[[nodiscard]] auto valid_u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	valid_u8_reverse_index(str, prev);
	return prev;
}
374 
// Value-returning form of valid_u8_write_cp_and_advance: encodes cp into buf
// at index i and returns the index past the written bytes.
// cp is declared int32_t here and converted implicitly to the char32_t the
// underlying writer takes.
template <class Range>
[[nodiscard]] auto valid_u8_write_cp(Range& buf, size_t i, int32_t cp) -> size_t
{
	auto write_end = i;
	valid_u8_write_cp_and_advance(buf, write_end, cp);
	return write_end;
}
381 } // namespace v5
382 } // namespace nuspell
383 #endif // NUSPELL_UNICODE_HXX
nuspell
Library main namespace.
Definition: aff_data.cxx:31
nuspell::v5::Idx_And_Next_CP_Valid
Definition: unicode.hxx:333
nuspell::v5::Write_CP_Idx_and_Error
Definition: unicode.hxx:287
nuspell::v5::U8_CP_Pos
Definition: unicode.hxx:204
nuspell::v5::U8_Encoded_CP
Definition: unicode.hxx:209
nuspell::v5::Idx_And_Prev_CP
Definition: unicode.hxx:282
nuspell::v5::Idx_And_Prev_CP_Valid
Definition: unicode.hxx:338
nuspell::v5::Idx_And_Next_CP
Definition: unicode.hxx:277