Nuspell
spell checker
unicode.hxx
1 /* Copyright 2021-2023 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 #ifndef NUSPELL_UNICODE_HXX
19 #define NUSPELL_UNICODE_HXX
20 #include "defines.hxx"
21 
22 #include <string>
23 #include <string_view>
24 #include <unicode/utf16.h>
25 #include <unicode/utf8.h>
26 
27 namespace nuspell {
28 NUSPELL_BEGIN_INLINE_NAMESPACE
29 
30 // UTF-8, work on malformed
31 
32 inline constexpr auto u8_max_cp_length = U8_MAX_LENGTH;
33 
/// Tells whether `cp` is an error marker, i.e. a negative value.
///
/// NOTE: ICU's checked UTF-8 macros (e.g. `U8_NEXT`) report ill-formed input
/// by storing a negative value in the code point.
inline auto u8_is_cp_error(int32_t cp) -> bool
{
	const bool is_negative = cp < 0;
	return is_negative;
}
35 
/// Decodes the code point starting at index `i` of the (possibly malformed)
/// UTF-8 range `str` with ICU's `U8_NEXT`, stores it in `cp` and moves `i`
/// past the consumed code units.
template <class Range>
auto u8_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// For ICU 60 and older the macro is fed a raw pointer plus 32-bit
	// index and length variables; the index is copied back afterwards.
	using std::data;
	const auto units = data(str);
	int32_t pos = i;
	const int32_t count = size(str);
	U8_NEXT(units, pos, count, cp);
	i = pos;
#else
	const auto count = size(str);
	U8_NEXT(str, i, count, cp);
#endif
}
51 
/// Moves `i` forward over one code point of the (possibly malformed) UTF-8
/// range `str` using ICU's `U8_FWD_1`, without decoding the value.
template <class Range>
auto u8_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	const auto count = size(str);
	U8_FWD_1(str, i, count);
}
59 
/// Decodes the code point that ends just before index `i` of the (possibly
/// malformed) UTF-8 range `str` with ICU's `U8_PREV`, stores it in `cp` and
/// moves `i` back to where that code point starts.
template <class Range>
auto u8_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::data;
	const auto units = data(str);
	// The macro works on a 32-bit index; copy in and out of `i`.
	int32_t pos = i;
	U8_PREV(units, 0, pos, cp);
	i = pos;
}
69 
/// Moves `i` back over one code point of the (possibly malformed) UTF-8
/// range `str` using ICU's `U8_BACK_1`, without decoding the value.
template <class Range>
auto u8_reverse_index(const Range& str, size_t& i) -> void
{
	using std::data;
	const auto units = data(str);
	// The macro works on a 32-bit index; copy in and out of `i`.
	int32_t pos = i;
	U8_BACK_1(units, 0, pos);
	i = pos;
}
79 
/// Encodes `cp` as UTF-8 into `buf` starting at index `i` using ICU's
/// `U8_APPEND`, and moves `i` past the written code units.
///
/// @param buf   destination range; its size is used as the capacity.
/// @param i     in: write position, out: position after the written bytes.
/// @param cp    code point to encode.
/// @param error per ICU's documentation of `U8_APPEND`, set to true when the
///              code point cannot be written and left unmodified otherwise,
///              so callers should initialise it before the call.
template <class Range>
auto u8_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// For ICU 60 and older hand the macro a raw pointer and 32-bit
	// index/length variables, the same way u8_advance_cp() does.
	auto ptr = data(buf);
	int32_t idx = i;
	int32_t len = size(buf);
	U8_APPEND(ptr, idx, len, cp, error);
	i = idx;
#else
	auto len = size(buf);
	U8_APPEND(buf, i, len, cp, error);
#endif
}
96 
97 // UTF-8, valid
98 
/// Decodes the code point starting at index `i` of `str` with ICU's
/// unchecked `U8_NEXT_UNSAFE`, stores it in `cp` and advances `i`.
///
/// ICU documents the `_UNSAFE` macros as assuming well-formed UTF-8.
template <class Range>
auto valid_u8_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	auto& pos = i;
	U8_NEXT_UNSAFE(str, pos, cp);
}
104 
/// Moves `i` forward over one code point of `str` with ICU's unchecked
/// `U8_FWD_1_UNSAFE` (ICU documents it as assuming well-formed UTF-8).
template <class Range>
auto valid_u8_advance_index(const Range& str, size_t& i) -> void
{
	auto& pos = i;
	U8_FWD_1_UNSAFE(str, pos);
}
110 
/// Decodes the code point that ends just before index `i` of `str` with
/// ICU's unchecked `U8_PREV_UNSAFE`, stores it in `cp` and moves `i` back.
///
/// ICU documents the `_UNSAFE` macros as assuming well-formed UTF-8.
template <class Range>
auto valid_u8_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	auto& pos = i;
	U8_PREV_UNSAFE(str, pos, cp);
}
116 
/// Moves `i` back over one code point of `str` with ICU's unchecked
/// `U8_BACK_1_UNSAFE` (ICU documents it as assuming well-formed UTF-8).
template <class Range>
auto valid_u8_reverse_index(const Range& str, size_t& i) -> void
{
	auto& pos = i;
	U8_BACK_1_UNSAFE(str, pos);
}
122 
/// Encodes `cp` as UTF-8 into `buf` at index `i` with ICU's unchecked
/// `U8_APPEND_UNSAFE` and advances `i` past the written code units.
///
/// No capacity is passed to the macro, so the caller must provide enough room.
template <class Range>
auto valid_u8_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	auto& pos = i;
	U8_APPEND_UNSAFE(buf, pos, cp);
}
128 
129 // UTF-16, work on malformed
130 
131 inline constexpr auto u16_max_cp_length = U16_MAX_LENGTH;
132 
133 auto inline u16_is_cp_error(int32_t cp) -> bool { return U_IS_SURROGATE(cp); }
134 
/// Decodes the code point starting at index `i` of the (possibly malformed)
/// UTF-16 range `str` with ICU's `U16_NEXT`, stores it in `cp` and moves `i`
/// past the consumed code units.
template <class Range>
auto u16_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size;
	const auto count = size(str);
	U16_NEXT(str, i, count, cp);
}
142 
/// Moves `i` forward over one code point of the (possibly malformed) UTF-16
/// range `str` using ICU's `U16_FWD_1`, without decoding the value.
template <class Range>
auto u16_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	const auto count = size(str);
	U16_FWD_1(str, i, count);
}
150 
/// Decodes the code point that ends just before index `i` of the (possibly
/// malformed) UTF-16 range `str` with ICU's `U16_PREV` (start index 0),
/// stores it in `cp` and moves `i` back to where it starts.
template <class Range>
auto u16_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	auto& pos = i;
	U16_PREV(str, 0, pos, cp);
}
156 
/// Moves `i` back over one code point of the (possibly malformed) UTF-16
/// range `str` using ICU's `U16_BACK_1` (start index 0).
template <class Range>
auto u16_reverse_index(const Range& str, size_t& i) -> void
{
	auto& pos = i;
	U16_BACK_1(str, 0, pos);
}
162 
/// Encodes `cp` as UTF-16 into `buf` starting at index `i` using ICU's
/// `U16_APPEND` (capacity = size of `buf`) and advances `i`.
///
/// `error` is handed to the macro as its error flag; per ICU's documentation
/// it is set to true on failure and otherwise left unmodified.
template <class Range>
auto u16_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size;
	const auto capacity = size(buf);
	U16_APPEND(buf, i, capacity, cp, error);
}
171 
172 // UTF-16, valid
173 
/// Decodes the code point starting at index `i` of `str` with ICU's
/// unchecked `U16_NEXT_UNSAFE`, stores it in `cp` and advances `i`.
///
/// ICU documents the `_UNSAFE` macros as assuming well-formed UTF-16.
template <class Range>
auto valid_u16_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	auto& pos = i;
	U16_NEXT_UNSAFE(str, pos, cp);
}
179 
/// Moves `i` forward over one code point of `str` with ICU's unchecked
/// `U16_FWD_1_UNSAFE` (ICU documents it as assuming well-formed UTF-16).
template <class Range>
auto valid_u16_advance_index(const Range& str, size_t& i) -> void
{
	auto& pos = i;
	U16_FWD_1_UNSAFE(str, pos);
}
185 
/// Decodes the code point that ends just before index `i` of `str` with
/// ICU's unchecked `U16_PREV_UNSAFE`, stores it in `cp` and moves `i` back.
///
/// ICU documents the `_UNSAFE` macros as assuming well-formed UTF-16.
template <class Range>
auto valid_u16_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	auto& pos = i;
	U16_PREV_UNSAFE(str, pos, cp);
}
191 
/// Moves `i` back over one code point of `str` with ICU's unchecked
/// `U16_BACK_1_UNSAFE` (ICU documents it as assuming well-formed UTF-16).
template <class Range>
auto valid_u16_reverse_index(const Range& str, size_t& i) -> void
{
	auto& pos = i;
	U16_BACK_1_UNSAFE(str, pos);
}
197 
/// Encodes `cp` as UTF-16 into `buf` at index `i` with ICU's unchecked
/// `U16_APPEND_UNSAFE` and advances `i` past the written code units.
///
/// No capacity is passed to the macro, so the caller must provide enough room.
template <class Range>
auto valid_u16_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	auto& pos = i;
	U16_APPEND_UNSAFE(buf, pos, cp);
}
203 
204 // higher-level functions
205 
/// Byte positions of one encoded code point inside a UTF-8 string:
/// `begin_i` is the index of its first byte and `end_i` the index one past
/// its last byte. A default-constructed value is the empty range at 0; when
/// only `begin_i` is given, `end_i` defaults to the same index.
struct U8_CP_Pos {
	size_t begin_i{0};
	size_t end_i{begin_i};
};
210 
212  char d[u8_max_cp_length];
213  int sz;
214 
215  public:
216  explicit U8_Encoded_CP(std::string_view str, U8_CP_Pos pos)
217  : sz(pos.end_i - pos.begin_i)
218  {
219  auto i = sz;
220  auto j = pos.end_i;
221  auto max_len = 4;
222  do {
223  d[--i] = str[--j];
224  } while (i && --max_len);
225  }
226  U8_Encoded_CP(char32_t cp)
227  {
228  size_t z = 0;
229  valid_u8_write_cp_and_advance(d, z, cp);
230  sz = z;
231  }
232  auto size() const noexcept -> size_t { return sz; }
233  auto data() const noexcept -> const char* { return d; }
234  operator std::string_view() const noexcept
235  {
236  return std::string_view(data(), size());
237  }
238  auto copy_to(std::string& str, size_t j) const
239  {
240  auto i = sz;
241  j += sz;
242  auto max_len = 4;
243  do {
244  str[--j] = d[--i];
245  } while (i && --max_len);
246  }
247 };
248 
249 auto inline u8_swap_adjacent_cp(std::string& str, size_t i1, size_t i2,
250  size_t i3) -> size_t
251 {
252  auto cp1 = U8_Encoded_CP(str, {i1, i2});
253  auto cp2 = U8_Encoded_CP(str, {i2, i3});
254  auto new_i2 = i1 + std::size(cp2);
255  cp1.copy_to(str, new_i2);
256  cp2.copy_to(str, i1);
257  return new_i2;
258 }
259 
260 auto inline u8_swap_cp(std::string& str, U8_CP_Pos pos1, U8_CP_Pos pos2)
261  -> std::pair<size_t, size_t>
262 {
263  using std::size;
264  auto cp1 = U8_Encoded_CP(str, pos1);
265  auto cp2 = U8_Encoded_CP(str, pos2);
266  auto new_p1_end_i = pos1.begin_i + size(cp2);
267  auto new_p2_begin_i = pos2.end_i - size(cp1);
268  std::char_traits<char>::move(&str[new_p1_end_i], &str[pos1.end_i],
269  pos2.begin_i - pos1.end_i);
270  cp2.copy_to(str, pos1.begin_i);
271  cp1.copy_to(str, new_p2_begin_i);
272  return {new_p1_end_i, new_p2_begin_i};
273 }
274 
275 // below go functions without out-parameters
276 
277 // UTF-8, can be malformed, no out-parameters
278 
280  size_t end_i;
281  int32_t cp;
282 };
283 
285  size_t begin_i;
286  int32_t cp;
287 };
288 
290  size_t end_i;
291  bool error;
292 };
293 
294 template <class Range>
295 [[nodiscard]] auto u8_next_cp(const Range& str, size_t i) -> Idx_And_Next_CP
296 {
297  int32_t cp;
298  u8_advance_cp(str, i, cp);
299  return {i, cp};
300 }
301 
/// Value-returning form of u8_advance_index().
///
/// @return the index obtained by advancing `i` with u8_advance_index().
template <class Range>
[[nodiscard]] auto u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	u8_advance_index(str, next);
	return next;
}
308 
309 template <class Range>
310 [[nodiscard]] auto u8_prev_cp(const Range& str, size_t i) -> Idx_And_Prev_CP
311 {
312  int32_t cp;
313  u8_reverse_cp(str, i, cp);
314  return {i, cp};
315 }
316 
/// Value-returning form of u8_reverse_index().
///
/// @return the index obtained by moving `i` back with u8_reverse_index().
template <class Range>
[[nodiscard]] auto u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	u8_reverse_index(str, prev);
	return prev;
}
323 
324 template <class Range>
325 [[nodiscard]] auto u8_write_cp(Range& buf, size_t i, int32_t cp)
326  -> Write_CP_Idx_and_Error
327 {
328  bool err;
329  u8_write_cp_and_advance(buf, i, cp, err);
330  return {i, err};
331 }
332 
333 // UTF-8, valid, no out-parameters
334 
336  size_t end_i;
337  char32_t cp;
338 };
339 
341  size_t begin_i;
342  char32_t cp;
343 };
344 
345 template <class Range>
346 [[nodiscard]] auto valid_u8_next_cp(const Range& str, size_t i)
348 {
349  char32_t cp;
350  valid_u8_advance_cp(str, i, cp);
351  return {i, cp};
352 }
353 
/// Value-returning form of valid_u8_advance_index().
///
/// @return the index obtained by advancing `i` with
///         valid_u8_advance_index().
template <class Range>
[[nodiscard]] auto valid_u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	valid_u8_advance_index(str, next);
	return next;
}
360 
361 template <class Range>
362 [[nodiscard]] auto valid_u8_prev_cp(const Range& str, size_t i)
363  -> Idx_And_Prev_CP_Valid
364 {
365  char32_t cp;
366  valid_u8_reverse_cp(str, i, cp);
367  return {i, cp};
368 }
369 
/// Value-returning form of valid_u8_reverse_index().
///
/// @return the index obtained by moving `i` back with
///         valid_u8_reverse_index().
template <class Range>
[[nodiscard]] auto valid_u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	valid_u8_reverse_index(str, prev);
	return prev;
}
376 
/// Value-returning form of valid_u8_write_cp_and_advance(): encodes `cp` into
/// `buf` at index `i`.
///
/// `cp` arrives as `int32_t` and is converted to `char32_t` for the callee.
///
/// @return the index after the written bytes.
template <class Range>
[[nodiscard]] auto valid_u8_write_cp(Range& buf, size_t i, int32_t cp) -> size_t
{
	const auto code_point = static_cast<char32_t>(cp);
	valid_u8_write_cp_and_advance(buf, i, code_point);
	return i;
}
383 NUSPELL_END_INLINE_NAMESPACE
384 } // namespace nuspell
385 #endif // NUSPELL_UNICODE_HXX
Definition: unicode.hxx:211
Library main namespace.
Definition: aff_data.cxx:33
Definition: unicode.hxx:335
Definition: unicode.hxx:279
Definition: unicode.hxx:340
Definition: unicode.hxx:284
Definition: unicode.hxx:206
Definition: unicode.hxx:289