nuspell 5.1.6
Nuspell spellchecking library
Loading...
Searching...
No Matches
unicode.hxx
1/* Copyright 2021-2024 Dimitrij Mijoski
2 *
3 * This file is part of Nuspell.
4 *
5 * Nuspell is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * Nuspell is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef NUSPELL_UNICODE_HXX
19#define NUSPELL_UNICODE_HXX
20#include "defines.hxx"
21
22#include <string>
23#include <string_view>
24#include <unicode/utf16.h>
25#include <unicode/utf8.h>
26
27namespace nuspell {
28NUSPELL_BEGIN_INLINE_NAMESPACE
29
30// UTF-8, work on malformed
31
32inline constexpr auto u8_max_cp_length = U8_MAX_LENGTH;
33
/**
 * Tells whether a code point value produced by the malformed-tolerant UTF-8
 * decoders in this header denotes a decoding error.
 *
 * @param cp value to test.
 * @return true when cp is negative.
 */
inline auto u8_is_cp_error(int32_t cp) -> bool
{
	return cp < 0;
}
35
/**
 * Reads the code point at index i of a possibly malformed UTF-8 range and
 * moves i past it. Thin wrapper over ICU's U8_NEXT.
 *
 * @param str range of UTF-8 code units.
 * @param i   in: index to decode at; out: index after the decoded sequence.
 * @param cp  out: value stored by U8_NEXT (test it with u8_is_cp_error()).
 */
template <class Range>
auto u8_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// For these ICU versions the macro is fed a raw pointer and 32-bit
	// index/length, then the index is copied back.
	const auto bytes = data(str);
	auto pos = static_cast<int32_t>(i);
	const auto length = static_cast<int32_t>(size(str));
	U8_NEXT(bytes, pos, length, cp);
	i = static_cast<size_t>(pos);
#else
	const auto length = size(str);
	U8_NEXT(str, i, length, cp);
#endif
}
51
/**
 * Moves i forward over one code point of a possibly malformed UTF-8 range
 * without producing its value. Thin wrapper over ICU's U8_FWD_1.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index, updated in place by the macro.
 */
template <class Range>
auto u8_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	const auto length = size(str);
	U8_FWD_1(str, i, length);
}
59
/**
 * Reads the code point that ends just before index i of a possibly malformed
 * UTF-8 range and moves i back to its start. Thin wrapper over ICU's U8_PREV
 * with a start limit of 0.
 *
 * The index is handed to the macro as an int32_t and copied back afterwards.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by U8_PREV (test it with u8_is_cp_error()).
 */
template <class Range>
auto u8_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::data;
	const auto bytes = data(str);
	auto pos = static_cast<int32_t>(i);
	U8_PREV(bytes, 0, pos, cp);
	i = static_cast<size_t>(pos);
}
69
/**
 * Moves i backward over one code point of a possibly malformed UTF-8 range
 * without producing its value. Thin wrapper over ICU's U8_BACK_1 with a start
 * limit of 0.
 *
 * The index is handed to the macro as an int32_t and copied back afterwards.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 */
template <class Range>
auto u8_reverse_index(const Range& str, size_t& i) -> void
{
	using std::data;
	const auto bytes = data(str);
	auto pos = static_cast<int32_t>(i);
	U8_BACK_1(bytes, 0, pos);
	i = static_cast<size_t>(pos);
}
79
/**
 * Encodes cp as UTF-8 into buf at index i and moves i past the written
 * bytes. Thin wrapper over ICU's U8_APPEND, with size(buf) as the capacity.
 *
 * @param buf   destination range of UTF-8 code units.
 * @param i     in: index to write at; out: index after the written bytes.
 * @param cp    code point to encode.
 * @param error error flag handed to U8_APPEND. Per the ICU documentation the
 *              macro only sets it on failure and leaves it untouched
 *              otherwise, so callers should initialise it to false.
 */
template <class Range>
auto u8_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// Same shape as the legacy branch of u8_advance_cp: the macro gets a
	// raw pointer plus 32-bit index/length. The pointer was previously
	// computed but the Range itself was passed to the macro.
	// NOTE(review): not verified against ICU <= 60 headers.
	auto ptr = data(buf);
	int32_t idx = i;
	int32_t len = size(buf);
	U8_APPEND(ptr, idx, len, cp, error);
	i = idx;
#else
	auto len = size(buf);
	U8_APPEND(buf, i, len, cp, error);
#endif
}
96
97// UTF-8, valid
98
/**
 * Reads the code point at index i and moves i past it, for input in the
 * "UTF-8, valid" group of this header. Thin wrapper over ICU's
 * U8_NEXT_UNSAFE; no length is passed to the macro.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by the macro.
 */
template <class Range>
auto valid_u8_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_NEXT_UNSAFE(str, i, cp);
}
104
/**
 * Moves i forward over one code point, for input in the "UTF-8, valid" group
 * of this header. Thin wrapper over ICU's U8_FWD_1_UNSAFE; no length is
 * passed to the macro.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 */
template <class Range>
auto valid_u8_advance_index(const Range& str, size_t& i) -> void
{
	U8_FWD_1_UNSAFE(str, i);
}
110
/**
 * Reads the code point that ends just before index i and moves i back to its
 * start, for input in the "UTF-8, valid" group of this header. Thin wrapper
 * over ICU's U8_PREV_UNSAFE; no start limit is passed to the macro.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by the macro.
 */
template <class Range>
auto valid_u8_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_PREV_UNSAFE(str, i, cp);
}
116
/**
 * Moves i backward over one code point, for input in the "UTF-8, valid"
 * group of this header. Thin wrapper over ICU's U8_BACK_1_UNSAFE; no start
 * limit is passed to the macro.
 *
 * @param str range of UTF-8 code units.
 * @param i   in/out index.
 */
template <class Range>
auto valid_u8_reverse_index(const Range& str, size_t& i) -> void
{
	U8_BACK_1_UNSAFE(str, i);
}
122
/**
 * Encodes cp as UTF-8 into buf at index i and moves i past the written
 * bytes. Thin wrapper over ICU's U8_APPEND_UNSAFE; no capacity is passed to
 * the macro and no error is reported.
 *
 * @param buf destination range of UTF-8 code units.
 * @param i   in/out index.
 * @param cp  code point to encode.
 */
template <class Range>
auto valid_u8_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U8_APPEND_UNSAFE(buf, i, cp);
}
128
129// UTF-16, work on malformed
130
131inline constexpr auto u16_max_cp_length = U16_MAX_LENGTH;
132
133auto inline u16_is_cp_error(int32_t cp) -> bool { return U_IS_SURROGATE(cp); }
134
/**
 * Reads the code point at index i of a possibly malformed UTF-16 range and
 * moves i past it. Thin wrapper over ICU's U16_NEXT.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by U16_NEXT (test it with u16_is_cp_error()).
 */
template <class Range>
auto u16_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size;
	const auto length = size(str);
	U16_NEXT(str, i, length, cp);
}
142
/**
 * Moves i forward over one code point of a possibly malformed UTF-16 range
 * without producing its value. Thin wrapper over ICU's U16_FWD_1.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 */
template <class Range>
auto u16_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	const auto length = size(str);
	U16_FWD_1(str, i, length);
}
150
/**
 * Reads the code point that ends just before index i of a possibly malformed
 * UTF-16 range and moves i back to its start. Thin wrapper over ICU's
 * U16_PREV with a start limit of 0.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by U16_PREV (test it with u16_is_cp_error()).
 */
template <class Range>
auto u16_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	U16_PREV(str, 0, i, cp);
}
156
/**
 * Moves i backward over one code point of a possibly malformed UTF-16 range
 * without producing its value. Thin wrapper over ICU's U16_BACK_1 with a
 * start limit of 0.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 */
template <class Range>
auto u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1(str, 0, i);
}
162
/**
 * Encodes cp as UTF-16 into buf at index i and moves i past the written
 * units. Thin wrapper over ICU's U16_APPEND, with size(buf) as the capacity.
 *
 * @param buf   destination range of UTF-16 code units.
 * @param i     in/out index.
 * @param cp    code point to encode.
 * @param error error flag handed to U16_APPEND.
 */
template <class Range>
auto u16_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size;
	const auto capacity = size(buf);
	U16_APPEND(buf, i, capacity, cp, error);
}
171
172// UTF-16, valid
173
/**
 * Reads the code point at index i and moves i past it, for input in the
 * "UTF-16, valid" group of this header. Thin wrapper over ICU's
 * U16_NEXT_UNSAFE; no length is passed to the macro.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by the macro.
 */
template <class Range>
auto valid_u16_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_NEXT_UNSAFE(str, i, cp);
}
179
/**
 * Moves i forward over one code point, for input in the "UTF-16, valid"
 * group of this header. Thin wrapper over ICU's U16_FWD_1_UNSAFE; no length
 * is passed to the macro.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 */
template <class Range>
auto valid_u16_advance_index(const Range& str, size_t& i) -> void
{
	U16_FWD_1_UNSAFE(str, i);
}
185
/**
 * Reads the code point that ends just before index i and moves i back to its
 * start, for input in the "UTF-16, valid" group of this header. Thin wrapper
 * over ICU's U16_PREV_UNSAFE; no start limit is passed to the macro.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 * @param cp  out: value stored by the macro.
 */
template <class Range>
auto valid_u16_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_PREV_UNSAFE(str, i, cp);
}
191
/**
 * Moves i backward over one code point, for input in the "UTF-16, valid"
 * group of this header. Thin wrapper over ICU's U16_BACK_1_UNSAFE; no start
 * limit is passed to the macro.
 *
 * @param str range of UTF-16 code units.
 * @param i   in/out index.
 */
template <class Range>
auto valid_u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1_UNSAFE(str, i);
}
197
/**
 * Encodes cp as UTF-16 into buf at index i and moves i past the written
 * units. Thin wrapper over ICU's U16_APPEND_UNSAFE; no capacity is passed to
 * the macro and no error is reported.
 *
 * @param buf destination range of UTF-16 code units.
 * @param i   in/out index.
 * @param cp  code point to encode.
 */
template <class Range>
auto valid_u16_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U16_APPEND_UNSAFE(buf, i, cp);
}
203
204// higher-level functions
205
/**
 * Half-open byte range [begin_i, end_i) locating one encoded code point
 * inside a UTF-8 string.
 */
struct U8_CP_Pos {
	/** Index of the first byte; 0 unless specified. */
	size_t begin_i{0};
	/** Index one past the last byte; equals begin_i unless specified. */
	size_t end_i{begin_i};
};
210
211class U8_Encoded_CP {
212 char d[u8_max_cp_length];
213 int sz;
214
215 public:
216 explicit U8_Encoded_CP(std::string_view str, U8_CP_Pos pos)
217 : sz(pos.end_i - pos.begin_i)
218 {
219 auto i = sz;
220 auto j = pos.end_i;
221 auto max_len = 4;
222 do {
223 d[--i] = str[--j];
224 } while (i && --max_len);
225 }
226 U8_Encoded_CP(char32_t cp)
227 {
228 size_t z = 0;
229 valid_u8_write_cp_and_advance(d, z, cp);
230 sz = z;
231 }
232 auto size() const noexcept -> size_t { return sz; }
233 auto data() const noexcept -> const char* { return d; }
234 operator std::string_view() const noexcept
235 {
236 return std::string_view(data(), size());
237 }
238 auto copy_to(std::string& str, size_t j) const
239 {
240 auto i = sz;
241 j += sz;
242 auto max_len = 4;
243 do {
244 str[--j] = d[--i];
245 } while (i && --max_len);
246 }
247};
248
249auto inline u8_swap_adjacent_cp(std::string& str, size_t i1, size_t i2,
250 size_t i3) -> size_t
251{
252 auto cp1 = U8_Encoded_CP(str, {i1, i2});
253 auto cp2 = U8_Encoded_CP(str, {i2, i3});
254 auto new_i2 = i1 + std::size(cp2);
255 cp1.copy_to(str, new_i2);
256 cp2.copy_to(str, i1);
257 return new_i2;
258}
259
260auto inline u8_swap_cp(std::string& str, U8_CP_Pos pos1, U8_CP_Pos pos2)
261 -> std::pair<size_t, size_t>
262{
263 using std::size;
264 auto cp1 = U8_Encoded_CP(str, pos1);
265 auto cp2 = U8_Encoded_CP(str, pos2);
266 auto new_p1_end_i = pos1.begin_i + size(cp2);
267 auto new_p2_begin_i = pos2.end_i - size(cp1);
268 std::char_traits<char>::move(&str[new_p1_end_i], &str[pos1.end_i],
269 pos2.begin_i - pos1.end_i);
270 cp2.copy_to(str, pos1.begin_i);
271 cp1.copy_to(str, new_p2_begin_i);
272 return {new_p1_end_i, new_p2_begin_i};
273}
274
275// Below are functions without out-parameters
276
277// UTF-8, can be malformed, no out-parameters
278
/** Result of u8_next_cp(): the advanced index together with the value read. */
struct Idx_And_Next_CP {
	/** Index after the step. */
	size_t end_i;
	/** Value stored by the decoder; see u8_is_cp_error(). */
	int32_t cp;
};
283
/** Result of u8_prev_cp(): the moved-back index together with the value read. */
struct Idx_And_Prev_CP {
	/** Index after the backward step. */
	size_t begin_i;
	/** Value stored by the decoder; see u8_is_cp_error(). */
	int32_t cp;
};
288
/** Result of u8_write_cp(): the index after writing and the error flag. */
struct Write_CP_Idx_and_Error {
	/** Index after the write. */
	size_t end_i;
	/** Error flag as reported by u8_write_cp_and_advance(). */
	bool error;
};
293
294template <class Range>
295[[nodiscard]] auto u8_next_cp(const Range& str, size_t i) -> Idx_And_Next_CP
296{
297 int32_t cp;
298 u8_advance_cp(str, i, cp);
299 return {i, cp};
300}
301
/**
 * Value-returning form of u8_advance_index().
 *
 * @param str range of UTF-8 code units (may be malformed).
 * @param i   starting index.
 * @return the index after stepping forward once.
 */
template <class Range>
[[nodiscard]] auto u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	u8_advance_index(str, next);
	return next;
}
308
309template <class Range>
310[[nodiscard]] auto u8_prev_cp(const Range& str, size_t i) -> Idx_And_Prev_CP
311{
312 int32_t cp;
313 u8_reverse_cp(str, i, cp);
314 return {i, cp};
315}
316
/**
 * Value-returning form of u8_reverse_index().
 *
 * @param str range of UTF-8 code units (may be malformed).
 * @param i   starting index.
 * @return the index after stepping backward once.
 */
template <class Range>
[[nodiscard]] auto u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	u8_reverse_index(str, prev);
	return prev;
}
323
324template <class Range>
325[[nodiscard]] auto u8_write_cp(Range& buf, size_t i, int32_t cp)
326 -> Write_CP_Idx_and_Error
327{
328 bool err;
329 u8_write_cp_and_advance(buf, i, cp, err);
330 return {i, err};
331}
332
333// UTF-8, valid, no out-parameters
334
/** Result of valid_u8_next_cp(): the advanced index and the code point read. */
struct Idx_And_Next_CP_Valid {
	/** Index after the step. */
	size_t end_i;
	/** Decoded code point. */
	char32_t cp;
};
339
/** Result of valid_u8_prev_cp(): the moved-back index and the code point read. */
struct Idx_And_Prev_CP_Valid {
	/** Index after the backward step. */
	size_t begin_i;
	/** Decoded code point. */
	char32_t cp;
};
344
345template <class Range>
346[[nodiscard]] auto valid_u8_next_cp(const Range& str, size_t i)
347 -> Idx_And_Next_CP_Valid
348{
349 char32_t cp;
350 valid_u8_advance_cp(str, i, cp);
351 return {i, cp};
352}
353
/**
 * Value-returning form of valid_u8_advance_index().
 *
 * @param str range of UTF-8 code units.
 * @param i   starting index.
 * @return the index after stepping forward once.
 */
template <class Range>
[[nodiscard]] auto valid_u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next = i;
	valid_u8_advance_index(str, next);
	return next;
}
360
361template <class Range>
362[[nodiscard]] auto valid_u8_prev_cp(const Range& str, size_t i)
363 -> Idx_And_Prev_CP_Valid
364{
365 char32_t cp;
366 valid_u8_reverse_cp(str, i, cp);
367 return {i, cp};
368}
369
/**
 * Value-returning form of valid_u8_reverse_index().
 *
 * @param str range of UTF-8 code units.
 * @param i   starting index.
 * @return the index after stepping backward once.
 */
template <class Range>
[[nodiscard]] auto valid_u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev = i;
	valid_u8_reverse_index(str, prev);
	return prev;
}
376
/**
 * Value-returning form of valid_u8_write_cp_and_advance().
 *
 * cp is received as int32_t and converted to char32_t when it is forwarded.
 *
 * @param buf destination range of UTF-8 code units.
 * @param i   index to write at.
 * @param cp  code point to encode.
 * @return the index after the write.
 */
template <class Range>
[[nodiscard]] auto valid_u8_write_cp(Range& buf, size_t i, int32_t cp) -> size_t
{
	auto end = i;
	valid_u8_write_cp_and_advance(buf, end, cp);
	return end;
}
383NUSPELL_END_INLINE_NAMESPACE
384} // namespace nuspell
385#endif // NUSPELL_UNICODE_HXX
Library main namespace.
Definition aff_data.cxx:33