nuspell 5.1.6
Nuspell spellchecking library
Loading...
Searching...
No Matches
structures.hxx
1/* Copyright 2018-2024 Dimitrij Mijoski
2 *
3 * This file is part of Nuspell.
4 *
5 * Nuspell is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * Nuspell is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#ifndef NUSPELL_STRUCTURES_HXX
20#define NUSPELL_STRUCTURES_HXX
21
22#include "unicode.hxx"
23
24#include <algorithm>
25#include <cmath>
26#include <forward_list>
27#include <functional>
28#include <iterator>
29#include <stack>
30#include <stdexcept>
31#include <string>
32#include <string_view>
33#include <utility>
34#include <vector>
35
36namespace nuspell {
37NUSPELL_BEGIN_INLINE_NAMESPACE
38
39template <class It>
40class Subrange {
41 using Iter_Category =
42 typename std::iterator_traits<It>::iterator_category;
43 It a = {};
44 It b = {};
45
46 public:
47 Subrange() = default;
48 Subrange(std::pair<It, It> p) : a(p.first), b(p.second) {}
49 Subrange(It first, It last) : a(first), b(last) {}
50 template <class Range>
51 Subrange(Range& r) : a(r.begin()), b(r.end())
52 {
53 }
54 auto begin() const -> It { return a; }
55 auto end() const -> It { return b; }
56};
57// CTAD
58template <class Range>
59Subrange(const Range& r) -> Subrange<typename Range::const_iterator>;
60template <class Range>
61Subrange(Range& r) -> Subrange<typename Range::iterator>;
62
70template <class CharT>
71class String_Set {
72 private:
73 std::basic_string<CharT> d;
74 auto sort_uniq() -> void
75 {
76 auto first = begin();
77 auto last = end();
78 using t = traits_type;
79 sort(first, last, t::lt);
80 d.erase(unique(first, last, t::eq), last);
81 }
82 struct Char_Traits_Less_Than {
83 auto operator()(CharT a, CharT b) const noexcept
84 {
85 return traits_type::lt(a, b);
86 }
87 };
88
89 public:
90 using Str = std::basic_string<CharT>;
91 using traits_type = typename Str::traits_type;
92
93 using key_type = typename Str::value_type;
94 using key_compare = Char_Traits_Less_Than;
95 using value_type = typename Str::value_type;
96 using value_compare = key_compare;
97 using allocator_type = typename Str::allocator_type;
98 using pointer = typename Str::pointer;
99 using const_pointer = typename Str::const_pointer;
100 using reference = typename Str::reference;
101 using const_reference = typename Str::const_reference;
102 using size_type = typename Str::size_type;
103 using difference_type = typename Str::difference_type;
104 using iterator = typename Str::iterator;
105 using const_iterator = typename Str::const_iterator;
106 using reverse_iterator = typename Str::reverse_iterator;
107 using const_reverse_iterator = typename Str::const_reverse_iterator;
108
109 String_Set() = default;
110 explicit String_Set(const Str& s) : d(s) { sort_uniq(); }
111 explicit String_Set(Str&& s) : d(move(s)) { sort_uniq(); }
112 explicit String_Set(const CharT* s) : d(s) { sort_uniq(); }
113 template <class InputIterator>
114 String_Set(InputIterator first, InputIterator last) : d(first, last)
115 {
116 sort_uniq();
117 }
118 String_Set(std::initializer_list<value_type> il) : d(il)
119 {
120 sort_uniq();
121 }
122
123 auto& operator=(const Str& s)
124 {
125 d = s;
126 sort_uniq();
127 return *this;
128 }
129 auto& operator=(Str&& s)
130 {
131 d = move(s);
132 sort_uniq();
133 return *this;
134 }
135 auto& operator=(std::initializer_list<value_type> il)
136 {
137 d = il;
138 sort_uniq();
139 return *this;
140 }
141 auto& operator=(const CharT* s)
142 {
143 d = s;
144 sort_uniq();
145 return *this;
146 }
147
148 // non standard underlying storage access:
149 auto data() const noexcept -> const_pointer { return d.data(); }
150 operator const Str&() const noexcept { return d; }
151 auto& str() const noexcept { return d; }
152
153 // iterators:
154 iterator begin() noexcept { return d.begin(); }
155 const_iterator begin() const noexcept { return d.begin(); }
156 iterator end() noexcept { return d.end(); }
157 const_iterator end() const noexcept { return d.end(); }
158
159 reverse_iterator rbegin() noexcept { return d.rbegin(); }
160 const_reverse_iterator rbegin() const noexcept { return d.rbegin(); }
161 reverse_iterator rend() noexcept { return d.rend(); }
162 const_reverse_iterator rend() const noexcept { return d.rend(); }
163
164 const_iterator cbegin() const noexcept { return d.cbegin(); }
165 const_iterator cend() const noexcept { return d.cend(); }
166 const_reverse_iterator crbegin() const noexcept { return d.crbegin(); }
167 const_reverse_iterator crend() const noexcept { return d.crend(); }
168
169 // capacity:
170 bool empty() const noexcept { return d.empty(); }
171 size_type size() const noexcept { return d.size(); }
172 size_type max_size() const noexcept { return d.max_size(); }
173
174 // modifiers:
175 std::pair<iterator, bool> insert(value_type x)
176 {
177 auto it = lower_bound(x);
178 if (it != end() && *it == x) {
179 return {it, false};
180 }
181 auto ret = d.insert(it, x);
182 return {ret, true};
183 }
184 iterator insert(iterator hint, value_type x)
185 {
186 if (hint == end() || traits_type::lt(x, *hint)) {
187 if (hint == begin() ||
188 traits_type::lt(*(hint - 1), x)) {
189 return d.insert(hint, x);
190 }
191 }
192 return insert(x).first;
193 }
194
195 // iterator insert(const_iterator hint, value_type&& x);
196 template <class InputIterator>
197 void insert(InputIterator first, InputIterator last)
198 {
199 d.insert(end(), first, last);
200 sort_uniq();
201 }
202 void insert(std::initializer_list<value_type> il)
203 {
204 d.insert(end(), il);
205 sort_uniq();
206 }
207 template <class... Args>
208 std::pair<iterator, bool> emplace(Args&&... args)
209 {
210 return insert(CharT(args...));
211 }
212
213 template <class... Args>
214 iterator emplace_hint(iterator hint, Args&&... args)
215 {
216 return insert(hint, CharT(args...));
217 }
218
219 iterator erase(iterator position) { return d.erase(position); }
220 size_type erase(key_type x)
221 {
222 auto i = d.find(x);
223 if (i != d.npos) {
224 d.erase(i, 1);
225 return true;
226 }
227 return false;
228 }
229 iterator erase(iterator first, iterator last)
230 {
231 return d.erase(first, last);
232 }
233 void swap(String_Set& s) { d.swap(s.d); }
234 void clear() noexcept { d.clear(); }
235
236 // non standrd modifiers:
237 auto insert(const Str& s) -> void
238 {
239 d += s;
240 sort_uniq();
241 }
242 auto& operator+=(const Str& s)
243 {
244 insert(s);
245 return *this;
246 }
247
248 // observers:
249 key_compare key_comp() const { return Char_Traits_Less_Than(); }
250 value_compare value_comp() const { return key_comp(); }
251
252 // set operations:
253 private:
254 auto lookup(key_type x) const
255 {
256 auto i = d.find(x);
257 if (i == d.npos)
258 i = d.size();
259 return i;
260 }
261
262 public:
263 iterator find(key_type x) { return begin() + lookup(x); }
264 const_iterator find(key_type x) const { return begin() + lookup(x); }
265 size_type count(key_type x) const { return d.find(x) != d.npos; }
266
267 iterator lower_bound(key_type x)
268 {
269 return std::lower_bound(begin(), end(), x, traits_type::lt);
270 }
271
272 const_iterator lower_bound(key_type x) const
273 {
274 return std::lower_bound(begin(), end(), x, traits_type::lt);
275 }
276 iterator upper_bound(key_type x)
277 {
278 return std::upper_bound(begin(), end(), x, traits_type::lt);
279 }
280
281 const_iterator upper_bound(key_type x) const
282 {
283 return std::upper_bound(begin(), end(), x, traits_type::lt);
284 }
285 std::pair<iterator, iterator> equal_range(key_type x)
286 {
287 return std::equal_range(begin(), end(), x, traits_type::lt);
288 }
289
290 std::pair<const_iterator, const_iterator> equal_range(key_type x) const
291 {
292 return std::equal_range(begin(), end(), x, traits_type::lt);
293 }
294
295 // non standard set operations:
296 bool contains(key_type x) const { return count(x); }
297
298 // compare
299 bool operator<(const String_Set& rhs) const { return d < rhs.d; }
300 bool operator<=(const String_Set& rhs) const { return d <= rhs.d; }
301 bool operator==(const String_Set& rhs) const { return d == rhs.d; }
302 bool operator!=(const String_Set& rhs) const { return d != rhs.d; }
303 bool operator>=(const String_Set& rhs) const { return d >= rhs.d; }
304 bool operator>(const String_Set& rhs) const { return d > rhs.d; }
305};
306
307template <class CharT>
308auto swap(String_Set<CharT>& a, String_Set<CharT>& b)
309{
310 a.swap(b);
311}
312
313using Flag_Set = String_Set<char16_t>;
314
315class Substr_Replacer {
316 public:
317 using Str = std::string;
318 using Str_View = std::string_view;
319 using Pair_Str = std::pair<Str, Str>;
320 using Table_Pairs = std::vector<Pair_Str>;
321
322 private:
323 Table_Pairs table;
324 auto sort_uniq() -> void;
325 auto find_match(Str_View s) const;
326
327 public:
328 Substr_Replacer() = default;
329 explicit Substr_Replacer(const Table_Pairs& v) : table(v)
330 {
331 sort_uniq();
332 }
333 explicit Substr_Replacer(Table_Pairs&& v) : table(std::move(v))
334 {
335 sort_uniq();
336 }
337
338 auto& operator=(const Table_Pairs& v)
339 {
340 table = v;
341 sort_uniq();
342 return *this;
343 }
344 auto& operator=(Table_Pairs&& v)
345 {
346 table = std::move(v);
347 sort_uniq();
348 return *this;
349 }
350
351 auto replace(Str& s) const -> Str&;
352 auto replace_copy(Str s) const -> Str
353 {
354 replace(s);
355 return s;
356 }
357};
358
359auto inline Substr_Replacer::sort_uniq() -> void
360{
361 auto first = begin(table);
362 auto last = end(table);
363 sort(first, last, [](auto& a, auto& b) { return a.first < b.first; });
364 auto it = unique(first, last,
365 [](auto& a, auto& b) { return a.first == b.first; });
366 table.erase(it, last);
367
368 // remove empty key ""
369 if (!table.empty() && table.front().first.empty())
370 table.erase(begin(table));
371}
372
373auto inline Substr_Replacer::find_match(Str_View s) const
374{
375 auto& t = table;
376 struct Comparer_Str_Rep {
377 auto static cmp_prefix_of(const Str& p, Str_View of)
378 {
379 return p.compare(0, p.npos, of.data(),
380 std::min(p.size(), of.size()));
381 }
382 auto operator()(const Pair_Str& a, Str_View b)
383 {
384 return cmp_prefix_of(a.first, b) < 0;
385 }
386 auto operator()(Str_View a, const Pair_Str& b)
387 {
388 return cmp_prefix_of(b.first, a) > 0;
389 }
390 auto static eq(const Pair_Str& a, Str_View b)
391 {
392 return cmp_prefix_of(a.first, b) == 0;
393 }
394 };
395 Comparer_Str_Rep csr;
396 auto it = begin(t);
397 auto last_match = end(t);
398 for (;;) {
399 auto it2 = upper_bound(it, end(t), s, csr);
400 if (it2 == it) {
401 // not found, s is smaller that the range
402 break;
403 }
404 --it2;
405 if (csr.eq(*it2, s)) {
406 // Match found. Try another search maybe for
407 // longer.
408 last_match = it2;
409 it = ++it2;
410 }
411 else {
412 // not found, s is greater that the range
413 break;
414 }
415 }
416 return last_match;
417}
418
419auto inline Substr_Replacer::replace(Str& s) const -> Str&
420{
421
422 if (table.empty())
423 return s;
424 for (size_t i = 0; i < s.size(); /*no increment here*/) {
425 auto substr = Str_View(&s[i], s.size() - i);
426 auto it = find_match(substr);
427 if (it != end(table)) {
428 auto& match = *it;
429 // match found. match.first is the found string,
430 // match.second is the replacement.
431 s.replace(i, match.first.size(), match.second);
432 i += match.second.size();
433 continue;
434 }
435 ++i;
436 }
437 return s;
438}
439
440class Break_Table {
441 public:
442 using Str = std::string;
443 using Table_Str = std::vector<Str>;
444 using iterator = typename Table_Str::iterator;
445 using const_iterator = typename Table_Str::const_iterator;
446
447 private:
448 Table_Str table;
449 size_t start_word_breaks_last_idx = 0;
450 size_t end_word_breaks_last_idx = 0;
451
452 auto order_entries() -> void;
453
454 public:
455 Break_Table() = default;
456 explicit Break_Table(const Table_Str& v) : table(v) { order_entries(); }
457 explicit Break_Table(Table_Str&& v) : table(std::move(v))
458 {
459 order_entries();
460 }
461
462 auto& operator=(const Table_Str& v)
463 {
464 table = v;
465 order_entries();
466 return *this;
467 }
468
469 auto& operator=(Table_Str&& v)
470 {
471 table = std::move(v);
472 order_entries();
473 return *this;
474 }
475
476 auto start_word_breaks() const -> Subrange<const_iterator>
477 {
478 return {begin(table),
479 begin(table) + start_word_breaks_last_idx};
480 }
481 auto end_word_breaks() const -> Subrange<const_iterator>
482 {
483 return {begin(table) + start_word_breaks_last_idx,
484 begin(table) + end_word_breaks_last_idx};
485 }
486 auto middle_word_breaks() const -> Subrange<const_iterator>
487 {
488 return {begin(table) + end_word_breaks_last_idx, end(table)};
489 }
490};
491auto inline Break_Table::order_entries() -> void
492{
493 auto it = remove_if(begin(table), end(table), [](auto& s) {
494 return s.empty() ||
495 (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
496 });
497 table.erase(it, end(table));
498
499 auto is_start_word_break = [=](auto& x) { return x[0] == '^'; };
500 auto is_end_word_break = [=](auto& x) { return x.back() == '$'; };
501 auto start_word_breaks_last =
502 partition(begin(table), end(table), is_start_word_break);
503 start_word_breaks_last_idx = start_word_breaks_last - begin(table);
504
505 for_each(begin(table), start_word_breaks_last,
506 [](auto& e) { e.erase(0, 1); });
507
508 auto end_word_breaks_last =
509 partition(start_word_breaks_last, end(table), is_end_word_break);
510 end_word_breaks_last_idx = end_word_breaks_last - begin(table);
511
512 for_each(start_word_breaks_last, end_word_breaks_last,
513 [](auto& e) { e.pop_back(); });
514}
515
516struct identity {
517 template <class T>
518 constexpr auto operator()(T&& t) const noexcept -> T&&
519 {
520 return std::forward<T>(t);
521 }
522};
523
524template <class Key, class T>
525class Hash_Multimap {
526 using bucket_type = std::forward_list<std::pair<Key, T>>;
527 static constexpr float max_load_fact = 7.0 / 8.0;
528 std::vector<bucket_type> data;
529 size_t sz = 0;
530 size_t max_load_factor_capacity = 0;
531
532 public:
533 using key_type = Key;
534 using mapped_type = T;
535 using value_type = std::pair<Key, T>;
536 using size_type = std::size_t;
537 using difference_type = std::ptrdiff_t;
538 using hasher = std::hash<Key>;
539 using reference = value_type&;
540 using const_reference = const value_type&;
541 using pointer = typename bucket_type::pointer;
542 using const_pointer = typename bucket_type::const_pointer;
543 using local_iterator = typename bucket_type::iterator;
544 using local_const_iterator = typename bucket_type::const_iterator;
545
546 Hash_Multimap() = default;
547
548 auto size() const noexcept { return sz; }
549 auto empty() const noexcept { return size() == 0; }
550
551 auto rehash(size_t count)
552 {
553 if (empty()) {
554 size_t capacity = 16;
555 while (capacity <= count)
556 capacity <<= 1;
557 data.resize(capacity);
558 max_load_factor_capacity =
559 std::ceil(capacity * max_load_fact);
560 return;
561 }
562 if (count < size() / max_load_fact)
563 count = size() / max_load_fact;
564 auto n = Hash_Multimap();
565 n.rehash(count);
566 for (auto& b : data) {
567 for (auto& x : b) {
568 n.insert(std::move(x));
569 }
570 }
571 data.swap(n.data);
572 sz = n.sz;
573 max_load_factor_capacity = n.max_load_factor_capacity;
574 }
575
576 auto reserve(size_t count) -> void
577 {
578 rehash(std::ceil(count / max_load_fact));
579 }
580
581 private:
582 auto prepare_for_insert(const key_type& key)
583 -> std::pair<bucket_type&, local_iterator>
584 {
585 auto hash = hasher();
586 if (sz == max_load_factor_capacity) {
587 reserve(sz + 1);
588 }
589 auto h = hash(key);
590 auto h_mod = h & (data.size() - 1);
591 auto& bucket = data[h_mod];
592 auto prev = bucket.before_begin();
593 auto curr = begin(bucket);
594 // find first entry with same key
595 for (; curr != end(bucket); prev = curr++) {
596 if (curr->first == key)
597 break;
598 }
599 // find last entry with same key
600 for (; curr != end(bucket); prev = curr++) {
601 if (curr->first != key)
602 break;
603 }
604 ++sz;
605 return {bucket, prev};
606 }
607
608 public:
609 auto insert(value_type&& value)
610 {
611 auto& key = value.first;
612 auto [bucket, it] = prepare_for_insert(key);
613 return bucket.insert_after(it, move(value));
614 }
615 template <class... Args>
616 auto emplace(Args&&... a)
617 {
618 auto node = bucket_type();
619 auto& val = node.emplace_front(std::forward<Args>(a)...);
620 auto& key = val.first;
621 auto [bucket, it] = prepare_for_insert(key);
622 bucket.splice_after(it, node, node.before_begin());
623 return next(it);
624 }
625
626 auto equal_range(const key_type& key) const
627 -> std::pair<local_const_iterator, local_const_iterator>
628 {
629 auto hash = hasher();
630 if (data.empty())
631 return {};
632 auto h = hash(key);
633 auto h_mod = h & (data.size() - 1);
634 auto& bucket = data[h_mod];
635 auto eq_key = [&](auto& x) { return key == x.first; };
636 auto first = std::find_if(begin(bucket), end(bucket), eq_key);
637 if (first == end(bucket))
638 return {first, first}; // ret empty
639 auto last = std::find_if_not(next(first), end(bucket), eq_key);
640 return {first, last};
641 }
642
643 auto bucket_count() const -> size_type { return data.size(); }
644 auto bucket_data(size_type i) const { return Subrange(data[i]); }
645};
646
647struct Condition_Exception : public std::runtime_error {
648 using std::runtime_error::runtime_error;
649};
650
655class Condition {
656 using Str = std::string;
657 using Str_View = std::string_view;
658
659 Str cond;
660 size_t num_cp = 0;
661
662 auto construct() -> void;
663
664 public:
665 Condition() = default;
666 explicit Condition(const Str& condition) : cond(condition)
667 {
668 construct();
669 }
670 explicit Condition(Str&& condition) : cond(std::move(condition))
671 {
672 construct();
673 }
674 explicit Condition(const char* condition) : cond(condition)
675 {
676 construct();
677 }
678 auto& operator=(const Str& condition)
679 {
680 cond = condition;
681 num_cp = 0;
682 construct();
683 return *this;
684 }
685 auto& operator=(Str&& condition)
686 {
687 cond = std::move(condition);
688 num_cp = 0;
689 construct();
690 return *this;
691 }
692 auto& operator=(const char* condition)
693 {
694 cond = condition;
695 num_cp = 0;
696 construct();
697 return *this;
698 }
699 auto& str() const { return cond; }
700 auto match_prefix(Str_View s) const -> bool;
701 auto match_suffix(Str_View s) const
702 {
703 auto cp_cnt = size_t(0);
704 auto i = size(s);
705 for (; cp_cnt != num_cp && i != 0; ++cp_cnt) {
706 valid_u8_reverse_index(s, i);
707 }
708 if (cp_cnt != num_cp)
709 return false;
710 return match_prefix(s.substr(i));
711 }
712};
713auto inline Condition::construct() -> void
714{
715 for (size_t i = 0; i != size(cond);) {
716 size_t j = cond.find_first_of("[].", i);
717 if (j == cond.npos)
718 j = size(cond);
719 while (i != j) {
720 valid_u8_advance_index(cond, i);
721 ++num_cp;
722 }
723 if (i == size(cond))
724 break;
725 if (cond[i] == '.') {
726 ++num_cp;
727 ++i;
728 continue;
729 }
730 else if (cond[i] == ']') {
731 auto what =
732 "closing bracket has no matching opening bracket";
733 throw Condition_Exception(what);
734 }
735 else if (cond[i] == '[') {
736 ++i;
737 if (i == size(cond)) {
738 auto what = "opening bracket has no matching "
739 "closing bracket";
740 throw Condition_Exception(what);
741 }
742 if (cond[i] == '^')
743 ++i;
744 j = cond.find(']', i);
745 if (j == i) {
746 auto what = "empty bracket expression";
747 throw Condition_Exception(what);
748 }
749 if (j == cond.npos) {
750 auto what = "opening bracket has no matching "
751 "closing bracket";
752 throw Condition_Exception(what);
753 }
754 ++num_cp;
755 i = j + 1;
756 }
757 }
758}
759
760auto inline Condition::match_prefix(Str_View s) const -> bool
761{
762 if (size(s) < num_cp)
763 return false;
764
765 auto s_i = size_t(0);
766 auto cond_i = size_t(0);
767 for (; s_i != size(s) && cond_i != size(cond); ++cond_i) {
768 auto s_cu = s[s_i];
769 auto cond_cu = cond[cond_i];
770 switch (cond_cu) {
771 default:
772 if (s_cu != cond_cu)
773 return false;
774 ++s_i;
775 break;
776 case '.':
777 valid_u8_advance_index(s, s_i);
778 break;
779 case '[':
780 ++cond_i;
781 cond_cu = cond[cond_i];
782 if (cond_cu == '^')
783 ++cond_i;
784 char32_t str_cp;
785 valid_u8_advance_cp(s, s_i, str_cp);
786 auto found = false;
787 while (cond[cond_i] != ']') {
788 char32_t cond_cp;
789 valid_u8_advance_cp(cond, cond_i, cond_cp);
790 if (str_cp == cond_cp)
791 found = true;
792 }
793 if (cond_cu != '^' && !found)
794 return false;
795 if (cond_cu == '^' && found)
796 return false;
797 break;
798 }
799 }
800 return cond_i == size(cond);
801}
802
803struct Prefix {
804 using Str = std::string;
805
806 char16_t flag = 0;
807 bool cross_product = false;
808 Str stripping;
809 Str appending;
810 Flag_Set cont_flags;
811 Condition condition;
812
813 auto to_root(Str& word) const -> Str&
814 {
815 return word.replace(0, appending.size(), stripping);
816 }
817 auto to_root_copy(Str word) const -> Str
818 {
819 to_root(word);
820 return word;
821 }
822
823 auto to_derived(Str& word) const -> Str&
824 {
825 return word.replace(0, stripping.size(), appending);
826 }
827 auto to_derived_copy(Str word) const -> Str
828 {
829 to_derived(word);
830 return word;
831 }
832
833 auto check_condition(const Str& word) const -> bool
834 {
835 return condition.match_prefix(word);
836 }
837};
838
839struct Suffix {
840 using Str = std::string;
841
842 char16_t flag = 0;
843 bool cross_product = false;
844 Str stripping;
845 Str appending;
846 Flag_Set cont_flags;
847 Condition condition;
848
849 auto to_root(Str& word) const -> Str&
850 {
851 return word.replace(word.size() - appending.size(),
852 appending.size(), stripping);
853 }
854 auto to_root_copy(Str word) const -> Str
855 {
856 to_root(word);
857 return word;
858 }
859
860 auto to_derived(Str& word) const -> Str&
861 {
862 return word.replace(word.size() - stripping.size(),
863 stripping.size(), appending);
864 }
865 auto to_derived_copy(Str word) const -> Str
866 {
867 to_derived(word);
868 return word;
869 }
870
871 auto check_condition(const Str& word) const -> bool
872 {
873 return condition.match_suffix(word);
874 }
875};
876
877template <class T, class Key_Extr = identity, class Key_Transform = identity>
878class Prefix_Multiset {
879 public:
880 using Value_Type = T;
881 using Key_Type =
882 std::remove_reference_t<decltype(std::declval<Key_Extr>()(
883 std::declval<T>()))>;
884 using Char_Type = typename Key_Type::value_type;
885 using Traits = typename Key_Type::traits_type;
886 using Vector_Type = std::vector<T>;
887 using Iterator = typename Vector_Type::const_iterator;
888
889 private:
890 struct Ebo_Key_Extr : public Key_Extr {};
891 struct Ebo_Key_Transf : public Key_Transform {};
892 struct Ebo : public Ebo_Key_Extr, Ebo_Key_Transf {
893 Vector_Type table;
894 } ebo;
895 std::basic_string<Char_Type> first_letter;
896 std::vector<size_t> prefix_idx_with_first_letter;
897
898 auto key_extractor() const -> const Ebo_Key_Extr& { return ebo; }
899 auto key_transformator() const -> const Ebo_Key_Transf& { return ebo; }
900 auto& get_table() { return ebo.table; }
901 auto& get_table() const { return ebo.table; }
902
903 auto sort()
904 {
905 auto& extract_key = key_extractor();
906 auto& transform_key = key_transformator();
907 auto& table = get_table();
908
909 std::sort(begin(table), end(table), [&](T& a, T& b) {
910 auto&& key_a = transform_key(extract_key(a));
911 auto&& key_b = transform_key(extract_key(b));
912 return key_a < key_b;
913 });
914
915 first_letter.clear();
916 prefix_idx_with_first_letter.clear();
917 auto first = begin(table);
918 auto last = end(table);
919 auto it = std::find_if_not(first, last, [=](const T& x) {
920 auto&& k = transform_key(extract_key(x));
921 return k.empty();
922 });
923 while (it != last) {
924 auto&& k1 = transform_key(extract_key(*it));
925 first_letter.push_back(k1[0]);
926 prefix_idx_with_first_letter.push_back(it - first);
927
928 it = std::upper_bound(
929 it, last, k1[0],
930 Comparator{0, extract_key, transform_key});
931 }
932 if (!prefix_idx_with_first_letter.empty())
933 prefix_idx_with_first_letter.push_back(last - first);
934 }
935
936 struct Comparator {
937 size_t i;
938 Key_Extr extract_key;
939 Key_Transform transform_key;
940 auto operator()(const T& a, Char_Type b) const
941 {
942 auto&& key_a = transform_key(extract_key(a));
943 return Traits::lt(key_a[i], b);
944 }
945 auto operator()(Char_Type a, const T& b) const
946 {
947 auto&& key_b = transform_key(extract_key(b));
948 return Traits::lt(a, key_b[i]);
949 }
950 };
951
952 public:
953 Prefix_Multiset() = default;
954 explicit Prefix_Multiset(Key_Extr ke, Key_Transform kt = {})
955 : ebo{{ke}, {kt}, {}}
956 {
957 }
958 explicit Prefix_Multiset(const Vector_Type& v, Key_Extr ke = {},
959 Key_Transform kt = {})
960 : ebo{{ke}, {kt}, v}
961 {
962 sort();
963 }
964 explicit Prefix_Multiset(Vector_Type&& v, Key_Extr ke = {},
965 Key_Transform kt = {})
966 : ebo{{ke}, {kt}, std::move(v)}
967 {
968 sort();
969 }
970 Prefix_Multiset(std::initializer_list<T> list, Key_Extr ke = {},
971 Key_Transform kt = {})
972 : ebo{{ke}, {kt}, list}
973 {
974 sort();
975 }
976 auto& operator=(const Vector_Type& v)
977 {
978 get_table() = v;
979 sort();
980 return *this;
981 }
982 auto& operator=(Vector_Type&& v)
983 {
984 get_table() = std::move(v);
985 sort();
986 return *this;
987 }
988 auto& operator=(std::initializer_list<T> list)
989 {
990 get_table() = list;
991 sort();
992 return *this;
993 }
994 auto& data() const { return get_table(); }
995
996 template <class Func>
997 auto for_each_prefixes_of(const Key_Type& word, Func func) const;
998
999 template <class OutIter>
1000 auto copy_all_prefixes_of(const Key_Type& word, OutIter o) const
1001 {
1002 for_each_prefixes_of(word, [&](const T& x) { *o++ = x; });
1003 return o;
1004 }
1005
1006 class Iter_Prefixes_Of {
1007 const Prefix_Multiset* set = {};
1008 Iterator it = {};
1009 Iterator last = {};
1010 const Key_Type* search_key = {};
1011 size_t len = {};
1012 bool valid = false;
1013
1014 auto advance() -> void;
1015
1016 public:
1017 using iterator_category = std::input_iterator_tag;
1018 using value_type = T;
1019 using size_type = size_t;
1020 using difference_type = ptrdiff_t;
1021 using reference = const T&;
1022 using pointer = const T*;
1023
1024 Iter_Prefixes_Of() = default;
1025 Iter_Prefixes_Of(const Prefix_Multiset& set,
1026 const Key_Type& word)
1027 : set(&set), it(set.get_table().begin()),
1028 last(set.get_table().end()), search_key(&word),
1029 valid(true)
1030 {
1031 advance();
1032 }
1033 Iter_Prefixes_Of(const Prefix_Multiset&, Key_Type&&) = delete;
1034 Iter_Prefixes_Of(Prefix_Multiset&&, const Key_Type&) = delete;
1035 Iter_Prefixes_Of(Prefix_Multiset&&, Key_Type&&) = delete;
1036
1037 auto& operator++()
1038 {
1039 ++it;
1040 advance();
1041 return *this;
1042 }
1043 auto& operator++(int)
1044 {
1045 auto old = *this;
1046 ++*this;
1047 return old;
1048 }
1049 auto& operator*() const { return *it; }
1050 auto operator->() const { return &*it; }
1051 auto operator==(const Iter_Prefixes_Of& other) const
1052 {
1053 return valid == other.valid;
1054 }
1055 auto operator!=(const Iter_Prefixes_Of& other) const
1056 {
1057 return valid != other.valid;
1058 }
1059
1060 operator bool() const { return valid; }
1061
1062 auto begin() const { return *this; }
1063 auto end() const { return Iter_Prefixes_Of(); }
1064 };
1065
1066 auto iterate_prefixes_of(const Key_Type& word) const
1067 {
1068 return Iter_Prefixes_Of(*this, word);
1069 }
1070 auto iterate_prefixes_of(Key_Type&& word) const = delete;
1071};
1072template <class T, class Key_Extr, class Key_Transform>
1073auto Prefix_Multiset<T, Key_Extr, Key_Transform>::Iter_Prefixes_Of::advance()
1074 -> void
1075{
1076 auto& extract_key = set->key_extractor();
1077 auto& transform_key = set->key_transformator();
1078 auto& table = set->get_table();
1079
1080 if (len == 0) {
1081 if (it != last) {
1082 if (transform_key(extract_key(*it)).empty())
1083 return;
1084 if (transform_key(*search_key).empty()) {
1085 valid = false;
1086 return;
1087 }
1088
1089 auto& first_letter = set->first_letter;
1090 auto& prefix_idx_with_first_letter =
1091 set->prefix_idx_with_first_letter;
1092 auto idx =
1093 first_letter.find(transform_key(*search_key)[0]);
1094 if (idx == first_letter.npos) {
1095 valid = false;
1096 return;
1097 }
1098
1099 auto first = table.begin();
1100 it = first + prefix_idx_with_first_letter[idx];
1101 last = first + prefix_idx_with_first_letter[idx + 1];
1102 ++len;
1103 }
1104 else {
1105 valid = false;
1106 return;
1107 }
1108 }
1109 for (;; ++len) {
1110 if (it != last) {
1111 if (transform_key(extract_key(*it)).size() == len)
1112 return;
1113 if (len == transform_key(*search_key).size()) {
1114 valid = false;
1115 break;
1116 }
1117 }
1118 else {
1119 valid = false;
1120 break;
1121 }
1122 tie(it, last) =
1123 equal_range(it, last, transform_key(*search_key)[len],
1124 Comparator{len, extract_key, transform_key});
1125 }
1126}
1127
1128template <class T, class Key_Extr, class Key_Transform>
1129template <class Func>
1130auto Prefix_Multiset<T, Key_Extr, Key_Transform>::for_each_prefixes_of(
1131 const Key_Type& word, Func func) const
1132{
1133 auto& extract_key = key_extractor();
1134 auto& transform_key = key_transformator();
1135 auto& table = get_table();
1136
1137 auto first = begin(table);
1138 auto last = end(table);
1139 auto it = first;
1140 for (; it != last; ++it) {
1141 auto&& k = transform_key(extract_key(*it));
1142 if (k.empty())
1143 func(*it);
1144 else
1145 break;
1146 }
1147
1148 if (it == last)
1149 return;
1150 if (transform_key(word).empty())
1151 return;
1152
1153 auto idx = first_letter.find(transform_key(word)[0]);
1154 if (idx == first_letter.npos)
1155 return;
1156 first = begin(table) + prefix_idx_with_first_letter[idx];
1157 last = begin(table) + prefix_idx_with_first_letter[idx + 1];
1158
1159 for (size_t len = 1;; ++len) {
1160 it = first;
1161 for (; it != last &&
1162 transform_key(extract_key(*it)).size() == len;
1163 ++it) {
1164 func(*it);
1165 }
1166 if (it == last)
1167 break;
1168 if (len == transform_key(word).size())
1169 break;
1170 tie(first, last) =
1171 equal_range(it, last, transform_key(word)[len],
1172 Comparator{len, extract_key, transform_key});
1173 }
1174}
1175
1176template <class CharT>
1177class Reversed_String_View {
1178 public:
1179 using Str = std::basic_string<CharT>;
1180 using traits_type = typename Str::traits_type;
1181 using value_type = typename Str::value_type;
1182 using size_type = typename Str::size_type;
1183 using const_iterator = typename Str::const_reverse_iterator;
1184
1185 private:
1186 const_iterator first = {};
1187 size_type sz = {};
1188
1189 public:
1190 Reversed_String_View() = default;
1191 explicit Reversed_String_View(const Str& s)
1192 : first(s.rbegin()), sz(s.size())
1193 {
1194 }
1195 Reversed_String_View(Str&& s) = delete;
1196 auto& operator[](size_type i) const { return first[i]; }
1197 auto size() const { return sz; }
1198 auto empty() const { return sz == 0; }
1199 auto begin() const { return first; }
1200 auto end() const { return first + sz; }
1201 auto operator<(Reversed_String_View other) const
1202 {
1203 return lexicographical_compare(begin(), end(), other.begin(),
1204 other.end(), traits_type::lt);
1205 }
1206};
1207
1208template <class CharT>
1209struct String_Reverser {
1210 auto operator()(const std::basic_string<CharT>& x) const
1211 {
1212 return Reversed_String_View<CharT>(x);
1213 }
1214 // auto operator()(T&& x) const = delete;
1215};
1216
1217template <class AffixT>
1218struct Extractor_Of_Appending_From_Affix {
1219 auto& operator()(const AffixT& a) const { return a.appending; }
1220};
1221
1222template <class T, class Key_Extr = identity>
1223using Suffix_Multiset = Prefix_Multiset<
1224 T, Key_Extr,
1225 String_Reverser<typename Prefix_Multiset<T, Key_Extr>::Char_Type>>;
1226
1227class Prefix_Table {
1228 using Prefix_Multiset_Type =
1229 Prefix_Multiset<Prefix, Extractor_Of_Appending_From_Affix<Prefix>>;
1230 using Key_Type = typename Prefix_Multiset_Type::Key_Type;
1231 using Vector_Type = typename Prefix_Multiset_Type::Vector_Type;
1232 Prefix_Multiset_Type table;
1233 Flag_Set all_cont_flags;
1234
1235 auto populate()
1236 {
1237 for (auto& x : table.data())
1238 all_cont_flags += x.cont_flags;
1239 }
1240
1241 public:
1242 Prefix_Table() = default;
1243 explicit Prefix_Table(const Vector_Type& t) : table(t) { populate(); }
1244 explicit Prefix_Table(Vector_Type&& t) : table(std::move(t))
1245 {
1246 populate();
1247 }
1248 auto& operator=(const Vector_Type& t)
1249 {
1250 table = t;
1251 populate();
1252 return *this;
1253 }
1254 auto& operator=(Vector_Type&& t)
1255 {
1256 table = std::move(t);
1257 populate();
1258 return *this;
1259 }
1260 auto begin() const { return table.data().begin(); }
1261 auto end() const { return table.data().end(); }
1262 auto size() const { return table.data().size(); }
1263
1264 auto has_continuation_flags() const
1265 {
1266 return all_cont_flags.size() != 0;
1267 }
1268 auto has_continuation_flag(char16_t flag) const
1269 {
1270 return all_cont_flags.contains(flag);
1271 }
1272 auto iterate_prefixes_of(const Key_Type& word) const
1273 {
1274 return table.iterate_prefixes_of(word);
1275 }
1276 auto iterate_prefixes_of(Key_Type&& word) const = delete;
1277};
1278
1279class Suffix_Table {
1280 using Suffix_Multiset_Type =
1281 Suffix_Multiset<Suffix, Extractor_Of_Appending_From_Affix<Suffix>>;
1282 using Key_Type = typename Suffix_Multiset_Type::Key_Type;
1283 using Vector_Type = typename Suffix_Multiset_Type::Vector_Type;
1284 Suffix_Multiset_Type table;
1285 Flag_Set all_cont_flags;
1286
1287 auto populate()
1288 {
1289 for (auto& x : table.data())
1290 all_cont_flags += x.cont_flags;
1291 }
1292
1293 public:
1294 Suffix_Table() = default;
1295 explicit Suffix_Table(const Vector_Type& t) : table(t) { populate(); }
1296 explicit Suffix_Table(Vector_Type&& t) : table(std::move(t))
1297 {
1298 populate();
1299 }
1300 auto& operator=(const Vector_Type& t)
1301 {
1302 table = t;
1303 populate();
1304 return *this;
1305 }
1306 auto& operator=(Vector_Type&& t)
1307 {
1308 table = std::move(t);
1309 populate();
1310 return *this;
1311 }
1312 auto begin() const { return table.data().begin(); }
1313 auto end() const { return table.data().end(); }
1314 auto size() const { return table.data().size(); }
1315
1316 auto has_continuation_flags() const
1317 {
1318 return all_cont_flags.size() != 0;
1319 }
1320 auto has_continuation_flag(char16_t flag) const
1321 {
1322 return all_cont_flags.contains(flag);
1323 }
1324 auto iterate_suffixes_of(const Key_Type& word) const
1325 {
1326 return table.iterate_prefixes_of(word);
1327 }
1328 auto iterate_suffixes_of(Key_Type&& word) const = delete;
1329};
1330
1335class String_Pair {
1336 using Str = std::string;
1337 using Str_View = std::string_view;
1338 size_t i = 0;
1339 Str s;
1340
1341 public:
1342 String_Pair() = default;
1343 template <class Str1>
1344 explicit String_Pair(Str1&& str, size_t i)
1345 : i(i), s(std::forward<Str1>(str))
1346 {
1347 if (i > s.size())
1348 throw std::out_of_range("word split is too long");
1349 }
1350
1351 template <class Str1, class Str2,
1352 class = std::enable_if_t<
1353 std::is_same<std::remove_reference_t<Str1>, Str>::value &&
1354 std::is_same<std::remove_reference_t<Str2>, Str>::value>>
1355 String_Pair(Str1&& first, Str2&& second)
1356 : i(first.size()) /* must be init before s, before we move
1357 from first */
1358 ,
1359 s(std::forward<Str1>(first) + std::forward<Str2>(second))
1360
1361 {
1362 }
1363 auto first() const { return Str_View(s).substr(0, i); }
1364 auto second() const { return Str_View(s).substr(i); }
1365 auto first(Str_View x)
1366 {
1367 s.replace(0, i, x);
1368 i = x.size();
1369 }
1370 auto second(Str_View x) { s.replace(i, s.npos, x); }
1371 auto& str() const { return s; }
1372 auto idx() const { return i; }
1373};
1374struct Compound_Pattern {
1375 String_Pair begin_end_chars;
1376 std::string replacement;
1377 char16_t first_word_flag = 0;
1378 char16_t second_word_flag = 0;
1379 bool match_first_only_unaffixed_or_zero_affixed = false;
1380};
1381
1382class Compound_Rule_Table {
1383 std::vector<std::u16string> rules;
1384 Flag_Set all_flags;
1385
1386 auto fill_all_flags() -> void;
1387
1388 public:
1389 Compound_Rule_Table() = default;
1390 explicit Compound_Rule_Table(const std::vector<std::u16string>& tbl)
1391 : rules(tbl)
1392 {
1393 fill_all_flags();
1394 }
1395 explicit Compound_Rule_Table(std::vector<std::u16string>&& tbl)
1396 : rules(std::move(tbl))
1397 {
1398 fill_all_flags();
1399 }
1400 auto& operator=(const std::vector<std::u16string>& tbl)
1401 {
1402 rules = tbl;
1403 fill_all_flags();
1404 return *this;
1405 }
1406 auto& operator=(std::vector<std::u16string>&& tbl)
1407 {
1408 rules = std::move(tbl);
1409 fill_all_flags();
1410 return *this;
1411 }
1412 auto empty() const { return rules.empty(); }
1413 auto has_any_of_flags(const Flag_Set& f) const -> bool;
1414 auto match_any_rule(const std::vector<const Flag_Set*>& data) const
1415 -> bool;
1416};
1417auto inline Compound_Rule_Table::fill_all_flags() -> void
1418{
1419 for (auto& f : rules) {
1420 all_flags += f;
1421 }
1422 all_flags.erase(u'?');
1423 all_flags.erase(u'*');
1424}
1425
1426auto inline Compound_Rule_Table::has_any_of_flags(const Flag_Set& f) const
1427 -> bool
1428{
1429 using std::begin;
1430 using std::end;
1431 struct Out_Iter_One_Bool {
1432 bool* value = nullptr;
1433 auto& operator++() { return *this; }
1434 auto& operator++(int) { return *this; }
1435 auto& operator*() { return *this; }
1436 auto& operator=(char16_t)
1437 {
1438 *value = true;
1439 return *this;
1440 }
1441 };
1442 auto has_intersection = false;
1443 auto out_it = Out_Iter_One_Bool{&has_intersection};
1444 std::set_intersection(begin(all_flags), end(all_flags), begin(f),
1445 end(f), out_it);
1446 return has_intersection;
1447}
1448
1449template <class DataIter, class PatternIter, class FuncEq = std::equal_to<>>
1450auto match_simple_regex(DataIter data_first, DataIter data_last,
1451 PatternIter pat_first, PatternIter pat_last,
1452 FuncEq eq = FuncEq())
1453{
1454 auto s = std::stack<std::pair<DataIter, PatternIter>>();
1455 s.emplace(data_first, pat_first);
1456 auto data_it = DataIter();
1457 auto pat_it = PatternIter();
1458 while (!s.empty()) {
1459 std::tie(data_it, pat_it) = s.top();
1460 s.pop();
1461 if (pat_it == pat_last) {
1462 if (data_it == data_last)
1463 return true;
1464 else
1465 return false;
1466 }
1467 auto node_type = *pat_it;
1468 if (pat_it + 1 == pat_last)
1469 node_type = 0;
1470 else
1471 node_type = *(pat_it + 1);
1472 switch (node_type) {
1473 case '?':
1474 s.emplace(data_it, pat_it + 2);
1475 if (data_it != data_last && eq(*data_it, *pat_it))
1476 s.emplace(data_it + 1, pat_it + 2);
1477 break;
1478 case '*':
1479 s.emplace(data_it, pat_it + 2);
1480 if (data_it != data_last && eq(*data_it, *pat_it))
1481 s.emplace(data_it + 1, pat_it);
1482
1483 break;
1484 default:
1485 if (data_it != data_last && eq(*data_it, *pat_it))
1486 s.emplace(data_it + 1, pat_it + 1);
1487 break;
1488 }
1489 }
1490 return false;
1491}
1492
1493template <class DataRange, class PatternRange, class FuncEq = std::equal_to<>>
1494auto match_simple_regex(const DataRange& data, const PatternRange& pattern,
1495 FuncEq eq = FuncEq())
1496{
1497 using namespace std;
1498 return match_simple_regex(begin(data), end(data), begin(pattern),
1499 end(pattern), eq);
1500}
1501
1502auto inline match_compund_rule(const std::vector<const Flag_Set*>& words_data,
1503 const std::u16string& pattern)
1504{
1505 return match_simple_regex(
1506 words_data, pattern,
1507 [](const Flag_Set* d, char16_t p) { return d->contains(p); });
1508}
1509
1510auto inline Compound_Rule_Table::match_any_rule(
1511 const std::vector<const Flag_Set*>& data) const -> bool
1512{
1513 return any_of(begin(rules), end(rules), [&](const std::u16string& p) {
1514 return match_compund_rule(data, p);
1515 });
1516}
1517
1518using List_Strings = std::vector<std::string>;
1519
1520class Replacement_Table {
1521 public:
1522 using Str = std::string;
1523 using Table_Str = std::vector<std::pair<Str, Str>>;
1524 using iterator = typename Table_Str::iterator;
1525 using const_iterator = typename Table_Str::const_iterator;
1526
1527 private:
1528 Table_Str table;
1529 size_t whole_word_reps_last_idx = 0;
1530 size_t start_word_reps_last_idx = 0;
1531 size_t end_word_reps_last_idx = 0;
1532
1533 auto order_entries() -> void;
1534
1535 public:
1536 Replacement_Table() = default;
1537 explicit Replacement_Table(const Table_Str& v) : table(v)
1538 {
1539 order_entries();
1540 }
1541 explicit Replacement_Table(Table_Str&& v) : table(std::move(v))
1542 {
1543 order_entries();
1544 }
1545
1546 auto& operator=(const Table_Str& v)
1547 {
1548 table = v;
1549 order_entries();
1550 return *this;
1551 }
1552
1553 auto& operator=(Table_Str&& v)
1554 {
1555 table = std::move(v);
1556 order_entries();
1557 return *this;
1558 }
1559
1560 auto whole_word_replacements() const -> Subrange<const_iterator>
1561 {
1562 return {begin(table), begin(table) + whole_word_reps_last_idx};
1563 }
1564 auto start_word_replacements() const -> Subrange<const_iterator>
1565 {
1566 return {begin(table) + whole_word_reps_last_idx,
1567 begin(table) + start_word_reps_last_idx};
1568 }
1569 auto end_word_replacements() const -> Subrange<const_iterator>
1570 {
1571 return {begin(table) + start_word_reps_last_idx,
1572 begin(table) + end_word_reps_last_idx};
1573 }
1574 auto any_place_replacements() const -> Subrange<const_iterator>
1575 {
1576 return {begin(table) + end_word_reps_last_idx, end(table)};
1577 }
1578};
1579auto inline Replacement_Table::order_entries() -> void
1580{
1581 auto it = remove_if(begin(table), end(table), [](auto& p) {
1582 auto& s = p.first;
1583 return s.empty() ||
1584 (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
1585 });
1586 table.erase(it, end(table));
1587
1588 auto is_start_word_pat = [=](auto& x) { return x.first[0] == '^'; };
1589 auto is_end_word_pat = [=](auto& x) { return x.first.back() == '$'; };
1590
1591 auto start_word_reps_last =
1592 partition(begin(table), end(table), is_start_word_pat);
1593 start_word_reps_last_idx = start_word_reps_last - begin(table);
1594 for_each(begin(table), start_word_reps_last,
1595 [](auto& e) { e.first.erase(0, 1); });
1596
1597 auto whole_word_reps_last =
1598 partition(begin(table), start_word_reps_last, is_end_word_pat);
1599 whole_word_reps_last_idx = whole_word_reps_last - begin(table);
1600 for_each(begin(table), whole_word_reps_last,
1601 [](auto& e) { e.first.pop_back(); });
1602
1603 auto end_word_reps_last =
1604 partition(start_word_reps_last, end(table), is_end_word_pat);
1605 end_word_reps_last_idx = end_word_reps_last - begin(table);
1606 for_each(start_word_reps_last, end_word_reps_last,
1607 [](auto& e) { e.first.pop_back(); });
1608}
1609
1610struct Similarity_Group {
1611 using Str = std::string;
1612 Str chars;
1613 std::vector<Str> strings;
1614
1615 auto parse(const Str& s) -> void;
1616 Similarity_Group() = default;
1617 explicit Similarity_Group(const Str& s) { parse(s); }
1618 auto& operator=(const Str& s)
1619 {
1620 parse(s);
1621 return *this;
1622 }
1623};
1624auto inline Similarity_Group::parse(const Str& s) -> void
1625{
1626 auto i = size_t(0);
1627 for (;;) {
1628 auto j = s.find('(', i);
1629 chars.append(s, i, j - i);
1630 if (j == s.npos)
1631 break;
1632 i = j + 1;
1633 j = s.find(')', i);
1634 if (j == s.npos)
1635 break;
1636 auto len = j - i;
1637 if (len == 1)
1638 chars += s[i];
1639 else if (len > 1)
1640 strings.push_back(s.substr(i, len));
1641 i = j + 1;
1642 }
1643}
1644
1645class Phonetic_Table {
1646 using Char_Type = char; // not aware of unicode
1647 using Str = std::string;
1648 using Pair_Str = std::pair<Str, Str>;
1649
1650 struct Phonet_Match_Result {
1651 size_t count_matched = 0;
1652 size_t go_back_before_replace = 0;
1653 size_t priority = 5;
1654 bool go_back_after_replace = false;
1655 bool treat_next_as_begin = false;
1656 operator bool() { return count_matched; }
1657 };
1658
1659 std::vector<std::pair<Str, Str>> table;
1660 auto order() -> void;
1661 auto static match(const Str& data, size_t i, const Str& pattern,
1662 bool at_begin) -> Phonet_Match_Result;
1663
1664 public:
1665 Phonetic_Table() = default;
1666 explicit Phonetic_Table(const std::vector<Pair_Str>& v) : table(v)
1667 {
1668 order();
1669 }
1670 explicit Phonetic_Table(std::vector<Pair_Str>&& v) : table(std::move(v))
1671 {
1672 order();
1673 }
1674 auto& operator=(const std::vector<Pair_Str>& v)
1675 {
1676 table = v;
1677 order();
1678 return *this;
1679 }
1680 auto& operator=(std::vector<Pair_Str>&& v)
1681 {
1682 table = std::move(v);
1683 order();
1684 return *this;
1685 }
1686 auto replace(Str& word) const -> bool;
1687};
1688
1689auto inline Phonetic_Table::order() -> void
1690{
1691 stable_sort(begin(table), end(table), [](auto& pair1, auto& pair2) {
1692 if (pair2.first.empty())
1693 return false;
1694 if (pair1.first.empty())
1695 return true;
1696 return pair1.first[0] < pair2.first[0];
1697 });
1698 auto it = find_if_not(begin(table), end(table),
1699 [](auto& p) { return p.first.empty(); });
1700 table.erase(begin(table), it);
1701 for (auto& r : table) {
1702 if (r.second == "_")
1703 r.second.clear();
1704 }
1705}
1706
1707auto inline Phonetic_Table::match(const Str& data, size_t i, const Str& pattern,
1708 bool at_begin) -> Phonet_Match_Result
1709{
1710 auto ret = Phonet_Match_Result();
1711 auto j = pattern.find_first_of("(<-0123456789^$");
1712 if (j == pattern.npos)
1713 j = pattern.size();
1714 if (data.compare(i, j, pattern, 0, j) == 0)
1715 ret.count_matched = j;
1716 else
1717 return {};
1718 if (j == pattern.size())
1719 return ret;
1720 if (pattern[j] == '(') {
1721 auto k = pattern.find(')', j);
1722 if (k == pattern.npos)
1723 return {}; // bad rule
1724 auto x = std::char_traits<Char_Type>::find(
1725 &pattern[j + 1], k - (j + 1), data[i + j]);
1726 if (!x)
1727 return {};
1728 j = k + 1;
1729 ret.count_matched += 1;
1730 }
1731 if (j == pattern.size())
1732 return ret;
1733 if (pattern[j] == '<') {
1734 ret.go_back_after_replace = true;
1735 ++j;
1736 }
1737 auto k = pattern.find_first_not_of('-', j);
1738 if (k == pattern.npos) {
1739 k = pattern.size();
1740 ret.go_back_before_replace = k - j;
1741 if (ret.go_back_before_replace >= ret.count_matched)
1742 return {}; // bad rule
1743 return ret;
1744 }
1745 else {
1746 ret.go_back_before_replace = k - j;
1747 if (ret.go_back_before_replace >= ret.count_matched)
1748 return {}; // bad rule
1749 }
1750 j = k;
1751 if (pattern[j] >= '0' && pattern[j] <= '9') {
1752 ret.priority = pattern[j] - '0';
1753 ++j;
1754 }
1755 if (j == pattern.size())
1756 return ret;
1757 if (pattern[j] == '^') {
1758 if (!at_begin)
1759 return {};
1760 ++j;
1761 }
1762 if (j == pattern.size())
1763 return ret;
1764 if (pattern[j] == '^') {
1765 ret.treat_next_as_begin = true;
1766 ++j;
1767 }
1768 if (j == pattern.size())
1769 return ret;
1770 if (pattern[j] != '$')
1771 return {}; // bad rule, no other char is allowed at this point
1772 if (i + ret.count_matched == data.size())
1773 return ret;
1774 return {};
1775}
1776
1777auto inline Phonetic_Table::replace(Str& word) const -> bool
1778{
1779 struct Cmp {
1780 auto operator()(Char_Type c, const Pair_Str& s)
1781 {
1782 return c < s.first[0];
1783 }
1784 auto operator()(const Pair_Str& s, Char_Type c)
1785 {
1786 return s.first[0] < c;
1787 }
1788 };
1789 if (table.empty())
1790 return false;
1791 auto ret = false;
1792 auto treat_next_as_begin = true;
1793 size_t count_go_backs_after_replace = 0; // avoid infinite loop
1794 for (size_t i = 0; i != word.size(); ++i) {
1795 auto rules =
1796 equal_range(begin(table), end(table), word[i], Cmp());
1797 for (auto& r : Subrange(rules)) {
1798 auto rule = &r;
1799 auto m1 = match(word, i, r.first, treat_next_as_begin);
1800 if (!m1)
1801 continue;
1802 if (!m1.go_back_before_replace) {
1803 auto j = i + m1.count_matched - 1;
1804 auto rules2 = equal_range(
1805 begin(table), end(table), word[j], Cmp());
1806 for (auto& r2 : Subrange(rules2)) {
1807 auto m2 =
1808 match(word, j, r2.first, false);
1809 if (m2 && m2.priority >= m1.priority) {
1810 i = j;
1811 rule = &r2;
1812 m1 = m2;
1813 break;
1814 }
1815 }
1816 }
1817 word.replace(
1818 i, m1.count_matched - m1.go_back_before_replace,
1819 rule->second);
1820 treat_next_as_begin = m1.treat_next_as_begin;
1821 if (m1.go_back_after_replace &&
1822 count_go_backs_after_replace < 100) {
1823 count_go_backs_after_replace++;
1824 }
1825 else {
1826 i += rule->second.size();
1827 }
1828 --i;
1829 ret = true;
1830 break;
1831 }
1832 }
1833 return ret;
1834}
1835NUSPELL_END_INLINE_NAMESPACE
1836} // namespace nuspell
1837#endif // NUSPELL_STRUCTURES_HXX
Library main namespace.
Definition aff_data.cxx:33