Nuspell
spell checker
structures.hxx
1 /* Copyright 2018-2023 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #ifndef NUSPELL_STRUCTURES_HXX
20 #define NUSPELL_STRUCTURES_HXX
21 
22 #include "unicode.hxx"
23 
24 #include <algorithm>
25 #include <cmath>
26 #include <forward_list>
27 #include <functional>
28 #include <iterator>
29 #include <stack>
30 #include <stdexcept>
31 #include <string>
32 #include <string_view>
33 #include <utility>
34 #include <vector>
35 
36 namespace nuspell {
37 NUSPELL_BEGIN_INLINE_NAMESPACE
38 
39 template <class It>
40 class Subrange {
41  using Iter_Category =
42  typename std::iterator_traits<It>::iterator_category;
43  It a = {};
44  It b = {};
45 
46  public:
47  Subrange() = default;
48  Subrange(std::pair<It, It> p) : a(p.first), b(p.second) {}
49  Subrange(It first, It last) : a(first), b(last) {}
50  template <class Range>
51  Subrange(Range& r) : a(r.begin()), b(r.end())
52  {
53  }
54  auto begin() const -> It { return a; }
55  auto end() const -> It { return b; }
56 };
57 // CTAD
58 template <class Range>
60 template <class Range>
62 
70 template <class CharT>
71 class String_Set {
72  private:
73  std::basic_string<CharT> d;
74  auto sort_uniq() -> void
75  {
76  auto first = begin();
77  auto last = end();
78  using t = traits_type;
79  sort(first, last, t::lt);
80  d.erase(unique(first, last, t::eq), last);
81  }
82  struct Char_Traits_Less_Than {
83  auto operator()(CharT a, CharT b) const noexcept
84  {
85  return traits_type::lt(a, b);
86  }
87  };
88 
89  public:
90  using Str = std::basic_string<CharT>;
91  using traits_type = typename Str::traits_type;
92 
93  using key_type = typename Str::value_type;
94  using key_compare = Char_Traits_Less_Than;
95  using value_type = typename Str::value_type;
96  using value_compare = key_compare;
97  using allocator_type = typename Str::allocator_type;
98  using pointer = typename Str::pointer;
99  using const_pointer = typename Str::const_pointer;
100  using reference = typename Str::reference;
101  using const_reference = typename Str::const_reference;
102  using size_type = typename Str::size_type;
103  using difference_type = typename Str::difference_type;
104  using iterator = typename Str::iterator;
105  using const_iterator = typename Str::const_iterator;
106  using reverse_iterator = typename Str::reverse_iterator;
107  using const_reverse_iterator = typename Str::const_reverse_iterator;
108 
109  String_Set() = default;
110  explicit String_Set(const Str& s) : d(s) { sort_uniq(); }
111  explicit String_Set(Str&& s) : d(move(s)) { sort_uniq(); }
112  explicit String_Set(const CharT* s) : d(s) { sort_uniq(); }
113  template <class InputIterator>
114  String_Set(InputIterator first, InputIterator last) : d(first, last)
115  {
116  sort_uniq();
117  }
118  String_Set(std::initializer_list<value_type> il) : d(il)
119  {
120  sort_uniq();
121  }
122 
123  auto& operator=(const Str& s)
124  {
125  d = s;
126  sort_uniq();
127  return *this;
128  }
129  auto& operator=(Str&& s)
130  {
131  d = move(s);
132  sort_uniq();
133  return *this;
134  }
135  auto& operator=(std::initializer_list<value_type> il)
136  {
137  d = il;
138  sort_uniq();
139  return *this;
140  }
141  auto& operator=(const CharT* s)
142  {
143  d = s;
144  sort_uniq();
145  return *this;
146  }
147 
148  // non standard underlying storage access:
149  auto data() const noexcept -> const_pointer { return d.data(); }
150  operator const Str&() const noexcept { return d; }
151  auto& str() const noexcept { return d; }
152 
153  // iterators:
154  iterator begin() noexcept { return d.begin(); }
155  const_iterator begin() const noexcept { return d.begin(); }
156  iterator end() noexcept { return d.end(); }
157  const_iterator end() const noexcept { return d.end(); }
158 
159  reverse_iterator rbegin() noexcept { return d.rbegin(); }
160  const_reverse_iterator rbegin() const noexcept { return d.rbegin(); }
161  reverse_iterator rend() noexcept { return d.rend(); }
162  const_reverse_iterator rend() const noexcept { return d.rend(); }
163 
164  const_iterator cbegin() const noexcept { return d.cbegin(); }
165  const_iterator cend() const noexcept { return d.cend(); }
166  const_reverse_iterator crbegin() const noexcept { return d.crbegin(); }
167  const_reverse_iterator crend() const noexcept { return d.crend(); }
168 
169  // capacity:
170  bool empty() const noexcept { return d.empty(); }
171  size_type size() const noexcept { return d.size(); }
172  size_type max_size() const noexcept { return d.max_size(); }
173 
174  // modifiers:
175  std::pair<iterator, bool> insert(value_type x)
176  {
177  auto it = lower_bound(x);
178  if (it != end() && *it == x) {
179  return {it, false};
180  }
181  auto ret = d.insert(it, x);
182  return {ret, true};
183  }
184  iterator insert(iterator hint, value_type x)
185  {
186  if (hint == end() || traits_type::lt(x, *hint)) {
187  if (hint == begin() ||
188  traits_type::lt(*(hint - 1), x)) {
189  return d.insert(hint, x);
190  }
191  }
192  return insert(x).first;
193  }
194 
195  // iterator insert(const_iterator hint, value_type&& x);
196  template <class InputIterator>
197  void insert(InputIterator first, InputIterator last)
198  {
199  d.insert(end(), first, last);
200  sort_uniq();
201  }
202  void insert(std::initializer_list<value_type> il)
203  {
204  d.insert(end(), il);
205  sort_uniq();
206  }
207  template <class... Args>
208  std::pair<iterator, bool> emplace(Args&&... args)
209  {
210  return insert(CharT(args...));
211  }
212 
213  template <class... Args>
214  iterator emplace_hint(iterator hint, Args&&... args)
215  {
216  return insert(hint, CharT(args...));
217  }
218 
219  iterator erase(iterator position) { return d.erase(position); }
220  size_type erase(key_type x)
221  {
222  auto i = d.find(x);
223  if (i != d.npos) {
224  d.erase(i, 1);
225  return true;
226  }
227  return false;
228  }
229  iterator erase(iterator first, iterator last)
230  {
231  return d.erase(first, last);
232  }
233  void swap(String_Set& s) { d.swap(s.d); }
234  void clear() noexcept { d.clear(); }
235 
236  // non standrd modifiers:
237  auto insert(const Str& s) -> void
238  {
239  d += s;
240  sort_uniq();
241  }
242  auto& operator+=(const Str& s)
243  {
244  insert(s);
245  return *this;
246  }
247 
248  // observers:
249  key_compare key_comp() const { return Char_Traits_Less_Than(); }
250  value_compare value_comp() const { return key_comp(); }
251 
252  // set operations:
253  private:
254  auto lookup(key_type x) const
255  {
256  auto i = d.find(x);
257  if (i == d.npos)
258  i = d.size();
259  return i;
260  }
261 
262  public:
263  iterator find(key_type x) { return begin() + lookup(x); }
264  const_iterator find(key_type x) const { return begin() + lookup(x); }
265  size_type count(key_type x) const { return d.find(x) != d.npos; }
266 
267  iterator lower_bound(key_type x)
268  {
269  return std::lower_bound(begin(), end(), x, traits_type::lt);
270  }
271 
272  const_iterator lower_bound(key_type x) const
273  {
274  return std::lower_bound(begin(), end(), x, traits_type::lt);
275  }
276  iterator upper_bound(key_type x)
277  {
278  return std::upper_bound(begin(), end(), x, traits_type::lt);
279  }
280 
281  const_iterator upper_bound(key_type x) const
282  {
283  return std::upper_bound(begin(), end(), x, traits_type::lt);
284  }
285  std::pair<iterator, iterator> equal_range(key_type x)
286  {
287  return std::equal_range(begin(), end(), x, traits_type::lt);
288  }
289 
290  std::pair<const_iterator, const_iterator> equal_range(key_type x) const
291  {
292  return std::equal_range(begin(), end(), x, traits_type::lt);
293  }
294 
295  // non standard set operations:
296  bool contains(key_type x) const { return count(x); }
297 
298  // compare
299  bool operator<(const String_Set& rhs) const { return d < rhs.d; }
300  bool operator<=(const String_Set& rhs) const { return d <= rhs.d; }
301  bool operator==(const String_Set& rhs) const { return d == rhs.d; }
302  bool operator!=(const String_Set& rhs) const { return d != rhs.d; }
303  bool operator>=(const String_Set& rhs) const { return d >= rhs.d; }
304  bool operator>(const String_Set& rhs) const { return d > rhs.d; }
305 };
306 
307 template <class CharT>
308 auto swap(String_Set<CharT>& a, String_Set<CharT>& b)
309 {
310  a.swap(b);
311 }
312 
313 using Flag_Set = String_Set<char16_t>;
314 
316  public:
317  using Str = std::string;
318  using Str_View = std::string_view;
319  using Pair_Str = std::pair<Str, Str>;
320  using Table_Pairs = std::vector<Pair_Str>;
321 
322  private:
323  Table_Pairs table;
324  auto sort_uniq() -> void;
325  auto find_match(Str_View s) const;
326 
327  public:
328  Substr_Replacer() = default;
329  explicit Substr_Replacer(const Table_Pairs& v) : table(v)
330  {
331  sort_uniq();
332  }
333  explicit Substr_Replacer(Table_Pairs&& v) : table(move(v))
334  {
335  sort_uniq();
336  }
337 
338  auto& operator=(const Table_Pairs& v)
339  {
340  table = v;
341  sort_uniq();
342  return *this;
343  }
344  auto& operator=(Table_Pairs&& v)
345  {
346  table = move(v);
347  sort_uniq();
348  return *this;
349  }
350 
351  auto replace(Str& s) const -> Str&;
352  auto replace_copy(Str s) const -> Str
353  {
354  replace(s);
355  return s;
356  }
357 };
358 
359 auto inline Substr_Replacer::sort_uniq() -> void
360 {
361  auto first = begin(table);
362  auto last = end(table);
363  sort(first, last, [](auto& a, auto& b) { return a.first < b.first; });
364  auto it = unique(first, last,
365  [](auto& a, auto& b) { return a.first == b.first; });
366  table.erase(it, last);
367 
368  // remove empty key ""
369  if (!table.empty() && table.front().first.empty())
370  table.erase(begin(table));
371 }
372 
373 auto inline Substr_Replacer::find_match(Str_View s) const
374 {
375  auto& t = table;
376  struct Comparer_Str_Rep {
377  auto static cmp_prefix_of(const Str& p, Str_View of)
378  {
379  return p.compare(0, p.npos, of.data(),
380  std::min(p.size(), of.size()));
381  }
382  auto operator()(const Pair_Str& a, Str_View b)
383  {
384  return cmp_prefix_of(a.first, b) < 0;
385  }
386  auto operator()(Str_View a, const Pair_Str& b)
387  {
388  return cmp_prefix_of(b.first, a) > 0;
389  }
390  auto static eq(const Pair_Str& a, Str_View b)
391  {
392  return cmp_prefix_of(a.first, b) == 0;
393  }
394  };
395  Comparer_Str_Rep csr;
396  auto it = begin(t);
397  auto last_match = end(t);
398  for (;;) {
399  auto it2 = upper_bound(it, end(t), s, csr);
400  if (it2 == it) {
401  // not found, s is smaller that the range
402  break;
403  }
404  --it2;
405  if (csr.eq(*it2, s)) {
406  // Match found. Try another search maybe for
407  // longer.
408  last_match = it2;
409  it = ++it2;
410  }
411  else {
412  // not found, s is greater that the range
413  break;
414  }
415  }
416  return last_match;
417 }
418 
419 auto inline Substr_Replacer::replace(Str& s) const -> Str&
420 {
421 
422  if (table.empty())
423  return s;
424  for (size_t i = 0; i < s.size(); /*no increment here*/) {
425  auto substr = Str_View(&s[i], s.size() - i);
426  auto it = find_match(substr);
427  if (it != end(table)) {
428  auto& match = *it;
429  // match found. match.first is the found string,
430  // match.second is the replacement.
431  s.replace(i, match.first.size(), match.second);
432  i += match.second.size();
433  continue;
434  }
435  ++i;
436  }
437  return s;
438 }
439 
440 class Break_Table {
441  public:
442  using Str = std::string;
443  using Table_Str = std::vector<Str>;
444  using iterator = typename Table_Str::iterator;
445  using const_iterator = typename Table_Str::const_iterator;
446 
447  private:
448  Table_Str table;
449  size_t start_word_breaks_last_idx = 0;
450  size_t end_word_breaks_last_idx = 0;
451 
452  auto order_entries() -> void;
453 
454  public:
455  Break_Table() = default;
456  explicit Break_Table(const Table_Str& v) : table(v) { order_entries(); }
457  explicit Break_Table(Table_Str&& v) : table(move(v))
458  {
459  order_entries();
460  }
461 
462  auto& operator=(const Table_Str& v)
463  {
464  table = v;
465  order_entries();
466  return *this;
467  }
468 
469  auto& operator=(Table_Str&& v)
470  {
471  table = move(v);
472  order_entries();
473  return *this;
474  }
475 
476  auto start_word_breaks() const -> Subrange<const_iterator>
477  {
478  return {begin(table),
479  begin(table) + start_word_breaks_last_idx};
480  }
481  auto end_word_breaks() const -> Subrange<const_iterator>
482  {
483  return {begin(table) + start_word_breaks_last_idx,
484  begin(table) + end_word_breaks_last_idx};
485  }
486  auto middle_word_breaks() const -> Subrange<const_iterator>
487  {
488  return {begin(table) + end_word_breaks_last_idx, end(table)};
489  }
490 };
491 auto inline Break_Table::order_entries() -> void
492 {
493  auto it = remove_if(begin(table), end(table), [](auto& s) {
494  return s.empty() ||
495  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
496  });
497  table.erase(it, end(table));
498 
499  auto is_start_word_break = [=](auto& x) { return x[0] == '^'; };
500  auto is_end_word_break = [=](auto& x) { return x.back() == '$'; };
501  auto start_word_breaks_last =
502  partition(begin(table), end(table), is_start_word_break);
503  start_word_breaks_last_idx = start_word_breaks_last - begin(table);
504 
505  for_each(begin(table), start_word_breaks_last,
506  [](auto& e) { e.erase(0, 1); });
507 
508  auto end_word_breaks_last =
509  partition(start_word_breaks_last, end(table), is_end_word_break);
510  end_word_breaks_last_idx = end_word_breaks_last - begin(table);
511 
512  for_each(start_word_breaks_last, end_word_breaks_last,
513  [](auto& e) { e.pop_back(); });
514 }
515 
516 struct identity {
517  template <class T>
518  constexpr auto operator()(T&& t) const noexcept -> T&&
519  {
520  return std::forward<T>(t);
521  }
522 };
523 
524 template <class Key, class T>
526  using bucket_type = std::forward_list<std::pair<Key, T>>;
527  static constexpr float max_load_fact = 7.0 / 8.0;
528  std::vector<bucket_type> data;
529  size_t sz = 0;
530  size_t max_load_factor_capacity = 0;
531 
532  public:
533  using key_type = Key;
534  using mapped_type = T;
535  using value_type = std::pair<Key, T>;
536  using size_type = std::size_t;
537  using difference_type = std::ptrdiff_t;
538  using hasher = std::hash<Key>;
539  using reference = value_type&;
540  using const_reference = const value_type&;
541  using pointer = typename bucket_type::pointer;
542  using const_pointer = typename bucket_type::const_pointer;
543  using local_iterator = typename bucket_type::iterator;
544  using local_const_iterator = typename bucket_type::const_iterator;
545 
546  Hash_Multimap() = default;
547 
548  auto size() const noexcept { return sz; }
549  auto empty() const noexcept { return size() == 0; }
550 
551  auto rehash(size_t count)
552  {
553  if (empty()) {
554  size_t capacity = 16;
555  while (capacity <= count)
556  capacity <<= 1;
557  data.resize(capacity);
558  max_load_factor_capacity =
559  std::ceil(capacity * max_load_fact);
560  return;
561  }
562  if (count < size() / max_load_fact)
563  count = size() / max_load_fact;
564  auto n = Hash_Multimap();
565  n.rehash(count);
566  for (auto& b : data) {
567  for (auto& x : b) {
568  n.insert(std::move(x));
569  }
570  }
571  data.swap(n.data);
572  sz = n.sz;
573  max_load_factor_capacity = n.max_load_factor_capacity;
574  }
575 
576  auto reserve(size_t count) -> void
577  {
578  rehash(std::ceil(count / max_load_fact));
579  }
580 
581  private:
582  auto prepare_for_insert(const key_type& key)
583  -> std::pair<bucket_type&, local_iterator>
584  {
585  auto hash = hasher();
586  if (sz == max_load_factor_capacity) {
587  reserve(sz + 1);
588  }
589  auto h = hash(key);
590  auto h_mod = h & (data.size() - 1);
591  auto& bucket = data[h_mod];
592  auto prev = bucket.before_begin();
593  auto curr = begin(bucket);
594  // find first entry with same key
595  for (; curr != end(bucket); prev = curr++) {
596  if (curr->first == key)
597  break;
598  }
599  // find last entry with same key
600  for (; curr != end(bucket); prev = curr++) {
601  if (curr->first != key)
602  break;
603  }
604  ++sz;
605  return {bucket, prev};
606  }
607 
608  public:
609  auto insert(value_type&& value)
610  {
611  auto& key = value.first;
612  auto [bucket, it] = prepare_for_insert(key);
613  return bucket.insert_after(it, move(value));
614  }
615  template <class... Args>
616  auto emplace(Args&&... a)
617  {
618  auto node = bucket_type();
619  auto& val = node.emplace_front(std::forward<Args>(a)...);
620  auto& key = val.first;
621  auto [bucket, it] = prepare_for_insert(key);
622  bucket.splice_after(it, node, node.before_begin());
623  return next(it);
624  }
625 
626  auto equal_range(const key_type& key) const
627  -> std::pair<local_const_iterator, local_const_iterator>
628  {
629  auto hash = hasher();
630  if (data.empty())
631  return {};
632  auto h = hash(key);
633  auto h_mod = h & (data.size() - 1);
634  auto& bucket = data[h_mod];
635  auto eq_key = [&](auto& x) { return key == x.first; };
636  auto first = std::find_if(begin(bucket), end(bucket), eq_key);
637  if (first == end(bucket))
638  return {first, first}; // ret empty
639  auto last = std::find_if_not(next(first), end(bucket), eq_key);
640  return {first, last};
641  }
642 
643  auto bucket_count() const -> size_type { return data.size(); }
644  auto bucket_data(size_type i) const { return Subrange(data[i]); }
645 };
646 
647 struct Condition_Exception : public std::runtime_error {
648  using std::runtime_error::runtime_error;
649 };
650 
655 class Condition {
656  using Str = std::string;
657  using Str_View = std::string_view;
658 
659  Str cond;
660  size_t num_cp = 0;
661 
662  auto construct() -> void;
663 
664  public:
665  Condition() = default;
666  explicit Condition(const Str& condition) : cond(condition)
667  {
668  construct();
669  }
670  explicit Condition(Str&& condition) : cond(move(condition))
671  {
672  construct();
673  }
674  explicit Condition(const char* condition) : cond(condition)
675  {
676  construct();
677  }
678  auto& operator=(const Str& condition)
679  {
680  cond = condition;
681  num_cp = 0;
682  construct();
683  return *this;
684  }
685  auto& operator=(Str&& condition)
686  {
687  cond = std::move(condition);
688  num_cp = 0;
689  construct();
690  return *this;
691  }
692  auto& operator=(const char* condition)
693  {
694  cond = condition;
695  num_cp = 0;
696  construct();
697  return *this;
698  }
699  auto& str() const { return cond; }
700  auto match_prefix(Str_View s) const -> bool;
701  auto match_suffix(Str_View s) const
702  {
703  auto cp_cnt = size_t(0);
704  auto i = size(s);
705  for (; cp_cnt != num_cp && i != 0; ++cp_cnt) {
706  valid_u8_reverse_index(s, i);
707  }
708  if (cp_cnt != num_cp)
709  return false;
710  return match_prefix(s.substr(i));
711  }
712 };
713 auto inline Condition::construct() -> void
714 {
715  for (size_t i = 0; i != size(cond);) {
716  size_t j = cond.find_first_of("[].", i);
717  if (j == cond.npos)
718  j = size(cond);
719  while (i != j) {
720  valid_u8_advance_index(cond, i);
721  ++num_cp;
722  }
723  if (i == size(cond))
724  break;
725  if (cond[i] == '.') {
726  ++num_cp;
727  ++i;
728  continue;
729  }
730  else if (cond[i] == ']') {
731  auto what =
732  "closing bracket has no matching opening bracket";
733  throw Condition_Exception(what);
734  }
735  else if (cond[i] == '[') {
736  ++i;
737  if (i == size(cond)) {
738  auto what = "opening bracket has no matching "
739  "closing bracket";
740  throw Condition_Exception(what);
741  }
742  if (cond[i] == '^')
743  ++i;
744  j = cond.find(']', i);
745  if (j == i) {
746  auto what = "empty bracket expression";
747  throw Condition_Exception(what);
748  }
749  if (j == cond.npos) {
750  auto what = "opening bracket has no matching "
751  "closing bracket";
752  throw Condition_Exception(what);
753  }
754  ++num_cp;
755  i = j + 1;
756  }
757  }
758 }
759 
760 auto inline Condition::match_prefix(Str_View s) const -> bool
761 {
762  if (size(s) < num_cp)
763  return false;
764 
765  auto s_i = size_t(0);
766  auto cond_i = size_t(0);
767  for (; s_i != size(s) && cond_i != size(cond); ++cond_i) {
768  auto s_cu = s[s_i];
769  auto cond_cu = cond[cond_i];
770  switch (cond_cu) {
771  default:
772  if (s_cu != cond_cu)
773  return false;
774  ++s_i;
775  break;
776  case '.':
777  valid_u8_advance_index(s, s_i);
778  break;
779  case '[':
780  ++cond_i;
781  cond_cu = cond[cond_i];
782  if (cond_cu == '^')
783  ++cond_i;
784  char32_t str_cp;
785  valid_u8_advance_cp(s, s_i, str_cp);
786  auto found = false;
787  while (cond[cond_i] != ']') {
788  char32_t cond_cp;
789  valid_u8_advance_cp(cond, cond_i, cond_cp);
790  if (str_cp == cond_cp)
791  found = true;
792  }
793  if (cond_cu != '^' && !found)
794  return false;
795  if (cond_cu == '^' && found)
796  return false;
797  break;
798  }
799  }
800  return cond_i == size(cond);
801 }
802 
803 struct Prefix {
804  using Str = std::string;
805 
806  char16_t flag = 0;
807  bool cross_product = false;
808  Str stripping;
809  Str appending;
810  Flag_Set cont_flags;
811  Condition condition;
812 
813  auto to_root(Str& word) const -> Str&
814  {
815  return word.replace(0, appending.size(), stripping);
816  }
817  auto to_root_copy(Str word) const -> Str
818  {
819  to_root(word);
820  return word;
821  }
822 
823  auto to_derived(Str& word) const -> Str&
824  {
825  return word.replace(0, stripping.size(), appending);
826  }
827  auto to_derived_copy(Str word) const -> Str
828  {
829  to_derived(word);
830  return word;
831  }
832 
833  auto check_condition(const Str& word) const -> bool
834  {
835  return condition.match_prefix(word);
836  }
837 };
838 
839 struct Suffix {
840  using Str = std::string;
841 
842  char16_t flag = 0;
843  bool cross_product = false;
844  Str stripping;
845  Str appending;
846  Flag_Set cont_flags;
847  Condition condition;
848 
849  auto to_root(Str& word) const -> Str&
850  {
851  return word.replace(word.size() - appending.size(),
852  appending.size(), stripping);
853  }
854  auto to_root_copy(Str word) const -> Str
855  {
856  to_root(word);
857  return word;
858  }
859 
860  auto to_derived(Str& word) const -> Str&
861  {
862  return word.replace(word.size() - stripping.size(),
863  stripping.size(), appending);
864  }
865  auto to_derived_copy(Str word) const -> Str
866  {
867  to_derived(word);
868  return word;
869  }
870 
871  auto check_condition(const Str& word) const -> bool
872  {
873  return condition.match_suffix(word);
874  }
875 };
876 
877 template <class T, class Key_Extr = identity, class Key_Transform = identity>
879  public:
880  using Value_Type = T;
881  using Key_Type =
882  std::remove_reference_t<decltype(std::declval<Key_Extr>()(
883  std::declval<T>()))>;
884  using Char_Type = typename Key_Type::value_type;
885  using Traits = typename Key_Type::traits_type;
886  using Vector_Type = std::vector<T>;
887  using Iterator = typename Vector_Type::const_iterator;
888 
889  private:
890  struct Ebo_Key_Extr : public Key_Extr {};
891  struct Ebo_Key_Transf : public Key_Transform {};
892  struct Ebo : public Ebo_Key_Extr, Ebo_Key_Transf {
893  Vector_Type table;
894  } ebo;
895  std::basic_string<Char_Type> first_letter;
896  std::vector<size_t> prefix_idx_with_first_letter;
897 
898  auto key_extractor() const -> const Ebo_Key_Extr& { return ebo; }
899  auto key_transformator() const -> const Ebo_Key_Transf& { return ebo; }
900  auto& get_table() { return ebo.table; }
901  auto& get_table() const { return ebo.table; }
902 
903  auto sort()
904  {
905  auto& extract_key = key_extractor();
906  auto& transform_key = key_transformator();
907  auto& table = get_table();
908 
909  std::sort(begin(table), end(table), [&](T& a, T& b) {
910  auto&& key_a = transform_key(extract_key(a));
911  auto&& key_b = transform_key(extract_key(b));
912  return key_a < key_b;
913  });
914 
915  first_letter.clear();
916  prefix_idx_with_first_letter.clear();
917  auto first = begin(table);
918  auto last = end(table);
919  auto it = std::find_if_not(first, last, [=](const T& x) {
920  auto&& k = transform_key(extract_key(x));
921  return k.empty();
922  });
923  while (it != last) {
924  auto&& k1 = transform_key(extract_key(*it));
925  first_letter.push_back(k1[0]);
926  prefix_idx_with_first_letter.push_back(it - first);
927 
928  it = std::upper_bound(
929  it, last, k1[0],
930  Comparator{0, extract_key, transform_key});
931  }
932  if (!prefix_idx_with_first_letter.empty())
933  prefix_idx_with_first_letter.push_back(last - first);
934  }
935 
936  struct Comparator {
937  size_t i;
938  Key_Extr extract_key;
939  Key_Transform transform_key;
940  auto operator()(const T& a, Char_Type b) const
941  {
942  auto&& key_a = transform_key(extract_key(a));
943  return Traits::lt(key_a[i], b);
944  }
945  auto operator()(Char_Type a, const T& b) const
946  {
947  auto&& key_b = transform_key(extract_key(b));
948  return Traits::lt(a, key_b[i]);
949  }
950  };
951 
952  public:
953  Prefix_Multiset() = default;
954  explicit Prefix_Multiset(Key_Extr ke, Key_Transform kt = {})
955  : ebo{{ke}, {kt}, {}}
956  {
957  }
958  explicit Prefix_Multiset(const Vector_Type& v, Key_Extr ke = {},
959  Key_Transform kt = {})
960  : ebo{{ke}, {kt}, v}
961  {
962  sort();
963  }
964  explicit Prefix_Multiset(Vector_Type&& v, Key_Extr ke = {},
965  Key_Transform kt = {})
966  : ebo{{ke}, {kt}, std::move(v)}
967  {
968  sort();
969  }
970  Prefix_Multiset(std::initializer_list<T> list, Key_Extr ke = {},
971  Key_Transform kt = {})
972  : ebo{{ke}, {kt}, list}
973  {
974  sort();
975  }
976  auto& operator=(const Vector_Type& v)
977  {
978  get_table() = v;
979  sort();
980  return *this;
981  }
982  auto& operator=(Vector_Type&& v)
983  {
984  get_table() = std::move(v);
985  sort();
986  return *this;
987  }
988  auto& operator=(std::initializer_list<T> list)
989  {
990  get_table() = list;
991  sort();
992  return *this;
993  }
994  auto& data() const { return get_table(); }
995 
996  template <class Func>
997  auto for_each_prefixes_of(const Key_Type& word, Func func) const;
998 
999  template <class OutIter>
1000  auto copy_all_prefixes_of(const Key_Type& word, OutIter o) const
1001  {
1002  for_each_prefixes_of(word, [&](const T& x) { *o++ = x; });
1003  return o;
1004  }
1005 
1007  const Prefix_Multiset* set = {};
1008  Iterator it = {};
1009  Iterator last = {};
1010  const Key_Type* search_key = {};
1011  size_t len = {};
1012  bool valid = false;
1013 
1014  auto advance() -> void;
1015 
1016  public:
1017  using iterator_category = std::input_iterator_tag;
1018  using value_type = T;
1019  using size_type = size_t;
1020  using difference_type = ptrdiff_t;
1021  using reference = const T&;
1022  using pointer = const T*;
1023 
1024  Iter_Prefixes_Of() = default;
1025  Iter_Prefixes_Of(const Prefix_Multiset& set,
1026  const Key_Type& word)
1027  : set(&set), it(set.get_table().begin()),
1028  last(set.get_table().end()), search_key(&word),
1029  valid(true)
1030  {
1031  advance();
1032  }
1033  Iter_Prefixes_Of(const Prefix_Multiset&, Key_Type&&) = delete;
1034  Iter_Prefixes_Of(Prefix_Multiset&&, const Key_Type&) = delete;
1035  Iter_Prefixes_Of(Prefix_Multiset&&, Key_Type&&) = delete;
1036 
1037  auto& operator++()
1038  {
1039  ++it;
1040  advance();
1041  return *this;
1042  }
1043  auto& operator++(int)
1044  {
1045  auto old = *this;
1046  ++*this;
1047  return old;
1048  }
1049  auto& operator*() const { return *it; }
1050  auto operator->() const { return &*it; }
1051  auto operator==(const Iter_Prefixes_Of& other) const
1052  {
1053  return valid == other.valid;
1054  }
1055  auto operator!=(const Iter_Prefixes_Of& other) const
1056  {
1057  return valid != other.valid;
1058  }
1059 
1060  operator bool() const { return valid; }
1061 
1062  auto begin() const { return *this; }
1063  auto end() const { return Iter_Prefixes_Of(); }
1064  };
1065 
1066  auto iterate_prefixes_of(const Key_Type& word) const
1067  {
1068  return Iter_Prefixes_Of(*this, word);
1069  }
1070  auto iterate_prefixes_of(Key_Type&& word) const = delete;
1071 };
1072 template <class T, class Key_Extr, class Key_Transform>
1073 auto Prefix_Multiset<T, Key_Extr, Key_Transform>::Iter_Prefixes_Of::advance()
1074  -> void
1075 {
1076  auto& extract_key = set->key_extractor();
1077  auto& transform_key = set->key_transformator();
1078  auto& table = set->get_table();
1079 
1080  if (len == 0) {
1081  if (it != last) {
1082  if (transform_key(extract_key(*it)).empty())
1083  return;
1084  if (transform_key(*search_key).empty()) {
1085  valid = false;
1086  return;
1087  }
1088 
1089  auto& first_letter = set->first_letter;
1090  auto& prefix_idx_with_first_letter =
1091  set->prefix_idx_with_first_letter;
1092  auto idx =
1093  first_letter.find(transform_key(*search_key)[0]);
1094  if (idx == first_letter.npos) {
1095  valid = false;
1096  return;
1097  }
1098 
1099  auto first = table.begin();
1100  it = first + prefix_idx_with_first_letter[idx];
1101  last = first + prefix_idx_with_first_letter[idx + 1];
1102  ++len;
1103  }
1104  else {
1105  valid = false;
1106  return;
1107  }
1108  }
1109  for (;; ++len) {
1110  if (it != last) {
1111  if (transform_key(extract_key(*it)).size() == len)
1112  return;
1113  if (len == transform_key(*search_key).size()) {
1114  valid = false;
1115  break;
1116  }
1117  }
1118  else {
1119  valid = false;
1120  break;
1121  }
1122  tie(it, last) =
1123  equal_range(it, last, transform_key(*search_key)[len],
1124  Comparator{len, extract_key, transform_key});
1125  }
1126 }
1127 
1128 template <class T, class Key_Extr, class Key_Transform>
1129 template <class Func>
1130 auto Prefix_Multiset<T, Key_Extr, Key_Transform>::for_each_prefixes_of(
1131  const Key_Type& word, Func func) const
1132 {
1133  auto& extract_key = key_extractor();
1134  auto& transform_key = key_transformator();
1135  auto& table = get_table();
1136 
1137  auto first = begin(table);
1138  auto last = end(table);
1139  auto it = first;
1140  for (; it != last; ++it) {
1141  auto&& k = transform_key(extract_key(*it));
1142  if (k.empty())
1143  func(*it);
1144  else
1145  break;
1146  }
1147 
1148  if (it == last)
1149  return;
1150  if (transform_key(word).empty())
1151  return;
1152 
1153  auto idx = first_letter.find(transform_key(word)[0]);
1154  if (idx == first_letter.npos)
1155  return;
1156  first = begin(table) + prefix_idx_with_first_letter[idx];
1157  last = begin(table) + prefix_idx_with_first_letter[idx + 1];
1158 
1159  for (size_t len = 1;; ++len) {
1160  it = first;
1161  for (; it != last &&
1162  transform_key(extract_key(*it)).size() == len;
1163  ++it) {
1164  func(*it);
1165  }
1166  if (it == last)
1167  break;
1168  if (len == transform_key(word).size())
1169  break;
1170  tie(first, last) =
1171  equal_range(it, last, transform_key(word)[len],
1172  Comparator{len, extract_key, transform_key});
1173  }
1174 }
1175 
1176 template <class CharT>
1178  public:
1179  using Str = std::basic_string<CharT>;
1180  using traits_type = typename Str::traits_type;
1181  using value_type = typename Str::value_type;
1182  using size_type = typename Str::size_type;
1183  using const_iterator = typename Str::const_reverse_iterator;
1184 
1185  private:
1186  const_iterator first = {};
1187  size_type sz = {};
1188 
1189  public:
1190  Reversed_String_View() = default;
1191  explicit Reversed_String_View(const Str& s)
1192  : first(s.rbegin()), sz(s.size())
1193  {
1194  }
1195  Reversed_String_View(Str&& s) = delete;
1196  auto& operator[](size_type i) const { return first[i]; }
1197  auto size() const { return sz; }
1198  auto empty() const { return sz == 0; }
1199  auto begin() const { return first; }
1200  auto end() const { return first + sz; }
1201  auto operator<(Reversed_String_View other) const
1202  {
1203  return lexicographical_compare(begin(), end(), other.begin(),
1204  other.end(), traits_type::lt);
1205  }
1206 };
1207 
1208 template <class CharT>
1210  auto operator()(const std::basic_string<CharT>& x) const
1211  {
1212  return Reversed_String_View<CharT>(x);
1213  }
1214  // auto operator()(T&& x) const = delete;
1215 };
1216 
1217 template <class AffixT>
1219  auto& operator()(const AffixT& a) const { return a.appending; }
1220 };
1221 
1222 template <class T, class Key_Extr = identity>
1224  T, Key_Extr,
1226 
1228  using Prefix_Multiset_Type =
1230  using Key_Type = typename Prefix_Multiset_Type::Key_Type;
1231  using Vector_Type = typename Prefix_Multiset_Type::Vector_Type;
1232  Prefix_Multiset_Type table;
1233  Flag_Set all_cont_flags;
1234 
1235  auto populate()
1236  {
1237  for (auto& x : table.data())
1238  all_cont_flags += x.cont_flags;
1239  }
1240 
1241  public:
1242  Prefix_Table() = default;
1243  explicit Prefix_Table(const Vector_Type& t) : table(t) { populate(); }
1244  explicit Prefix_Table(Vector_Type&& t) : table(std::move(t))
1245  {
1246  populate();
1247  }
1248  auto& operator=(const Vector_Type& t)
1249  {
1250  table = t;
1251  populate();
1252  return *this;
1253  }
1254  auto& operator=(Vector_Type&& t)
1255  {
1256  table = std::move(t);
1257  populate();
1258  return *this;
1259  }
1260  auto begin() const { return table.data().begin(); }
1261  auto end() const { return table.data().end(); }
1262  auto size() const { return table.data().size(); }
1263 
1264  auto has_continuation_flags() const
1265  {
1266  return all_cont_flags.size() != 0;
1267  }
1268  auto has_continuation_flag(char16_t flag) const
1269  {
1270  return all_cont_flags.contains(flag);
1271  }
1272  auto iterate_prefixes_of(const Key_Type& word) const
1273  {
1274  return table.iterate_prefixes_of(word);
1275  }
1276  auto iterate_prefixes_of(Key_Type&& word) const = delete;
1277 };
1278 
1280  using Suffix_Multiset_Type =
1282  using Key_Type = typename Suffix_Multiset_Type::Key_Type;
1283  using Vector_Type = typename Suffix_Multiset_Type::Vector_Type;
1284  Suffix_Multiset_Type table;
1285  Flag_Set all_cont_flags;
1286 
1287  auto populate()
1288  {
1289  for (auto& x : table.data())
1290  all_cont_flags += x.cont_flags;
1291  }
1292 
1293  public:
1294  Suffix_Table() = default;
1295  explicit Suffix_Table(const Vector_Type& t) : table(t) { populate(); }
1296  explicit Suffix_Table(Vector_Type&& t) : table(std::move(t))
1297  {
1298  populate();
1299  }
1300  auto& operator=(const Vector_Type& t)
1301  {
1302  table = t;
1303  populate();
1304  return *this;
1305  }
1306  auto& operator=(Vector_Type&& t)
1307  {
1308  table = std::move(t);
1309  populate();
1310  return *this;
1311  }
1312  auto begin() const { return table.data().begin(); }
1313  auto end() const { return table.data().end(); }
1314  auto size() const { return table.data().size(); }
1315 
1316  auto has_continuation_flags() const
1317  {
1318  return all_cont_flags.size() != 0;
1319  }
1320  auto has_continuation_flag(char16_t flag) const
1321  {
1322  return all_cont_flags.contains(flag);
1323  }
1324  auto iterate_suffixes_of(const Key_Type& word) const
1325  {
1326  return table.iterate_prefixes_of(word);
1327  }
1328  auto iterate_suffixes_of(Key_Type&& word) const = delete;
1329 };
1330 
1336  using Str = std::string;
1337  using Str_View = std::string_view;
1338  size_t i = 0;
1339  Str s;
1340 
1341  public:
1342  String_Pair() = default;
1343  template <class Str1>
1344  explicit String_Pair(Str1&& str, size_t i)
1345  : i(i), s(std::forward<Str1>(str))
1346  {
1347  if (i > s.size())
1348  throw std::out_of_range("word split is too long");
1349  }
1350 
1351  template <class Str1, class Str2,
1352  class = std::enable_if_t<
1353  std::is_same<std::remove_reference_t<Str1>, Str>::value &&
1354  std::is_same<std::remove_reference_t<Str2>, Str>::value>>
1355  String_Pair(Str1&& first, Str2&& second)
1356  : i(first.size()) /* must be init before s, before we move
1357  from first */
1358  ,
1359  s(std::forward<Str1>(first) + std::forward<Str2>(second))
1360 
1361  {
1362  }
1363  auto first() const { return Str_View(s).substr(0, i); }
1364  auto second() const { return Str_View(s).substr(i); }
1365  auto first(Str_View x)
1366  {
1367  s.replace(0, i, x);
1368  i = x.size();
1369  }
1370  auto second(Str_View x) { s.replace(i, s.npos, x); }
1371  auto& str() const { return s; }
1372  auto idx() const { return i; }
1373 };
1375  String_Pair begin_end_chars;
1376  std::string replacement;
1377  char16_t first_word_flag = 0;
1378  char16_t second_word_flag = 0;
1379  bool match_first_only_unaffixed_or_zero_affixed = false;
1380 };
1381 
1383  std::vector<std::u16string> rules;
1384  Flag_Set all_flags;
1385 
1386  auto fill_all_flags() -> void;
1387 
1388  public:
1389  Compound_Rule_Table() = default;
1390  explicit Compound_Rule_Table(const std::vector<std::u16string>& tbl)
1391  : rules(tbl)
1392  {
1393  fill_all_flags();
1394  }
1395  explicit Compound_Rule_Table(std::vector<std::u16string>&& tbl)
1396  : rules(move(tbl))
1397  {
1398  fill_all_flags();
1399  }
1400  auto& operator=(const std::vector<std::u16string>& tbl)
1401  {
1402  rules = tbl;
1403  fill_all_flags();
1404  return *this;
1405  }
1406  auto& operator=(std::vector<std::u16string>&& tbl)
1407  {
1408  rules = move(tbl);
1409  fill_all_flags();
1410  return *this;
1411  }
1412  auto empty() const { return rules.empty(); }
1413  auto has_any_of_flags(const Flag_Set& f) const -> bool;
1414  auto match_any_rule(const std::vector<const Flag_Set*>& data) const
1415  -> bool;
1416 };
1417 auto inline Compound_Rule_Table::fill_all_flags() -> void
1418 {
1419  for (auto& f : rules) {
1420  all_flags += f;
1421  }
1422  all_flags.erase(u'?');
1423  all_flags.erase(u'*');
1424 }
1425 
1426 auto inline Compound_Rule_Table::has_any_of_flags(const Flag_Set& f) const
1427  -> bool
1428 {
1429  using std::begin;
1430  using std::end;
1431  struct Out_Iter_One_Bool {
1432  bool* value = nullptr;
1433  auto& operator++() { return *this; }
1434  auto& operator++(int) { return *this; }
1435  auto& operator*() { return *this; }
1436  auto& operator=(char16_t)
1437  {
1438  *value = true;
1439  return *this;
1440  }
1441  };
1442  auto has_intersection = false;
1443  auto out_it = Out_Iter_One_Bool{&has_intersection};
1444  std::set_intersection(begin(all_flags), end(all_flags), begin(f),
1445  end(f), out_it);
1446  return has_intersection;
1447 }
1448 
1449 template <class DataIter, class PatternIter, class FuncEq = std::equal_to<>>
1450 auto match_simple_regex(DataIter data_first, DataIter data_last,
1451  PatternIter pat_first, PatternIter pat_last,
1452  FuncEq eq = FuncEq())
1453 {
1454  auto s = std::stack<std::pair<DataIter, PatternIter>>();
1455  s.emplace(data_first, pat_first);
1456  auto data_it = DataIter();
1457  auto pat_it = PatternIter();
1458  while (!s.empty()) {
1459  std::tie(data_it, pat_it) = s.top();
1460  s.pop();
1461  if (pat_it == pat_last) {
1462  if (data_it == data_last)
1463  return true;
1464  else
1465  return false;
1466  }
1467  auto node_type = *pat_it;
1468  if (pat_it + 1 == pat_last)
1469  node_type = 0;
1470  else
1471  node_type = *(pat_it + 1);
1472  switch (node_type) {
1473  case '?':
1474  s.emplace(data_it, pat_it + 2);
1475  if (data_it != data_last && eq(*data_it, *pat_it))
1476  s.emplace(data_it + 1, pat_it + 2);
1477  break;
1478  case '*':
1479  s.emplace(data_it, pat_it + 2);
1480  if (data_it != data_last && eq(*data_it, *pat_it))
1481  s.emplace(data_it + 1, pat_it);
1482 
1483  break;
1484  default:
1485  if (data_it != data_last && eq(*data_it, *pat_it))
1486  s.emplace(data_it + 1, pat_it + 1);
1487  break;
1488  }
1489  }
1490  return false;
1491 }
1492 
1493 template <class DataRange, class PatternRange, class FuncEq = std::equal_to<>>
1494 auto match_simple_regex(const DataRange& data, const PatternRange& pattern,
1495  FuncEq eq = FuncEq())
1496 {
1497  using namespace std;
1498  return match_simple_regex(begin(data), end(data), begin(pattern),
1499  end(pattern), eq);
1500 }
1501 
1502 auto inline match_compund_rule(const std::vector<const Flag_Set*>& words_data,
1503  const std::u16string& pattern)
1504 {
1505  return match_simple_regex(
1506  words_data, pattern,
1507  [](const Flag_Set* d, char16_t p) { return d->contains(p); });
1508 }
1509 
1510 auto inline Compound_Rule_Table::match_any_rule(
1511  const std::vector<const Flag_Set*>& data) const -> bool
1512 {
1513  return any_of(begin(rules), end(rules), [&](const std::u16string& p) {
1514  return match_compund_rule(data, p);
1515  });
1516 }
1517 
1518 using List_Strings = std::vector<std::string>;
1519 
1521  public:
1522  using Str = std::string;
1523  using Table_Str = std::vector<std::pair<Str, Str>>;
1524  using iterator = typename Table_Str::iterator;
1525  using const_iterator = typename Table_Str::const_iterator;
1526 
1527  private:
1528  Table_Str table;
1529  size_t whole_word_reps_last_idx = 0;
1530  size_t start_word_reps_last_idx = 0;
1531  size_t end_word_reps_last_idx = 0;
1532 
1533  auto order_entries() -> void;
1534 
1535  public:
1536  Replacement_Table() = default;
1537  explicit Replacement_Table(const Table_Str& v) : table(v)
1538  {
1539  order_entries();
1540  }
1541  explicit Replacement_Table(Table_Str&& v) : table(move(v))
1542  {
1543  order_entries();
1544  }
1545 
1546  auto& operator=(const Table_Str& v)
1547  {
1548  table = v;
1549  order_entries();
1550  return *this;
1551  }
1552 
1553  auto& operator=(Table_Str&& v)
1554  {
1555  table = move(v);
1556  order_entries();
1557  return *this;
1558  }
1559 
1560  auto whole_word_replacements() const -> Subrange<const_iterator>
1561  {
1562  return {begin(table), begin(table) + whole_word_reps_last_idx};
1563  }
1564  auto start_word_replacements() const -> Subrange<const_iterator>
1565  {
1566  return {begin(table) + whole_word_reps_last_idx,
1567  begin(table) + start_word_reps_last_idx};
1568  }
1569  auto end_word_replacements() const -> Subrange<const_iterator>
1570  {
1571  return {begin(table) + start_word_reps_last_idx,
1572  begin(table) + end_word_reps_last_idx};
1573  }
1574  auto any_place_replacements() const -> Subrange<const_iterator>
1575  {
1576  return {begin(table) + end_word_reps_last_idx, end(table)};
1577  }
1578 };
1579 auto inline Replacement_Table::order_entries() -> void
1580 {
1581  auto it = remove_if(begin(table), end(table), [](auto& p) {
1582  auto& s = p.first;
1583  return s.empty() ||
1584  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
1585  });
1586  table.erase(it, end(table));
1587 
1588  auto is_start_word_pat = [=](auto& x) { return x.first[0] == '^'; };
1589  auto is_end_word_pat = [=](auto& x) { return x.first.back() == '$'; };
1590 
1591  auto start_word_reps_last =
1592  partition(begin(table), end(table), is_start_word_pat);
1593  start_word_reps_last_idx = start_word_reps_last - begin(table);
1594  for_each(begin(table), start_word_reps_last,
1595  [](auto& e) { e.first.erase(0, 1); });
1596 
1597  auto whole_word_reps_last =
1598  partition(begin(table), start_word_reps_last, is_end_word_pat);
1599  whole_word_reps_last_idx = whole_word_reps_last - begin(table);
1600  for_each(begin(table), whole_word_reps_last,
1601  [](auto& e) { e.first.pop_back(); });
1602 
1603  auto end_word_reps_last =
1604  partition(start_word_reps_last, end(table), is_end_word_pat);
1605  end_word_reps_last_idx = end_word_reps_last - begin(table);
1606  for_each(start_word_reps_last, end_word_reps_last,
1607  [](auto& e) { e.first.pop_back(); });
1608 }
1609 
1611  using Str = std::string;
1612  Str chars;
1613  std::vector<Str> strings;
1614 
1615  auto parse(const Str& s) -> void;
1616  Similarity_Group() = default;
1617  explicit Similarity_Group(const Str& s) { parse(s); }
1618  auto& operator=(const Str& s)
1619  {
1620  parse(s);
1621  return *this;
1622  }
1623 };
1624 auto inline Similarity_Group::parse(const Str& s) -> void
1625 {
1626  auto i = size_t(0);
1627  for (;;) {
1628  auto j = s.find('(', i);
1629  chars.append(s, i, j - i);
1630  if (j == s.npos)
1631  break;
1632  i = j + 1;
1633  j = s.find(')', i);
1634  if (j == s.npos)
1635  break;
1636  auto len = j - i;
1637  if (len == 1)
1638  chars += s[i];
1639  else if (len > 1)
1640  strings.push_back(s.substr(i, len));
1641  i = j + 1;
1642  }
1643 }
1644 
1646  using Char_Type = char; // not aware of unicode
1647  using Str = std::string;
1648  using Pair_Str = std::pair<Str, Str>;
1649 
1650  struct Phonet_Match_Result {
1651  size_t count_matched = 0;
1652  size_t go_back_before_replace = 0;
1653  size_t priority = 5;
1654  bool go_back_after_replace = false;
1655  bool treat_next_as_begin = false;
1656  operator bool() { return count_matched; }
1657  };
1658 
1659  std::vector<std::pair<Str, Str>> table;
1660  auto order() -> void;
1661  auto static match(const Str& data, size_t i, const Str& pattern,
1662  bool at_begin) -> Phonet_Match_Result;
1663 
1664  public:
1665  Phonetic_Table() = default;
1666  explicit Phonetic_Table(const std::vector<Pair_Str>& v) : table(v)
1667  {
1668  order();
1669  }
1670  explicit Phonetic_Table(std::vector<Pair_Str>&& v) : table(move(v))
1671  {
1672  order();
1673  }
1674  auto& operator=(const std::vector<Pair_Str>& v)
1675  {
1676  table = v;
1677  order();
1678  return *this;
1679  }
1680  auto& operator=(std::vector<Pair_Str>&& v)
1681  {
1682  table = move(v);
1683  order();
1684  return *this;
1685  }
1686  auto replace(Str& word) const -> bool;
1687 };
1688 
1689 auto inline Phonetic_Table::order() -> void
1690 {
1691  stable_sort(begin(table), end(table), [](auto& pair1, auto& pair2) {
1692  if (pair2.first.empty())
1693  return false;
1694  if (pair1.first.empty())
1695  return true;
1696  return pair1.first[0] < pair2.first[0];
1697  });
1698  auto it = find_if_not(begin(table), end(table),
1699  [](auto& p) { return p.first.empty(); });
1700  table.erase(begin(table), it);
1701  for (auto& r : table) {
1702  if (r.second == "_")
1703  r.second.clear();
1704  }
1705 }
1706 
1707 auto inline Phonetic_Table::match(const Str& data, size_t i, const Str& pattern,
1708  bool at_begin) -> Phonet_Match_Result
1709 {
1710  auto ret = Phonet_Match_Result();
1711  auto j = pattern.find_first_of("(<-0123456789^$");
1712  if (j == pattern.npos)
1713  j = pattern.size();
1714  if (data.compare(i, j, pattern, 0, j) == 0)
1715  ret.count_matched = j;
1716  else
1717  return {};
1718  if (j == pattern.size())
1719  return ret;
1720  if (pattern[j] == '(') {
1721  auto k = pattern.find(')', j);
1722  if (k == pattern.npos)
1723  return {}; // bad rule
1724  auto x = std::char_traits<Char_Type>::find(
1725  &pattern[j + 1], k - (j + 1), data[i + j]);
1726  if (!x)
1727  return {};
1728  j = k + 1;
1729  ret.count_matched += 1;
1730  }
1731  if (j == pattern.size())
1732  return ret;
1733  if (pattern[j] == '<') {
1734  ret.go_back_after_replace = true;
1735  ++j;
1736  }
1737  auto k = pattern.find_first_not_of('-', j);
1738  if (k == pattern.npos) {
1739  k = pattern.size();
1740  ret.go_back_before_replace = k - j;
1741  if (ret.go_back_before_replace >= ret.count_matched)
1742  return {}; // bad rule
1743  return ret;
1744  }
1745  else {
1746  ret.go_back_before_replace = k - j;
1747  if (ret.go_back_before_replace >= ret.count_matched)
1748  return {}; // bad rule
1749  }
1750  j = k;
1751  if (pattern[j] >= '0' && pattern[j] <= '9') {
1752  ret.priority = pattern[j] - '0';
1753  ++j;
1754  }
1755  if (j == pattern.size())
1756  return ret;
1757  if (pattern[j] == '^') {
1758  if (!at_begin)
1759  return {};
1760  ++j;
1761  }
1762  if (j == pattern.size())
1763  return ret;
1764  if (pattern[j] == '^') {
1765  ret.treat_next_as_begin = true;
1766  ++j;
1767  }
1768  if (j == pattern.size())
1769  return ret;
1770  if (pattern[j] != '$')
1771  return {}; // bad rule, no other char is allowed at this point
1772  if (i + ret.count_matched == data.size())
1773  return ret;
1774  return {};
1775 }
1776 
1777 auto inline Phonetic_Table::replace(Str& word) const -> bool
1778 {
1779  struct Cmp {
1780  auto operator()(Char_Type c, const Pair_Str& s)
1781  {
1782  return c < s.first[0];
1783  }
1784  auto operator()(const Pair_Str& s, Char_Type c)
1785  {
1786  return s.first[0] < c;
1787  }
1788  };
1789  if (table.empty())
1790  return false;
1791  auto ret = false;
1792  auto treat_next_as_begin = true;
1793  size_t count_go_backs_after_replace = 0; // avoid infinite loop
1794  for (size_t i = 0; i != word.size(); ++i) {
1795  auto rules =
1796  equal_range(begin(table), end(table), word[i], Cmp());
1797  for (auto& r : Subrange(rules)) {
1798  auto rule = &r;
1799  auto m1 = match(word, i, r.first, treat_next_as_begin);
1800  if (!m1)
1801  continue;
1802  if (!m1.go_back_before_replace) {
1803  auto j = i + m1.count_matched - 1;
1804  auto rules2 = equal_range(
1805  begin(table), end(table), word[j], Cmp());
1806  for (auto& r2 : Subrange(rules2)) {
1807  auto m2 =
1808  match(word, j, r2.first, false);
1809  if (m2 && m2.priority >= m1.priority) {
1810  i = j;
1811  rule = &r2;
1812  m1 = m2;
1813  break;
1814  }
1815  }
1816  }
1817  word.replace(
1818  i, m1.count_matched - m1.go_back_before_replace,
1819  rule->second);
1820  treat_next_as_begin = m1.treat_next_as_begin;
1821  if (m1.go_back_after_replace &&
1822  count_go_backs_after_replace < 100) {
1823  count_go_backs_after_replace++;
1824  }
1825  else {
1826  i += rule->second.size();
1827  }
1828  --i;
1829  ret = true;
1830  break;
1831  }
1832  }
1833  return ret;
1834 }
1835 NUSPELL_END_INLINE_NAMESPACE
1836 } // namespace nuspell
1837 #endif // NUSPELL_STRUCTURES_HXX
Definition: structures.hxx:440
Definition: structures.hxx:1382
Definition: structures.hxx:655
Definition: structures.hxx:525
Definition: structures.hxx:1645
Definition: structures.hxx:1006
Definition: structures.hxx:878
Definition: structures.hxx:1227
Definition: structures.hxx:1520
Definition: structures.hxx:1177
Definition: structures.hxx:1335
Definition: structures.hxx:71
Definition: structures.hxx:40
Definition: structures.hxx:315
Definition: structures.hxx:1279
Library main namespace.
Definition: aff_data.cxx:33
Definition: structures.hxx:1374
Definition: structures.hxx:647
Definition: structures.hxx:1218
Definition: structures.hxx:803
Definition: structures.hxx:1610
Definition: structures.hxx:1209
Definition: structures.hxx:839
Definition: structures.hxx:516