Nuspell
spellchecker
structures.hxx
1 /* Copyright 2018-2020 Dimitrij Mijoski
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #ifndef NUSPELL_STRUCTURES_HXX
20 #define NUSPELL_STRUCTURES_HXX
21 
22 #include <algorithm>
23 #include <cmath>
24 #include <functional>
25 #include <iterator>
26 #include <stack>
27 #include <stdexcept>
28 #include <string>
29 #include <string_view>
30 #include <utility>
31 #include <vector>
32 
33 namespace nuspell {
34 inline namespace v4 {
35 #define NUSPELL_LITERAL(T, x) ::nuspell::literal_choose<T>(x, L##x)
36 
37 template <class CharT>
38 auto constexpr literal_choose(const char* narrow, const wchar_t* wide);
39 template <>
40 auto constexpr literal_choose<char>(const char* narrow, const wchar_t*)
41 {
42  return narrow;
43 }
44 template <>
45 auto constexpr literal_choose<wchar_t>(const char*, const wchar_t* wide)
46 {
47  return wide;
48 }
49 
50 template <class It>
51 class Subrange {
52  using Iter_Category =
53  typename std::iterator_traits<It>::iterator_category;
54  It a = {};
55  It b = {};
56 
57  public:
58  Subrange() = default;
59  Subrange(std::pair<It, It> p) : a(p.first), b(p.second) {}
60  Subrange(It first, It last) : a(first), b(last) {}
61  template <class Range>
62  Subrange(Range& r) : a(r.begin()), b(r.end())
63  {
64  }
65  auto begin() const -> It { return a; }
66  auto end() const -> It { return b; }
67 };
68 // CTAD
69 template <class Range>
71 template <class Range>
73 
81 template <class CharT>
82 class String_Set {
83  private:
84  std::basic_string<CharT> d;
85  auto sort_uniq() -> void
86  {
87  auto first = begin();
88  auto last = end();
89  using t = traits_type;
90  sort(first, last, t::lt);
91  d.erase(unique(first, last, t::eq), last);
92  }
93  struct Char_Traits_Less_Than {
94  auto operator()(CharT a, CharT b) const noexcept
95  {
96  return traits_type::lt(a, b);
97  }
98  };
99 
100  public:
101  using Str = std::basic_string<CharT>;
102  using traits_type = typename Str::traits_type;
103 
104  using key_type = typename Str::value_type;
105  using key_compare = Char_Traits_Less_Than;
106  using value_type = typename Str::value_type;
107  using value_compare = key_compare;
108  using allocator_type = typename Str::allocator_type;
109  using pointer = typename Str::pointer;
110  using const_pointer = typename Str::const_pointer;
111  using reference = typename Str::reference;
112  using const_reference = typename Str::const_reference;
113  using size_type = typename Str::size_type;
114  using difference_type = typename Str::difference_type;
115  using iterator = typename Str::iterator;
116  using const_iterator = typename Str::const_iterator;
117  using reverse_iterator = typename Str::reverse_iterator;
118  using const_reverse_iterator = typename Str::const_reverse_iterator;
119 
120  String_Set() = default;
121  explicit String_Set(const Str& s) : d(s) { sort_uniq(); }
122  explicit String_Set(Str&& s) : d(move(s)) { sort_uniq(); }
123  explicit String_Set(const CharT* s) : d(s) { sort_uniq(); }
124  template <class InputIterator>
125  String_Set(InputIterator first, InputIterator last) : d(first, last)
126  {
127  sort_uniq();
128  }
129  String_Set(std::initializer_list<value_type> il) : d(il)
130  {
131  sort_uniq();
132  }
133 
134  auto& operator=(const Str& s)
135  {
136  d = s;
137  sort_uniq();
138  return *this;
139  }
140  auto& operator=(Str&& s)
141  {
142  d = move(s);
143  sort_uniq();
144  return *this;
145  }
146  auto& operator=(std::initializer_list<value_type> il)
147  {
148  d = il;
149  sort_uniq();
150  return *this;
151  }
152  auto& operator=(const CharT* s)
153  {
154  d = s;
155  sort_uniq();
156  return *this;
157  }
158 
159  // non standard underlying storage access:
160  auto& data() const noexcept { return d; }
161  operator const Str&() const noexcept { return d; }
162 
163  // iterators:
164  iterator begin() noexcept { return d.begin(); }
165  const_iterator begin() const noexcept { return d.begin(); }
166  iterator end() noexcept { return d.end(); }
167  const_iterator end() const noexcept { return d.end(); }
168 
169  reverse_iterator rbegin() noexcept { return d.rbegin(); }
170  const_reverse_iterator rbegin() const noexcept { return d.rbegin(); }
171  reverse_iterator rend() noexcept { return d.rend(); }
172  const_reverse_iterator rend() const noexcept { return d.rend(); }
173 
174  const_iterator cbegin() const noexcept { return d.cbegin(); }
175  const_iterator cend() const noexcept { return d.cend(); }
176  const_reverse_iterator crbegin() const noexcept { return d.crbegin(); }
177  const_reverse_iterator crend() const noexcept { return d.crend(); }
178 
179  // capacity:
180  bool empty() const noexcept { return d.empty(); }
181  size_type size() const noexcept { return d.size(); }
182  size_type max_size() const noexcept { return d.max_size(); }
183 
184  // modifiers:
185  std::pair<iterator, bool> insert(value_type x)
186  {
187  auto it = lower_bound(x);
188  if (it != end() && *it == x) {
189  return {it, false};
190  }
191  auto ret = d.insert(it, x);
192  return {ret, true};
193  }
194  iterator insert(iterator hint, value_type x)
195  {
196  if (hint == end() || traits_type::lt(x, *hint)) {
197  if (hint == begin() ||
198  traits_type::lt(*(hint - 1), x)) {
199  return d.insert(hint, x);
200  }
201  }
202  return insert(x).first;
203  }
204 
205  // iterator insert(const_iterator hint, value_type&& x);
206  template <class InputIterator>
207  void insert(InputIterator first, InputIterator last)
208  {
209  d.insert(end(), first, last);
210  sort_uniq();
211  }
212  void insert(std::initializer_list<value_type> il)
213  {
214  d.insert(end(), il);
215  sort_uniq();
216  }
217  template <class... Args>
218  std::pair<iterator, bool> emplace(Args&&... args)
219  {
220  return insert(CharT(args...));
221  }
222 
223  template <class... Args>
224  iterator emplace_hint(iterator hint, Args&&... args)
225  {
226  return insert(hint, CharT(args...));
227  }
228 
229  iterator erase(iterator position) { return d.erase(position); }
230  size_type erase(key_type x)
231  {
232  auto i = d.find(x);
233  if (i != d.npos) {
234  d.erase(i, 1);
235  return true;
236  }
237  return false;
238  }
239  iterator erase(iterator first, iterator last)
240  {
241  return d.erase(first, last);
242  }
243  void swap(String_Set& s) { d.swap(s.d); }
244  void clear() noexcept { d.clear(); }
245 
246  // non standrd modifiers:
247  auto insert(const Str& s) -> void
248  {
249  d += s;
250  sort_uniq();
251  }
252  auto& operator+=(const Str& s)
253  {
254  insert(s);
255  return *this;
256  }
257 
258  // observers:
259  key_compare key_comp() const { return Char_Traits_Less_Than(); }
260  value_compare value_comp() const { return key_comp(); }
261 
262  // set operations:
263  private:
264  auto lookup(key_type x) const
265  {
266  auto i = d.find(x);
267  if (i == d.npos)
268  i = d.size();
269  return i;
270  }
271 
272  public:
273  iterator find(key_type x) { return begin() + lookup(x); }
274  const_iterator find(key_type x) const { return begin() + lookup(x); }
275  size_type count(key_type x) const { return d.find(x) != d.npos; }
276 
277  iterator lower_bound(key_type x)
278  {
279  return std::lower_bound(begin(), end(), x, traits_type::lt);
280  }
281 
282  const_iterator lower_bound(key_type x) const
283  {
284  return std::lower_bound(begin(), end(), x, traits_type::lt);
285  }
286  iterator upper_bound(key_type x)
287  {
288  return std::upper_bound(begin(), end(), x, traits_type::lt);
289  }
290 
291  const_iterator upper_bound(key_type x) const
292  {
293  return std::upper_bound(begin(), end(), x, traits_type::lt);
294  }
295  std::pair<iterator, iterator> equal_range(key_type x)
296  {
297  return std::equal_range(begin(), end(), x, traits_type::lt);
298  }
299 
300  std::pair<const_iterator, const_iterator> equal_range(key_type x) const
301  {
302  return std::equal_range(begin(), end(), x, traits_type::lt);
303  }
304 
305  // non standard set operations:
306  bool contains(key_type x) const { return count(x); }
307 
308  // compare
309  bool operator<(const String_Set& rhs) const { return d < rhs.d; }
310  bool operator<=(const String_Set& rhs) const { return d <= rhs.d; }
311  bool operator==(const String_Set& rhs) const { return d == rhs.d; }
312  bool operator!=(const String_Set& rhs) const { return d != rhs.d; }
313  bool operator>=(const String_Set& rhs) const { return d >= rhs.d; }
314  bool operator>(const String_Set& rhs) const { return d > rhs.d; }
315 };
316 
317 template <class CharT>
318 auto swap(String_Set<CharT>& a, String_Set<CharT>& b)
319 {
320  a.swap(b);
321 }
322 
324 
325 template <class CharT>
327  public:
328  using Str = std::basic_string<CharT>;
329  using Str_View = std::basic_string_view<CharT>;
330  using Pair_Str = std::pair<Str, Str>;
331  using Table_Pairs = std::vector<Pair_Str>;
332 
333  private:
334  Table_Pairs table;
335  auto sort_uniq() -> void;
336  auto find_match(Str_View s) const;
337 
338  public:
339  Substr_Replacer() = default;
340  explicit Substr_Replacer(const Table_Pairs& v) : table(v)
341  {
342  sort_uniq();
343  }
344  explicit Substr_Replacer(Table_Pairs&& v) : table(move(v))
345  {
346  sort_uniq();
347  }
348 
349  auto& operator=(const Table_Pairs& v)
350  {
351  table = v;
352  sort_uniq();
353  return *this;
354  }
355  auto& operator=(Table_Pairs&& v)
356  {
357  table = move(v);
358  sort_uniq();
359  return *this;
360  }
361 
362  auto replace(Str& s) const -> Str&;
363  auto replace_copy(Str s) const -> Str
364  {
365  replace(s);
366  return s;
367  }
368 };
369 template <class CharT>
371 {
372  auto first = begin(table);
373  auto last = end(table);
374  sort(first, last, [](auto& a, auto& b) { return a.first < b.first; });
375  auto it = unique(first, last,
376  [](auto& a, auto& b) { return a.first == b.first; });
377  table.erase(it, last);
378 
379  // remove empty key ""
380  if (!table.empty() && table.front().first.empty())
381  table.erase(begin(table));
382 }
383 
384 template <class CharT>
385 auto Substr_Replacer<CharT>::find_match(Str_View s) const
386 {
387  auto& t = table;
388  struct Comparer_Str_Rep {
389  auto static cmp_prefix_of(const Str& p, Str_View of)
390  {
391  return p.compare(0, p.npos, of.data(),
392  std::min(p.size(), of.size()));
393  }
394  auto operator()(const Pair_Str& a, Str_View b)
395  {
396  return cmp_prefix_of(a.first, b) < 0;
397  }
398  auto operator()(Str_View a, const Pair_Str& b)
399  {
400  return cmp_prefix_of(b.first, a) > 0;
401  }
402  auto static eq(const Pair_Str& a, Str_View b)
403  {
404  return cmp_prefix_of(a.first, b) == 0;
405  }
406  };
407  Comparer_Str_Rep csr;
408  auto it = begin(t);
409  auto last_match = end(t);
410  for (;;) {
411  auto it2 = upper_bound(it, end(t), s, csr);
412  if (it2 == it) {
413  // not found, s is smaller that the range
414  break;
415  }
416  --it2;
417  if (csr.eq(*it2, s)) {
418  // Match found. Try another search maybe for
419  // longer.
420  last_match = it2;
421  it = ++it2;
422  }
423  else {
424  // not found, s is greater that the range
425  break;
426  }
427  }
428  return last_match;
429 }
430 
431 template <class CharT>
432 auto Substr_Replacer<CharT>::replace(Str& s) const -> Str&
433 {
434 
435  if (table.empty())
436  return s;
437  for (size_t i = 0; i < s.size(); /*no increment here*/) {
438  auto substr = Str_View(&s[i], s.size() - i);
439  auto it = find_match(substr);
440  if (it != end(table)) {
441  auto& match = *it;
442  // match found. match.first is the found string,
443  // match.second is the replacement.
444  s.replace(i, match.first.size(), match.second);
445  i += match.second.size();
446  continue;
447  }
448  ++i;
449  }
450  return s;
451 }
452 
453 template <class CharT>
454 class Break_Table {
455  public:
456  using Str = std::basic_string<CharT>;
457  using Table_Str = std::vector<Str>;
458  using iterator = typename Table_Str::iterator;
459  using const_iterator = typename Table_Str::const_iterator;
460 
461  private:
462  Table_Str table;
463  size_t start_word_breaks_last_idx = 0;
464  size_t end_word_breaks_last_idx = 0;
465 
466  auto order_entries() -> void;
467 
468  public:
469  Break_Table() = default;
470  explicit Break_Table(const Table_Str& v) : table(v) { order_entries(); }
471  explicit Break_Table(Table_Str&& v) : table(move(v))
472  {
473  order_entries();
474  }
475 
476  auto& operator=(const Table_Str& v)
477  {
478  table = v;
479  order_entries();
480  return *this;
481  }
482 
483  auto& operator=(Table_Str&& v)
484  {
485  table = move(v);
486  order_entries();
487  return *this;
488  }
489 
490  auto start_word_breaks() const -> Subrange<const_iterator>
491  {
492  return {begin(table),
493  begin(table) + start_word_breaks_last_idx};
494  }
495  auto end_word_breaks() const -> Subrange<const_iterator>
496  {
497  return {begin(table) + start_word_breaks_last_idx,
498  begin(table) + end_word_breaks_last_idx};
499  }
500  auto middle_word_breaks() const -> Subrange<const_iterator>
501  {
502  return {begin(table) + end_word_breaks_last_idx, end(table)};
503  }
504 };
505 template <class CharT>
507 {
508  auto it = remove_if(begin(table), end(table), [](auto& s) {
509  return s.empty() ||
510  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
511  });
512  table.erase(it, end(table));
513 
514  auto is_start_word_break = [=](auto& x) { return x[0] == '^'; };
515  auto is_end_word_break = [=](auto& x) { return x.back() == '$'; };
516  auto start_word_breaks_last =
517  partition(begin(table), end(table), is_start_word_break);
518  start_word_breaks_last_idx = start_word_breaks_last - begin(table);
519 
520  for_each(begin(table), start_word_breaks_last,
521  [](auto& e) { e.erase(0, 1); });
522 
523  auto end_word_breaks_last =
524  partition(start_word_breaks_last, end(table), is_end_word_break);
525  end_word_breaks_last_idx = end_word_breaks_last - begin(table);
526 
527  for_each(start_word_breaks_last, end_word_breaks_last,
528  [](auto& e) { e.pop_back(); });
529 }
530 
531 struct identity {
532  template <class T>
533  constexpr auto operator()(T&& t) const noexcept -> T&&
534  {
535  return std::forward<T>(t);
536  }
537 };
538 
539 template <class Value, class Key = Value, class KeyExtract = identity>
541  private:
542  using bucket_type = std::vector<Value>;
543  static constexpr float max_load_fact = 7.0 / 8.0;
544  std::vector<bucket_type> data;
545  size_t sz = 0;
546  size_t max_load_factor_capacity = 0;
547 
548  public:
549  using key_type = Key;
550  using value_type = Value;
551  using size_type = std::size_t;
552  using difference_type = std::ptrdiff_t;
553  using hasher = std::hash<Key>;
554  using reference = value_type&;
555  using const_reference = const value_type&;
556  using pointer = typename bucket_type::pointer;
557  using const_pointer = typename bucket_type::const_pointer;
558  using local_iterator = typename bucket_type::iterator;
559  using local_const_iterator = typename bucket_type::const_iterator;
560 
561  Hash_Multiset() = default;
562 
563  auto size() const { return sz; }
564  auto empty() const { return size() == 0; }
565 
566  auto rehash(size_t count)
567  {
568  if (empty()) {
569  size_t capacity = 16;
570  while (capacity <= count)
571  capacity <<= 1;
572  data.resize(capacity);
573  max_load_factor_capacity =
574  std::ceil(capacity * max_load_fact);
575  return;
576  }
577  if (count < size() / max_load_fact)
578  count = size() / max_load_fact;
579  auto n = Hash_Multiset();
580  n.rehash(count);
581  for (auto& b : data) {
582  for (auto& x : b) {
583  n.insert(std::move(x));
584  }
585  }
586  data.swap(n.data);
587  sz = n.sz;
588  max_load_factor_capacity = n.max_load_factor_capacity;
589  }
590 
591  auto reserve(size_t count) -> void
592  {
593  rehash(std::ceil(count / max_load_fact));
594  }
595 
596  auto insert(value_type&& value)
597  {
598  using namespace std;
599  auto hash = hasher();
600  auto key_extract = KeyExtract();
601  if (sz == max_load_factor_capacity) {
602  reserve(sz + 1);
603  }
604  auto&& key = key_extract(value);
605  auto h = hash(key);
606  auto h_mod = h & (data.size() - 1);
607  auto& bucket = data[h_mod];
608  if (bucket.size() == 0 || bucket.size() == 1 ||
609  key == key_extract(bucket.back())) {
610  bucket.push_back(move(value));
611  ++sz;
612  return end(bucket) - 1;
613  }
614  auto last =
615  std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
616  return key == key_extract(x);
617  });
618  if (last != rend(bucket)) {
619  auto ret = bucket.insert(last.base(), move(value));
620  ++sz;
621  return ret;
622  }
623 
624  bucket.push_back(move(value));
625  ++sz;
626  return end(bucket) - 1;
627  }
628  template <class... Args>
629  auto emplace(Args&&... a)
630  {
631  return insert(value_type(std::forward<Args>(a)...));
632  }
633 
634  auto equal_range(const key_type& key) const
635  -> std::pair<local_const_iterator, local_const_iterator>
636  {
637  using namespace std;
638  auto hash = hasher();
639  auto key_extract = KeyExtract();
640  if (data.empty())
641  return {};
642  auto h = hash(key);
643  auto h_mod = h & (data.size() - 1);
644  auto& bucket = data[h_mod];
645  if (bucket.empty())
646  return {begin(bucket), begin(bucket)}; // ret empty
647  // return {} here ^^^^^^^ is OK bug GCC debug iterators have
648  // this bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70303
649  if (bucket.size() == 1) {
650  if (key == key_extract(bucket.front()))
651  return {begin(bucket), end(bucket)};
652  return {begin(bucket), begin(bucket)}; // ret empty
653  }
654  auto first =
655  std::find_if(begin(bucket), end(bucket), [&](auto& x) {
656  return key == key_extract(x);
657  });
658  if (first == end(bucket))
659  return {begin(bucket), begin(bucket)}; // ret empty
660  auto next = first + 1;
661  if (next == end(bucket) || key != key_extract(*next))
662  return {first, next};
663  auto last =
664  std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
665  return key == key_extract(x);
666  });
667  return {first, last.base()};
668  }
669 
670  auto bucket_count() const -> size_type { return data.size(); }
671  auto bucket_data(size_type i) const { return Subrange(data[i]); }
672 };
673 
674 struct Condition_Exception : public std::runtime_error {
675  using std::runtime_error::runtime_error;
676 };
677 
682 template <class CharT>
683 class Condition {
684  using Str = std::basic_string<CharT>;
685  enum Span_Type { NORMAL, DOT, ANY_OF, NONE_OF };
686  struct Span {
687  size_t pos = {};
688  size_t len = {};
689  Span_Type type = {};
690  Span() = default;
691  Span(size_t pos, size_t len, Span_Type type)
692  : pos(pos), len(len), type(type)
693  {
694  }
695  };
696 
697  Str cond;
698  std::vector<Span> spans; // pos, len, type
699  size_t length = 0;
700 
701  auto construct() -> void;
702 
703  public:
704  Condition() = default;
705  explicit Condition(const Str& condition) : cond(condition)
706  {
707  construct();
708  }
709  explicit Condition(Str&& condition) : cond(move(condition))
710  {
711  construct();
712  }
713  explicit Condition(const CharT* condition) : cond(condition)
714  {
715  construct();
716  }
717  auto& operator=(const Str& condition)
718  {
719  cond = condition;
720  length = 0;
721  construct();
722  return *this;
723  }
724  auto& operator=(Str&& condition)
725  {
726  cond = std::move(condition);
727  length = 0;
728  construct();
729  return *this;
730  }
731  auto& operator=(const CharT* condition)
732  {
733  cond = condition;
734  length = 0;
735  construct();
736  return *this;
737  }
738  auto match(const Str& s, size_t pos = 0, size_t len = Str::npos) const
739  -> bool;
740  auto match_prefix(const Str& s) const { return match(s, 0, length); }
741  auto match_suffix(const Str& s) const
742  {
743  if (length > s.size())
744  return false;
745  return match(s, s.size() - length, length);
746  }
747 };
748 template <class CharT>
749 auto Condition<CharT>::construct() -> void
750 {
751  size_t i = 0;
752  for (; i != cond.size();) {
753  size_t j = cond.find_first_of(NUSPELL_LITERAL(CharT, "[]."), i);
754  if (i != j) {
755  if (j == cond.npos) {
756  spans.emplace_back(i, cond.size() - i, NORMAL);
757  length += cond.size() - i;
758  break;
759  }
760  spans.emplace_back(i, j - i, NORMAL);
761  length += j - i;
762  i = j;
763  }
764  if (cond[i] == '.') {
765  spans.emplace_back(i, 1, DOT);
766  ++length;
767  ++i;
768  continue;
769  }
770  if (cond[i] == ']') {
771  auto what =
772  "closing bracket has no matching opening bracket";
773  throw Condition_Exception(what);
774  }
775  if (cond[i] == '[') {
776  ++i;
777  if (i == cond.size()) {
778  auto what = "opening bracket has no matching "
779  "closing bracket";
780  throw Condition_Exception(what);
781  }
782  Span_Type type;
783  if (cond[i] == '^') {
784  type = NONE_OF;
785  ++i;
786  }
787  else {
788  type = ANY_OF;
789  }
790  j = cond.find(']', i);
791  if (j == i) {
792  auto what = "empty bracket expression";
793  throw Condition_Exception(what);
794  }
795  if (j == cond.npos) {
796  auto what = "opening bracket has no matching "
797  "closing bracket";
798  throw Condition_Exception(what);
799  }
800  spans.emplace_back(i, j - i, type);
801  ++length;
802  i = j + 1;
803  }
804  }
805 }
806 
807 template <class CharT>
808 auto Condition<CharT>::match(const Str& s, size_t pos, size_t len) const -> bool
809 {
810  if (pos > s.size()) {
811  throw std::out_of_range(
812  "position on the string is out of bounds");
813  }
814  if (s.size() - pos < len)
815  len = s.size() - pos;
816  if (len != length)
817  return false;
818 
819  size_t i = pos;
820  for (auto& x : spans) {
821  using tr = typename Str::traits_type;
822  switch (x.type) {
823  case NORMAL:
824  if (tr::compare(&s[i], &cond[x.pos], x.len) == 0)
825  i += x.len;
826  else
827  return false;
828  break;
829  case DOT:
830  ++i;
831  break;
832  case ANY_OF:
833  if (tr::find(&cond[x.pos], x.len, s[i]))
834  ++i;
835  else
836  return false;
837  break;
838  case NONE_OF:
839  if (tr::find(&cond[x.pos], x.len, s[i]))
840  return false;
841  else
842  ++i;
843  break;
844  }
845  }
846  return true;
847 }
848 
849 template <class CharT>
850 struct Prefix {
851  using Str = std::basic_string<CharT>;
852  using Cond = Condition<CharT>;
853  using value_type = CharT;
854 
855  char16_t flag = 0;
856  bool cross_product = false;
857  Str stripping;
858  Str appending;
859  Flag_Set cont_flags;
860  Cond condition;
861 
862  auto to_root(Str& word) const -> Str&
863  {
864  return word.replace(0, appending.size(), stripping);
865  }
866  auto to_root_copy(Str word) const -> Str
867  {
868  to_root(word);
869  return word;
870  }
871 
872  auto to_derived(Str& word) const -> Str&
873  {
874  return word.replace(0, stripping.size(), appending);
875  }
876  auto to_derived_copy(Str word) const -> Str
877  {
878  to_derived(word);
879  return word;
880  }
881 
882  auto check_condition(const Str& word) const -> bool
883  {
884  return condition.match_prefix(word);
885  }
886 };
887 
888 template <class CharT>
889 struct Suffix {
890  using Str = std::basic_string<CharT>;
891  using Cond = Condition<CharT>;
892  using value_type = CharT;
893 
894  char16_t flag = 0;
895  bool cross_product = false;
896  Str stripping;
897  Str appending;
898  Flag_Set cont_flags;
899  Cond condition;
900 
901  auto to_root(Str& word) const -> Str&
902  {
903  return word.replace(word.size() - appending.size(),
904  appending.size(), stripping);
905  }
906  auto to_root_copy(Str word) const -> Str
907  {
908  to_root(word);
909  return word;
910  }
911 
912  auto to_derived(Str& word) const -> Str&
913  {
914  return word.replace(word.size() - stripping.size(),
915  stripping.size(), appending);
916  }
917  auto to_derived_copy(Str word) const -> Str
918  {
919  to_derived(word);
920  return word;
921  }
922 
923  auto check_condition(const Str& word) const -> bool
924  {
925  return condition.match_suffix(word);
926  }
927 };
928 
929 template <class T, class Key_Extr = identity, class Key_Transform = identity>
931  public:
932  using Value_Type = T;
933  using Key_Type = std::remove_reference_t<decltype(
934  std::declval<Key_Extr>()(std::declval<T>()))>;
935  using Char_Type = typename Key_Type::value_type;
936  using Traits = typename Key_Type::traits_type;
937  using Vector_Type = std::vector<T>;
938  using Iterator = typename Vector_Type::const_iterator;
939 
940  private:
941  struct Ebo_Key_Extr : public Key_Extr {
942  };
943  struct Ebo_Key_Transf : public Key_Transform {
944  };
945  struct Ebo : public Ebo_Key_Extr, Ebo_Key_Transf {
946  Vector_Type table;
947  } ebo;
948  std::basic_string<Char_Type> first_letter;
949  std::vector<size_t> prefix_idx_with_first_letter;
950 
951  auto key_extractor() const -> const Ebo_Key_Extr& { return ebo; }
952  auto key_transformator() const -> const Ebo_Key_Transf& { return ebo; }
953  auto& get_table() { return ebo.table; }
954  auto& get_table() const { return ebo.table; }
955 
956  auto sort()
957  {
958  auto& extract_key = key_extractor();
959  auto& transform_key = key_transformator();
960  auto& table = get_table();
961 
962  std::sort(begin(table), end(table), [&](T& a, T& b) {
963  auto&& key_a = transform_key(extract_key(a));
964  auto&& key_b = transform_key(extract_key(b));
965  return key_a < key_b;
966  });
967 
968  first_letter.clear();
969  prefix_idx_with_first_letter.clear();
970  auto first = begin(table);
971  auto last = end(table);
972  auto it = std::find_if_not(first, last, [=](const T& x) {
973  auto&& k = transform_key(extract_key(x));
974  return k.empty();
975  });
976  while (it != last) {
977  auto&& k1 = transform_key(extract_key(*it));
978  first_letter.push_back(k1[0]);
979  prefix_idx_with_first_letter.push_back(it - first);
980 
981  it = std::upper_bound(
982  it, last, k1[0],
983  Comparator{0, extract_key, transform_key});
984  }
985  if (!prefix_idx_with_first_letter.empty())
986  prefix_idx_with_first_letter.push_back(last - first);
987  }
988 
989  struct Comparator {
990  size_t i;
991  Key_Extr extract_key;
992  Key_Transform transform_key;
993  auto operator()(const T& a, Char_Type b) const
994  {
995  auto&& key_a = transform_key(extract_key(a));
996  return Traits::lt(key_a[i], b);
997  }
998  auto operator()(Char_Type a, const T& b) const
999  {
1000  auto&& key_b = transform_key(extract_key(b));
1001  return Traits::lt(a, key_b[i]);
1002  }
1003  };
1004 
1005  public:
1006  Prefix_Multiset() = default;
1007  explicit Prefix_Multiset(Key_Extr ke, Key_Transform kt = {})
1008  : ebo{{ke}, {kt}, {}}
1009  {
1010  }
1011  explicit Prefix_Multiset(const Vector_Type& v, Key_Extr ke = {},
1012  Key_Transform kt = {})
1013  : ebo{{ke}, {kt}, v}
1014  {
1015  sort();
1016  }
1017  explicit Prefix_Multiset(Vector_Type&& v, Key_Extr ke = {},
1018  Key_Transform kt = {})
1019  : ebo{{ke}, {kt}, std::move(v)}
1020  {
1021  sort();
1022  }
1023  Prefix_Multiset(std::initializer_list<T> list, Key_Extr ke = {},
1024  Key_Transform kt = {})
1025  : ebo{{ke}, {kt}, list}
1026  {
1027  sort();
1028  }
1029  auto& operator=(const Vector_Type& v)
1030  {
1031  get_table() = v;
1032  sort();
1033  return *this;
1034  }
1035  auto& operator=(Vector_Type&& v)
1036  {
1037  get_table() = std::move(v);
1038  sort();
1039  return *this;
1040  }
1041  auto& operator=(std::initializer_list<T> list)
1042  {
1043  get_table() = list;
1044  sort();
1045  return *this;
1046  }
1047  auto& data() const { return get_table(); }
1048 
1049  template <class Func>
1050  auto for_each_prefixes_of(const Key_Type& word, Func func) const;
1051 
1052  template <class OutIter>
1053  auto copy_all_prefixes_of(const Key_Type& word, OutIter o) const
1054  {
1055  for_each_prefixes_of(word, [&](const T& x) { *o++ = x; });
1056  return o;
1057  }
1058 
1060  const Prefix_Multiset* set = {};
1061  Iterator it = {};
1062  Iterator last = {};
1063  const Key_Type* search_key = {};
1064  size_t len = {};
1065  bool valid = false;
1066 
1067  auto advance() -> void;
1068 
1069  public:
1070  using iterator_category = std::input_iterator_tag;
1071  using value_type = T;
1072  using size_type = size_t;
1073  using difference_type = ptrdiff_t;
1074  using reference = const T&;
1075  using pointer = const T*;
1076 
1077  Iter_Prefixes_Of() = default;
1078  Iter_Prefixes_Of(const Prefix_Multiset& set,
1079  const Key_Type& word)
1080  : set(&set), it(set.get_table().begin()),
1081  last(set.get_table().end()), search_key(&word),
1082  valid(true)
1083  {
1084  advance();
1085  }
1086  Iter_Prefixes_Of(const Prefix_Multiset&, Key_Type&&) = delete;
1087  Iter_Prefixes_Of(Prefix_Multiset&&, const Key_Type&) = delete;
1088  Iter_Prefixes_Of(Prefix_Multiset&&, Key_Type&&) = delete;
1089 
1090  auto& operator++()
1091  {
1092  ++it;
1093  advance();
1094  return *this;
1095  }
1096  auto& operator++(int)
1097  {
1098  auto old = *this;
1099  ++*this;
1100  return old;
1101  }
1102  auto& operator*() const { return *it; }
1103  auto operator->() const { return &*it; }
1104  auto operator==(const Iter_Prefixes_Of& other) const
1105  {
1106  return valid == other.valid;
1107  }
1108  auto operator!=(const Iter_Prefixes_Of& other) const
1109  {
1110  return valid != other.valid;
1111  }
1112 
1113  operator bool() const { return valid; }
1114 
1115  auto begin() const { return *this; }
1116  auto end() const { return Iter_Prefixes_Of(); }
1117  };
1118 
1119  auto iterate_prefixes_of(const Key_Type& word) const
1120  {
1121  return Iter_Prefixes_Of(*this, word);
1122  }
1123  auto iterate_prefixes_of(Key_Type&& word) const = delete;
1124 };
1125 template <class T, class Key_Extr, class Key_Transform>
1127  -> void
1128 {
1129  auto& extract_key = set->key_extractor();
1130  auto& transform_key = set->key_transformator();
1131  auto& table = set->get_table();
1132 
1133  if (len == 0) {
1134  if (it != last) {
1135  if (transform_key(extract_key(*it)).empty())
1136  return;
1137  if (transform_key(*search_key).empty()) {
1138  valid = false;
1139  return;
1140  }
1141 
1142  auto& first_letter = set->first_letter;
1143  auto& prefix_idx_with_first_letter =
1144  set->prefix_idx_with_first_letter;
1145  auto idx =
1146  first_letter.find(transform_key(*search_key)[0]);
1147  if (idx == first_letter.npos) {
1148  valid = false;
1149  return;
1150  }
1151 
1152  auto first = table.begin();
1153  it = first + prefix_idx_with_first_letter[idx];
1154  last = first + prefix_idx_with_first_letter[idx + 1];
1155  ++len;
1156  }
1157  else {
1158  valid = false;
1159  return;
1160  }
1161  }
1162  for (;; ++len) {
1163  if (it != last) {
1164  if (transform_key(extract_key(*it)).size() == len)
1165  return;
1166  if (len == transform_key(*search_key).size()) {
1167  valid = false;
1168  break;
1169  }
1170  }
1171  else {
1172  valid = false;
1173  break;
1174  }
1175  tie(it, last) =
1176  equal_range(it, last, transform_key(*search_key)[len],
1177  Comparator{len, extract_key, transform_key});
1178  }
1179 }
1180 
1181 template <class T, class Key_Extr, class Key_Transform>
1182 template <class Func>
1183 auto Prefix_Multiset<T, Key_Extr, Key_Transform>::for_each_prefixes_of(
1184  const Key_Type& word, Func func) const
1185 {
1186  auto& extract_key = key_extractor();
1187  auto& transform_key = key_transformator();
1188  auto& table = get_table();
1189 
1190  auto first = begin(table);
1191  auto last = end(table);
1192  auto it = first;
1193  for (; it != last; ++it) {
1194  auto&& k = transform_key(extract_key(*it));
1195  if (k.empty())
1196  func(*it);
1197  else
1198  break;
1199  }
1200 
1201  if (it == last)
1202  return;
1203  if (transform_key(word).empty())
1204  return;
1205 
1206  auto idx = first_letter.find(transform_key(word)[0]);
1207  if (idx == first_letter.npos)
1208  return;
1209  first = begin(table) + prefix_idx_with_first_letter[idx];
1210  last = begin(table) + prefix_idx_with_first_letter[idx + 1];
1211 
1212  for (size_t len = 1;; ++len) {
1213  it = first;
1214  for (; it != last &&
1215  transform_key(extract_key(*it)).size() == len;
1216  ++it) {
1217  func(*it);
1218  }
1219  if (it == last)
1220  break;
1221  if (len == transform_key(word).size())
1222  break;
1223  tie(first, last) =
1224  equal_range(it, last, transform_key(word)[len],
1225  Comparator{len, extract_key, transform_key});
1226  }
1227 }
1228 
1229 template <class CharT>
1231  public:
1232  using Str = std::basic_string<CharT>;
1233  using traits_type = typename Str::traits_type;
1234  using value_type = typename Str::value_type;
1235  using size_type = typename Str::size_type;
1236  using const_iterator = typename Str::const_reverse_iterator;
1237 
1238  private:
1239  const_iterator first = {};
1240  size_type sz = {};
1241 
1242  public:
1243  Reversed_String_View() = default;
1244  explicit Reversed_String_View(const Str& s)
1245  : first(s.rbegin()), sz(s.size())
1246  {
1247  }
1248  Reversed_String_View(Str&& s) = delete;
1249  auto& operator[](size_type i) const { return first[i]; }
1250  auto size() const { return sz; }
1251  auto empty() const { return sz == 0; }
1252  auto begin() const { return first; }
1253  auto end() const { return first + sz; }
1254  auto operator<(Reversed_String_View other) const
1255  {
1256  return lexicographical_compare(begin(), end(), other.begin(),
1257  other.end(), traits_type::lt);
1258  }
1259 };
1260 
1261 template <class CharT>
1263  auto operator()(const std::basic_string<CharT>& x) const
1264  {
1265  return Reversed_String_View<CharT>(x);
1266  }
1267  // auto operator()(T&& x) const = delete;
1268 };
1269 
1270 template <class AffixT>
1272  auto& operator()(const AffixT& a) const { return a.appending; }
1273 };
1274 
1275 template <class T, class Key_Extr = identity>
1277  T, Key_Extr,
1279 
1280 template <class T, class Key_Extr = identity>
1282 };
1283 
1285  using Prefix_Multiset_Type =
1288  using Key_Type = typename Prefix_Multiset_Type::Key_Type;
1289  using Vector_Type = typename Prefix_Multiset_Type::Vector_Type;
1290  Prefix_Multiset_Type table;
1291  Flag_Set all_cont_flags;
1292 
1293  auto populate()
1294  {
1295  for (auto& x : table.data())
1296  all_cont_flags += x.cont_flags;
1297  }
1298 
1299  public:
1300  Prefix_Table() = default;
1301  explicit Prefix_Table(const Vector_Type& t) : table(t) { populate(); }
1302  explicit Prefix_Table(Vector_Type&& t) : table(std::move(t))
1303  {
1304  populate();
1305  }
1306  auto& operator=(const Vector_Type& t)
1307  {
1308  table = t;
1309  populate();
1310  return *this;
1311  }
1312  auto& operator=(Vector_Type&& t)
1313  {
1314  table = std::move(t);
1315  populate();
1316  return *this;
1317  }
1318  auto begin() const { return table.data().begin(); }
1319  auto end() const { return table.data().end(); }
1320 
1321  auto has_continuation_flags() const
1322  {
1323  return all_cont_flags.size() != 0;
1324  }
1325  auto has_continuation_flag(char16_t flag) const
1326  {
1327  return all_cont_flags.contains(flag);
1328  }
1329  auto iterate_prefixes_of(const Key_Type& word) const
1330  {
1331  return table.iterate_prefixes_of(word);
1332  }
1333  auto iterate_prefixes_of(Key_Type&& word) const = delete;
1334 };
1335 
1337  using Suffix_Multiset_Type =
1340  using Key_Type = typename Suffix_Multiset_Type::Key_Type;
1341  using Vector_Type = typename Suffix_Multiset_Type::Vector_Type;
1342  Suffix_Multiset_Type table;
1343  Flag_Set all_cont_flags;
1344 
1345  auto populate()
1346  {
1347  for (auto& x : table.data())
1348  all_cont_flags += x.cont_flags;
1349  }
1350 
1351  public:
1352  Suffix_Table() = default;
1353  explicit Suffix_Table(const Vector_Type& t) : table(t) { populate(); }
1354  explicit Suffix_Table(Vector_Type&& t) : table(std::move(t))
1355  {
1356  populate();
1357  }
1358  auto& operator=(const Vector_Type& t)
1359  {
1360  table = t;
1361  populate();
1362  return *this;
1363  }
1364  auto& operator=(Vector_Type&& t)
1365  {
1366  table = std::move(t);
1367  populate();
1368  return *this;
1369  }
1370  auto begin() const { return table.data().begin(); }
1371  auto end() const { return table.data().end(); }
1372 
1373  auto has_continuation_flags() const
1374  {
1375  return all_cont_flags.size() != 0;
1376  }
1377  auto has_continuation_flag(char16_t flag) const
1378  {
1379  return all_cont_flags.contains(flag);
1380  }
1381  auto iterate_suffixes_of(const Key_Type& word) const
1382  {
1383  return table.iterate_prefixes_of(word);
1384  }
1385  auto iterate_suffixes_of(Key_Type&& word) const = delete;
1386 };
1387 
1392 template <class CharT>
1394  using Str = std::basic_string<CharT>;
1395  using Str_View = std::basic_string_view<CharT>;
1396  size_t i = 0;
1397  Str s;
1398 
1399  public:
1400  String_Pair() = default;
1401  template <class Str1>
1402  explicit String_Pair(Str1&& str, size_t i)
1403  : i(i), s(std::forward<Str1>(str))
1404  {
1405  if (i > s.size())
1406  throw std::out_of_range("word split is too long");
1407  }
1408 
1409  template <class Str1, class Str2,
1410  class = std::enable_if_t<
1411  std::is_same<std::remove_reference_t<Str1>, Str>::value &&
1412  std::is_same<std::remove_reference_t<Str2>, Str>::value>>
1413  String_Pair(Str1&& first, Str2&& second)
1414  : i(first.size()) /* must be init before s, before we move
1415  from first */
1416  ,
1417  s(std::forward<Str1>(first) + std::forward<Str2>(second))
1418 
1419  {
1420  }
1421  auto first() const { return Str_View(s).substr(0, i); }
1422  auto second() const { return Str_View(s).substr(i); }
1423  auto first(Str_View x)
1424  {
1425  s.replace(0, i, x);
1426  i = x.size();
1427  }
1428  auto second(Str_View x) { s.replace(i, s.npos, x); }
1429  auto& str() const { return s; }
1430  auto idx() const { return i; }
1431 };
1432 template <class CharT>
1434  using StrT = std::basic_string<CharT>;
1435 
1436  String_Pair<CharT> begin_end_chars;
1437  StrT replacement;
1438  char16_t first_word_flag = 0;
1439  char16_t second_word_flag = 0;
1440  bool match_first_only_unaffixed_or_zero_affixed = false;
1441 };
1442 
1444  std::vector<std::u16string> rules;
1445  Flag_Set all_flags;
1446 
1447  auto fill_all_flags() -> void;
1448 
1449  public:
1450  Compound_Rule_Table() = default;
1451  explicit Compound_Rule_Table(const std::vector<std::u16string>& tbl)
1452  : rules(tbl)
1453  {
1454  fill_all_flags();
1455  }
1456  explicit Compound_Rule_Table(std::vector<std::u16string>&& tbl)
1457  : rules(move(tbl))
1458  {
1459  fill_all_flags();
1460  }
1461  auto& operator=(const std::vector<std::u16string>& tbl)
1462  {
1463  rules = tbl;
1464  fill_all_flags();
1465  return *this;
1466  }
1467  auto& operator=(std::vector<std::u16string>&& tbl)
1468  {
1469  rules = move(tbl);
1470  fill_all_flags();
1471  return *this;
1472  }
1473  auto empty() const { return rules.empty(); }
1474  auto has_any_of_flags(const Flag_Set& f) const -> bool;
1475  auto match_any_rule(const std::vector<const Flag_Set*>& data) const
1476  -> bool;
1477 };
1478 auto inline Compound_Rule_Table::fill_all_flags() -> void
1479 {
1480  for (auto& f : rules) {
1481  all_flags += f;
1482  }
1483  all_flags.erase(u'?');
1484  all_flags.erase(u'*');
1485 }
1486 
1487 auto inline Compound_Rule_Table::has_any_of_flags(const Flag_Set& f) const
1488  -> bool
1489 {
1490  using std::begin;
1491  using std::end;
1492  struct Out_Iter_One_Bool {
1493  bool* value = nullptr;
1494  auto& operator++() { return *this; }
1495  auto& operator++(int) { return *this; }
1496  auto& operator*() { return *this; }
1497  auto& operator=(char16_t)
1498  {
1499  *value = true;
1500  return *this;
1501  }
1502  };
1503  auto has_intersection = false;
1504  auto out_it = Out_Iter_One_Bool{&has_intersection};
1505  std::set_intersection(begin(all_flags), end(all_flags), begin(f),
1506  end(f), out_it);
1507  return has_intersection;
1508 }
1509 
1510 template <class DataIter, class PatternIter, class FuncEq = std::equal_to<>>
1511 auto match_simple_regex(DataIter data_first, DataIter data_last,
1512  PatternIter pat_first, PatternIter pat_last,
1513  FuncEq eq = FuncEq())
1514 {
1515  auto s = std::stack<std::pair<DataIter, PatternIter>>();
1516  s.emplace(data_first, pat_first);
1517  auto data_it = DataIter();
1518  auto pat_it = PatternIter();
1519  while (!s.empty()) {
1520  std::tie(data_it, pat_it) = s.top();
1521  s.pop();
1522  if (pat_it == pat_last) {
1523  if (data_it == data_last)
1524  return true;
1525  else
1526  return false;
1527  }
1528  auto node_type = *pat_it;
1529  if (pat_it + 1 == pat_last)
1530  node_type = 0;
1531  else
1532  node_type = *(pat_it + 1);
1533  switch (node_type) {
1534  case '?':
1535  s.emplace(data_it, pat_it + 2);
1536  if (data_it != data_last && eq(*data_it, *pat_it))
1537  s.emplace(data_it + 1, pat_it + 2);
1538  break;
1539  case '*':
1540  s.emplace(data_it, pat_it + 2);
1541  if (data_it != data_last && eq(*data_it, *pat_it))
1542  s.emplace(data_it + 1, pat_it);
1543 
1544  break;
1545  default:
1546  if (data_it != data_last && eq(*data_it, *pat_it))
1547  s.emplace(data_it + 1, pat_it + 1);
1548  break;
1549  }
1550  }
1551  return false;
1552 }
1553 
1554 template <class DataRange, class PatternRange, class FuncEq = std::equal_to<>>
1555 auto match_simple_regex(const DataRange& data, const PatternRange& pattern,
1556  FuncEq eq = FuncEq())
1557 {
1558  using namespace std;
1559  return match_simple_regex(begin(data), end(data), begin(pattern),
1560  end(pattern), eq);
1561 }
1562 
1563 auto inline match_compund_rule(const std::vector<const Flag_Set*>& words_data,
1564  const std::u16string& pattern)
1565 {
1566  return match_simple_regex(
1567  words_data, pattern,
1568  [](const Flag_Set* d, char16_t p) { return d->contains(p); });
1569 }
1570 
1571 auto inline Compound_Rule_Table::match_any_rule(
1572  const std::vector<const Flag_Set*>& data) const -> bool
1573 {
1574  return any_of(begin(rules), end(rules), [&](const std::u16string& p) {
1575  return match_compund_rule(data, p);
1576  });
1577 }
1578 
1579 template <class CharT>
1581  using value_type = CharT;
1582  using traits_type = std::char_traits<value_type>;
1583  using size_type = size_t;
1584  using Str_View = std::basic_string_view<value_type>;
1585 
1586  static constexpr size_type short_capacity = 22;
1587  size_type sz = {};
1588  bool is_long = {};
1589  union {
1590  value_type short_data[short_capacity + 1];
1591  value_type* long_data;
1592  };
1593 
1594  auto is_short() const -> bool { return !is_long; }
1595 
1596  public:
1597  Simple_Short_String() { short_data[0] = {}; }
1598  Simple_Short_String(Str_View other)
1599  : sz(other.size()), is_long(sz > short_capacity)
1600  {
1601  value_type* ptr;
1602  if (is_short()) {
1603  ptr = short_data;
1604  }
1605  else {
1606  long_data = new value_type[sz + 1];
1607  ptr = long_data;
1608  }
1609  traits_type::copy(ptr, other.data(), sz);
1610  ptr[sz] = {};
1611  }
1613  {
1614  if (is_long)
1615  delete[] long_data;
1616  }
1617 
1618  operator Str_View() const noexcept
1619  {
1620  if (is_short())
1621  return {short_data, sz};
1622  return {long_data, sz};
1623  }
1624 
1626  : Simple_Short_String(Str_View(other))
1627  {
1628  }
1629  Simple_Short_String(Simple_Short_String&& other) noexcept
1630  : sz(other.sz), is_long(other.is_long)
1631  {
1632  if (is_short()) {
1633  traits_type::copy(short_data, other.short_data, sz);
1634  short_data[sz] = {};
1635  return;
1636  }
1637  long_data = other.long_data;
1638  other.sz = 0;
1639  other.is_long = false;
1640  other.short_data[0] = {};
1641  }
1642  auto operator=(const Simple_Short_String&)
1643  -> Simple_Short_String& = delete;
1644  auto operator=(Simple_Short_String &&) -> Simple_Short_String& = delete;
1645 
1646  auto size() const noexcept { return sz; }
1647 };
1649 
1654 template <class CharT>
1656  using Vec_Str = std::vector<std::basic_string<CharT>>;
1657  Vec_Str d;
1658  size_t sz = 0;
1659 
1660  public:
1661  using value_type = typename Vec_Str::value_type;
1662  using allocator_type = typename Vec_Str::allocator_type;
1663  using size_type = typename Vec_Str::size_type;
1664  using difference_type = typename Vec_Str::difference_type;
1665  using reference = typename Vec_Str::reference;
1666  using const_reference = typename Vec_Str::const_reference;
1667  using pointer = typename Vec_Str::pointer;
1668  using const_pointer = typename Vec_Str::const_pointer;
1669  using iterator = typename Vec_Str::iterator;
1670  using const_iterator = typename Vec_Str::const_iterator;
1671  using reverse_iterator = typename Vec_Str::reverse_iterator;
1672  using const_reverse_iterator = typename Vec_Str::const_reverse_iterator;
1673 
1674  List_Basic_Strings() = default;
1675  explicit List_Basic_Strings(size_type n) : d(n), sz(n) {}
1676  List_Basic_Strings(size_type n, const_reference value)
1677  : d(n, value), sz(n)
1678  {
1679  }
1680  template <class InputIterator>
1681  List_Basic_Strings(InputIterator first, InputIterator last)
1682  : d(first, last), sz(d.size())
1683  {
1684  }
1685  List_Basic_Strings(std::initializer_list<value_type> il)
1686  : d(il), sz(d.size())
1687  {
1688  }
1689 
1690  List_Basic_Strings(const List_Basic_Strings& other) = default;
1691  List_Basic_Strings(List_Basic_Strings&& other) noexcept
1692  : d(move(other.d)), sz(other.sz)
1693  {
1694  other.sz = 0;
1695  }
1696 
1697  List_Basic_Strings(Vec_Str&& other) : d(other), sz(other.size()) {}
1698 
1699  auto& operator=(const List_Basic_Strings& other)
1700  {
1701  clear();
1702  insert(begin(), other.begin(), other.end());
1703  return *this;
1704  }
1705  auto& operator=(List_Basic_Strings&& other) noexcept
1706  {
1707  d = move(other.d);
1708  sz = other.sz;
1709  other.sz = 0;
1710  return *this;
1711  }
1712  auto& operator=(std::initializer_list<value_type> il)
1713  {
1714  clear();
1715  insert(begin(), il);
1716  return *this;
1717  }
1718  template <class InputIterator>
1719  auto assign(InputIterator first, InputIterator last)
1720  {
1721  clear();
1722  insert(begin(), first, last);
1723  }
1724  void assign(size_type n, const_reference value)
1725  {
1726  clear();
1727  insert(begin(), n, value);
1728  }
1729  void assign(std::initializer_list<value_type> il) { *this = il; }
1730  auto get_allocator() const noexcept { return d.get_allocator(); }
1731 
1732  // iterators
1733  auto begin() noexcept -> iterator { return d.begin(); }
1734  auto begin() const noexcept -> const_iterator { return d.begin(); }
1735  auto end() noexcept -> iterator { return begin() + sz; }
1736  auto end() const noexcept -> const_iterator { return begin() + sz; }
1737 
1738  auto rbegin() noexcept { return d.rend() - sz; }
1739  auto rbegin() const noexcept { return d.rend() - sz; }
1740  auto rend() noexcept { return d.rend(); }
1741  auto rend() const noexcept { return d.rend(); }
1742 
1743  auto cbegin() const noexcept { return d.cbegin(); }
1744  auto cend() const noexcept { return cbegin() + sz; }
1745 
1746  auto crbegin() const noexcept { return d.crend() - sz; }
1747  auto crend() const noexcept { return d.crend(); }
1748 
1749  // [vector.capacity], capacity
1750  auto empty() const noexcept { return sz == 0; }
1751  auto size() const noexcept { return sz; }
1752  auto max_size() const noexcept { return d.max_size(); }
1753  auto capacity() const noexcept { return d.size(); }
1754  auto resize(size_type new_sz)
1755  {
1756  if (new_sz <= sz) {
1757  }
1758  else if (new_sz <= d.size()) {
1759  std::for_each(begin() + sz, begin() + new_sz,
1760  [](auto& s) { s.clear(); });
1761  }
1762  else {
1763  std::for_each(d.begin() + sz, d.end(),
1764  [](auto& s) { s.clear(); });
1765  d.resize(new_sz);
1766  }
1767  sz = new_sz;
1768  }
1769  auto resize(size_type new_sz, const_reference c)
1770  {
1771  if (new_sz <= sz) {
1772  }
1773  else if (new_sz <= d.size()) {
1774  std::for_each(begin() + sz, begin() + new_sz,
1775  [&](auto& s) { s = c; });
1776  }
1777  else {
1778  std::for_each(d.begin() + sz, d.end(),
1779  [&](auto& s) { s = c; });
1780  d.resize(new_sz, c);
1781  }
1782  sz = new_sz;
1783  }
1784  void reserve(size_type n)
1785  {
1786  if (n > d.size())
1787  d.resize(n);
1788  }
1789  void shrink_to_fit()
1790  {
1791  d.resize(sz);
1792  d.shrink_to_fit();
1793  for (auto& s : d) {
1794  s.shrink_to_fit();
1795  }
1796  }
1797 
1798  // element access
1799  auto& operator[](size_type n) { return d[n]; }
1800  auto& operator[](size_type n) const { return d[n]; }
1801  auto& at(size_type n)
1802  {
1803  if (n < sz)
1804  return d[n];
1805  else
1806  throw std::out_of_range("index is out of range");
1807  }
1808  auto& at(size_type n) const
1809  {
1810  if (n < sz)
1811  return d[n];
1812  else
1813  throw std::out_of_range("index is out of range");
1814  }
1815  auto& front() { return d.front(); }
1816  auto& front() const { return d.front(); }
1817  auto& back() { return d[sz - 1]; }
1818  auto& back() const { return d[sz - 1]; }
1819 
1820  // [vector.data], data access
1821  auto data() noexcept { return d.data(); }
1822  auto data() const noexcept { return d.data(); }
1823 
1824  // [vector.modifiers], modifiers
1825  template <class... Args>
1826  auto& emplace_back(Args&&... args)
1827  {
1828  if (sz != d.size())
1829  d[sz] = value_type(std::forward<Args>(args)...);
1830  else
1831  d.emplace_back(std::forward<Args>(args)...);
1832  return d[sz++];
1833  }
1834  auto& emplace_back()
1835  {
1836  if (sz != d.size())
1837  d[sz].clear();
1838  else
1839  d.emplace_back();
1840  return d[sz++];
1841  }
1842  auto push_back(const_reference x)
1843  {
1844  if (sz != d.size())
1845  d[sz] = x;
1846  else
1847  d.push_back(x);
1848  ++sz;
1849  }
1850  auto push_back(value_type&& x)
1851  {
1852  if (sz != d.size())
1853  d[sz] = std::move(x);
1854  else
1855  d.push_back(std::move(x));
1856  ++sz;
1857  }
1858  auto pop_back() { --sz; }
1859 
1860  private:
1861  template <class U>
1862  auto insert_priv(const_iterator pos, U&& val)
1863  {
1864  if (sz != d.size()) {
1865  d[sz] = std::forward<U>(val);
1866  }
1867  else {
1868  auto pos_idx = pos - cbegin();
1869  d.push_back(std::forward<U>(val));
1870  pos = cbegin() + pos_idx;
1871  }
1872  auto p = begin() + (pos - cbegin());
1873  std::rotate(p, begin() + sz, begin() + sz + 1);
1874  ++sz;
1875  return p;
1876  }
1877 
1878  public:
1879  template <class... Args>
1880  auto emplace(const_iterator pos, Args&&... args)
1881  {
1882  if (sz != d.size()) {
1883  d[sz] = value_type(std::forward<Args>(args)...);
1884  }
1885  else {
1886  auto pos_idx = pos - cbegin();
1887  d.emplace(std::forward<Args>(args)...);
1888  pos = cbegin() + pos_idx;
1889  }
1890  auto p = begin() + (pos - cbegin());
1891  std::rotate(p, begin() + sz, begin() + sz + 1);
1892  ++sz;
1893  return p;
1894  }
1895  auto insert(const_iterator pos, const_reference x)
1896  {
1897  return insert_priv(pos, x);
1898  }
1899  auto insert(const_iterator pos, value_type&& x)
1900  {
1901  return insert_priv(pos, std::move(x));
1902  }
1903  auto insert(const_iterator pos, size_type n, const_reference x)
1904  -> iterator
1905  {
1906  auto i = sz;
1907  while (n != 0 && i != d.size()) {
1908  d[i] = x;
1909  --n;
1910  ++i;
1911  }
1912  if (n != 0) {
1913  auto pos_idx = pos - cbegin();
1914  d.insert(d.end(), n, x);
1915  pos = cbegin() + pos_idx;
1916  i = d.size();
1917  }
1918  auto p = begin() + (pos - cbegin());
1919  std::rotate(p, begin() + sz, begin() + i);
1920  sz = i;
1921  return p;
1922  }
1923 
1924  template <class InputIterator>
1925  auto insert(const_iterator pos, InputIterator first, InputIterator last)
1926  -> iterator
1927  {
1928  auto i = sz;
1929  while (first != last && i != d.size()) {
1930  d[i] = *first;
1931  ++first;
1932  ++i;
1933  }
1934  if (first != last) {
1935  auto pos_idx = pos - cbegin();
1936  d.insert(d.end(), first, last);
1937  pos = cbegin() + pos_idx;
1938  i = d.size();
1939  }
1940  auto p = begin() + (pos - cbegin());
1941  std::rotate(p, begin() + sz, begin() + i);
1942  sz = i;
1943  return p;
1944  }
1945  auto insert(const_iterator pos, std::initializer_list<value_type> il)
1946  -> iterator
1947  {
1948  return insert(pos, il.begin(), il.end());
1949  }
1950 
1951  auto erase(const_iterator position)
1952  {
1953  auto i0 = begin();
1954  auto i1 = i0 + (position - cbegin());
1955  auto i2 = i1 + 1;
1956  auto i3 = end();
1957  std::rotate(i1, i2, i3);
1958  --sz;
1959  return i1;
1960  }
1961  auto erase(const_iterator first, const_iterator last)
1962  {
1963  auto i0 = begin();
1964  auto i1 = i0 + (first - cbegin());
1965  auto i2 = i0 + (last - cbegin());
1966  auto i3 = end();
1967  std::rotate(i1, i2, i3);
1968  sz -= last - first;
1969  return i1;
1970  }
1971  auto swap(List_Basic_Strings& other)
1972  {
1973  d.swap(other.d);
1974  std::swap(sz, other.sz);
1975  }
1976  auto clear() noexcept -> void { sz = 0; }
1977 
1978  auto operator==(const List_Basic_Strings& other) const
1979  {
1980  return std::equal(begin(), end(), other.begin(), other.end());
1981  }
1982  auto operator!=(const List_Basic_Strings& other) const
1983  {
1984  return !(*this == other);
1985  }
1986  auto operator<(const List_Basic_Strings& other) const
1987  {
1988  return std::lexicographical_compare(begin(), end(),
1989  other.begin(), other.end());
1990  }
1991  auto operator>=(const List_Basic_Strings& other) const
1992  {
1993  return !(*this < other);
1994  }
1995  auto operator>(const List_Basic_Strings& other) const
1996  {
1997  return std::lexicographical_compare(other.begin(), other.end(),
1998  begin(), end());
1999  }
2000  auto operator<=(const List_Basic_Strings& other) const
2001  {
2002  return !(*this > other);
2003  }
2004 
2005  auto extract_sequence() -> Vec_Str
2006  {
2007  d.resize(sz);
2008  sz = 0;
2009  return std::move(d);
2010  }
2011 };
2012 
2013 template <class CharT>
2015 {
2016  a.swap(b);
2017 }
2018 
2021 
2022 template <class CharT>
2024  public:
2025  using Str = std::basic_string<CharT>;
2026  using Table_Str = std::vector<std::pair<Str, Str>>;
2027  using iterator = typename Table_Str::iterator;
2028  using const_iterator = typename Table_Str::const_iterator;
2029 
2030  private:
2031  Table_Str table;
2032  size_t whole_word_reps_last_idx = 0;
2033  size_t start_word_reps_last_idx = 0;
2034  size_t end_word_reps_last_idx = 0;
2035 
2036  auto order_entries() -> void;
2037 
2038  public:
2039  Replacement_Table() = default;
2040  explicit Replacement_Table(const Table_Str& v) : table(v)
2041  {
2042  order_entries();
2043  }
2044  explicit Replacement_Table(Table_Str&& v) : table(move(v))
2045  {
2046  order_entries();
2047  }
2048 
2049  auto& operator=(const Table_Str& v)
2050  {
2051  table = v;
2052  order_entries();
2053  return *this;
2054  }
2055 
2056  auto& operator=(Table_Str&& v)
2057  {
2058  table = move(v);
2059  order_entries();
2060  return *this;
2061  }
2062 
2063  auto whole_word_replacements() const -> Subrange<const_iterator>
2064  {
2065  return {begin(table), begin(table) + whole_word_reps_last_idx};
2066  }
2067  auto start_word_replacements() const -> Subrange<const_iterator>
2068  {
2069  return {begin(table) + whole_word_reps_last_idx,
2070  begin(table) + start_word_reps_last_idx};
2071  }
2072  auto end_word_replacements() const -> Subrange<const_iterator>
2073  {
2074  return {begin(table) + start_word_reps_last_idx,
2075  begin(table) + end_word_reps_last_idx};
2076  }
2077  auto any_place_replacements() const -> Subrange<const_iterator>
2078  {
2079  return {begin(table) + end_word_reps_last_idx, end(table)};
2080  }
2081 };
2082 template <class CharT>
2084 {
2085  auto it = remove_if(begin(table), end(table), [](auto& p) {
2086  auto& s = p.first;
2087  return s.empty() ||
2088  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
2089  });
2090  table.erase(it, end(table));
2091 
2092  auto is_start_word_pat = [=](auto& x) { return x.first[0] == '^'; };
2093  auto is_end_word_pat = [=](auto& x) { return x.first.back() == '$'; };
2094 
2095  auto start_word_reps_last =
2096  partition(begin(table), end(table), is_start_word_pat);
2097  start_word_reps_last_idx = start_word_reps_last - begin(table);
2098  for_each(begin(table), start_word_reps_last,
2099  [](auto& e) { e.first.erase(0, 1); });
2100 
2101  auto whole_word_reps_last =
2102  partition(begin(table), start_word_reps_last, is_end_word_pat);
2103  whole_word_reps_last_idx = whole_word_reps_last - begin(table);
2104  for_each(begin(table), whole_word_reps_last,
2105  [](auto& e) { e.first.pop_back(); });
2106 
2107  auto end_word_reps_last =
2108  partition(start_word_reps_last, end(table), is_end_word_pat);
2109  end_word_reps_last_idx = end_word_reps_last - begin(table);
2110  for_each(start_word_reps_last, end_word_reps_last,
2111  [](auto& e) { e.first.pop_back(); });
2112 }
2113 
2114 template <class CharT>
2116  using Str = std::basic_string<CharT>;
2117  Str chars;
2118  std::vector<Str> strings;
2119 
2120  auto parse(const Str& s) -> void;
2121  Similarity_Group() = default;
2122  explicit Similarity_Group(const Str& s) { parse(s); }
2123  auto& operator=(const Str& s)
2124  {
2125  parse(s);
2126  return *this;
2127  }
2128 };
2129 template <class CharT>
2130 auto Similarity_Group<CharT>::parse(const Str& s) -> void
2131 {
2132  auto i = size_t(0);
2133  for (;;) {
2134  auto j = s.find('(', i);
2135  chars.append(s, i, j - i);
2136  if (j == s.npos)
2137  break;
2138  i = j + 1;
2139  j = s.find(')', i);
2140  if (j == s.npos)
2141  break;
2142  auto len = j - i;
2143  if (len == 1)
2144  chars += s[i];
2145  else if (len > 1)
2146  strings.push_back(s.substr(i, len));
2147  i = j + 1;
2148  }
2149 }
2150 
2151 template <class CharT>
2153  using Str = std::basic_string<CharT>;
2154  using Pair_Str = std::pair<Str, Str>;
2155 
2156  struct Phonet_Match_Result {
2157  size_t count_matched = 0;
2158  size_t go_back_before_replace = 0;
2159  size_t priority = 5;
2160  bool go_back_after_replace = false;
2161  bool treat_next_as_begin = false;
2162  operator bool() { return count_matched; }
2163  };
2164 
2165  std::vector<std::pair<Str, Str>> table;
2166  auto order() -> void;
2167  auto static match(const Str& data, size_t i, const Str& pattern,
2168  bool at_begin) -> Phonet_Match_Result;
2169 
2170  public:
2171  Phonetic_Table() = default;
2172  explicit Phonetic_Table(const std::vector<Pair_Str>& v) : table(v)
2173  {
2174  order();
2175  }
2176  explicit Phonetic_Table(std::vector<Pair_Str>&& v) : table(move(v))
2177  {
2178  order();
2179  }
2180  auto& operator=(const std::vector<Pair_Str>& v)
2181  {
2182  table = v;
2183  order();
2184  return *this;
2185  }
2186  auto& operator=(std::vector<Pair_Str>&& v)
2187  {
2188  table = move(v);
2189  order();
2190  return *this;
2191  }
2192  auto replace(Str& word) const -> bool;
2193 };
2194 
2195 template <class CharT>
2196 auto Phonetic_Table<CharT>::order() -> void
2197 {
2198  stable_sort(begin(table), end(table), [](auto& pair1, auto& pair2) {
2199  if (pair2.first.empty())
2200  return false;
2201  if (pair1.first.empty())
2202  return true;
2203  return pair1.first[0] < pair2.first[0];
2204  });
2205  auto it = find_if_not(begin(table), end(table),
2206  [](auto& p) { return p.first.empty(); });
2207  table.erase(begin(table), it);
2208  for (auto& r : table) {
2209  if (r.second == NUSPELL_LITERAL(CharT, "_"))
2210  r.second.clear();
2211  }
2212 }
2213 
2214 template <class CharT>
2215 auto Phonetic_Table<CharT>::match(const Str& data, size_t i, const Str& pattern,
2216  bool at_begin) -> Phonet_Match_Result
2217 {
2218  auto ret = Phonet_Match_Result();
2219  auto j =
2220  pattern.find_first_of(NUSPELL_LITERAL(CharT, "(<-0123456789^$"));
2221  if (j == pattern.npos)
2222  j = pattern.size();
2223  if (data.compare(i, j, pattern, 0, j) == 0)
2224  ret.count_matched = j;
2225  else
2226  return {};
2227  if (j == pattern.size())
2228  return ret;
2229  if (pattern[j] == '(') {
2230  auto k = pattern.find(')', j);
2231  if (k == pattern.npos)
2232  return {}; // bad rule
2233  auto x = std::char_traits<CharT>::find(
2234  &pattern[j + 1], k - (j + 1), data[i + j]);
2235  if (!x)
2236  return {};
2237  j = k + 1;
2238  ret.count_matched += 1;
2239  }
2240  if (j == pattern.size())
2241  return ret;
2242  if (pattern[j] == '<') {
2243  ret.go_back_after_replace = true;
2244  ++j;
2245  }
2246  auto k = pattern.find_first_not_of('-', j);
2247  if (k == pattern.npos) {
2248  k = pattern.size();
2249  ret.go_back_before_replace = k - j;
2250  if (ret.go_back_before_replace >= ret.count_matched)
2251  return {}; // bad rule
2252  return ret;
2253  }
2254  else {
2255  ret.go_back_before_replace = k - j;
2256  if (ret.go_back_before_replace >= ret.count_matched)
2257  return {}; // bad rule
2258  }
2259  j = k;
2260  if (pattern[j] >= '0' && pattern[j] <= '9') {
2261  ret.priority = pattern[j] - '0';
2262  ++j;
2263  }
2264  if (j == pattern.size())
2265  return ret;
2266  if (pattern[j] == '^') {
2267  if (!at_begin)
2268  return {};
2269  ++j;
2270  }
2271  if (j == pattern.size())
2272  return ret;
2273  if (pattern[j] == '^') {
2274  ret.treat_next_as_begin = true;
2275  ++j;
2276  }
2277  if (j == pattern.size())
2278  return ret;
2279  if (pattern[j] != '$')
2280  return {}; // bad rule, no other char is allowed at this point
2281  if (i + ret.count_matched == data.size())
2282  return ret;
2283  return {};
2284 }
2285 
2286 template <class CharT>
2287 auto Phonetic_Table<CharT>::replace(Str& word) const -> bool
2288 {
2289  struct Cmp {
2290  auto operator()(CharT c, const Pair_Str& s)
2291  {
2292  return c < s.first[0];
2293  }
2294  auto operator()(const Pair_Str& s, CharT c)
2295  {
2296  return s.first[0] < c;
2297  }
2298  };
2299  if (table.empty())
2300  return false;
2301  auto ret = false;
2302  auto treat_next_as_begin = true;
2303  size_t count_go_backs_after_replace = 0; // avoid infinite loop
2304  for (size_t i = 0; i != word.size(); ++i) {
2305  auto rules =
2306  equal_range(begin(table), end(table), word[i], Cmp());
2307  for (auto& r : Subrange(rules)) {
2308  auto rule = &r;
2309  auto m1 = match(word, i, r.first, treat_next_as_begin);
2310  if (!m1)
2311  continue;
2312  if (!m1.go_back_before_replace) {
2313  auto j = i + m1.count_matched - 1;
2314  auto rules2 = equal_range(
2315  begin(table), end(table), word[j], Cmp());
2316  for (auto& r2 : Subrange(rules2)) {
2317  auto m2 =
2318  match(word, j, r2.first, false);
2319  if (m2 && m2.priority >= m1.priority) {
2320  i = j;
2321  rule = &r2;
2322  m1 = m2;
2323  break;
2324  }
2325  }
2326  }
2327  word.replace(
2328  i, m1.count_matched - m1.go_back_before_replace,
2329  rule->second);
2330  treat_next_as_begin = m1.treat_next_as_begin;
2331  if (m1.go_back_after_replace &&
2332  count_go_backs_after_replace < 100) {
2333  count_go_backs_after_replace++;
2334  }
2335  else {
2336  i += rule->second.size();
2337  }
2338  --i;
2339  ret = true;
2340  break;
2341  }
2342  }
2343  return ret;
2344 }
2345 } // namespace v4
2346 } // namespace nuspell
2347 #endif // NUSPELL_STRUCTURES_HXX
nuspell::v4::Prefix_Table
Definition: structures.hxx:1284
nuspell::v4::Replacement_Table
Definition: structures.hxx:2023
nuspell::v4::String_Set
Definition: structures.hxx:82
nuspell::v4::String_Pair
Definition: structures.hxx:1393
nuspell::v4::Substr_Replacer
Definition: structures.hxx:326
nuspell::v4::Extractor_Of_Appending_From_Affix
Definition: structures.hxx:1271
nuspell::v4::Condition_Exception
Definition: structures.hxx:674
nuspell::v4::Condition
Definition: structures.hxx:683
nuspell
Library main namespace.
Definition: aff_data.cxx:31
nuspell::v4::Similarity_Group
Definition: structures.hxx:2115
nuspell::v4::identity
Definition: structures.hxx:531
nuspell::v4::Break_Table
Definition: structures.hxx:454
nuspell::v4::Simple_Short_String
Definition: structures.hxx:1580
nuspell::v4::Prefix
Definition: structures.hxx:850
nuspell::v4::Affix_Table
Definition: structures.hxx:1281
nuspell::v4::Suffix
Definition: structures.hxx:889
nuspell::v4::Phonetic_Table
Definition: structures.hxx:2152
nuspell::v4::Hash_Multiset
Definition: structures.hxx:540
nuspell::v4::Prefix_Multiset
Definition: structures.hxx:930
nuspell::v4::Compound_Pattern
Definition: structures.hxx:1433
nuspell::v4::Prefix_Multiset::Iter_Prefixes_Of
Definition: structures.hxx:1059
nuspell::v4::Reversed_String_View
Definition: structures.hxx:1230
nuspell::v4::Compound_Rule_Table
Definition: structures.hxx:1443
nuspell::v4::Subrange
Definition: structures.hxx:51
nuspell::v4::String_Reverser
Definition: structures.hxx:1262
nuspell::v4::Suffix_Table
Definition: structures.hxx:1336
nuspell::v4::List_Basic_Strings
Definition: structures.hxx:1655