Nuspell
spellchecker
structures.hxx
Go to the documentation of this file.
1 /* Copyright 2018 Dimitrij Mijoski, Sander van Geloven
2  *
3  * This file is part of Nuspell.
4  *
5  * Nuspell is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Nuspell is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
24 #ifndef NUSPELL_STRUCTURES_HXX
25 #define NUSPELL_STRUCTURES_HXX
26 
27 #include "string_utils.hxx"
28 
29 #include <algorithm>
30 #include <cmath>
31 #include <iterator>
32 #include <stdexcept>
33 #include <string>
34 #include <tuple>
35 #include <utility>
36 #include <vector>
37 
38 #include <boost/container/small_vector.hpp>
39 #include <boost/multi_index/member.hpp>
40 #include <boost/range/iterator_range_core.hpp>
41 
42 namespace nuspell {
43 
49 template <class CharT>
50 class String_Set {
51  private:
52  std::basic_string<CharT> d;
53  auto sort_uniq() -> void
54  {
55  auto first = begin();
56  auto last = end();
57  using t = traits_type;
58  sort(first, last, t::lt);
59  d.erase(unique(first, last, t::eq), last);
60  }
61  struct Char_Traits_Less_Than {
62  auto operator()(CharT a, CharT b)
63  {
64  return traits_type::lt(a, b);
65  }
66  };
67 
68  public:
69  using StrT = std::basic_string<CharT>;
70  using traits_type = typename StrT::traits_type;
71 
72  using key_type = typename StrT::value_type;
73  using key_compare = Char_Traits_Less_Than;
74  using value_type = typename StrT::value_type;
75  using value_compare = key_compare;
76  using allocator_type = typename StrT::allocator_type;
77  using pointer = typename StrT::pointer;
78  using const_pointer = typename StrT::const_pointer;
79  using reference = typename StrT::reference;
80  using const_reference = typename StrT::const_reference;
81  using size_type = typename StrT::size_type;
82  using difference_type = typename StrT::difference_type;
83  using iterator = typename StrT::iterator;
84  using const_iterator = typename StrT::const_iterator;
85  using reverse_iterator = typename StrT::reverse_iterator;
86  using const_reverse_iterator = typename StrT::const_reverse_iterator;
87 
88  String_Set() = default;
89  String_Set(const StrT& s) : d(s) { sort_uniq(); }
90  String_Set(StrT&& s) : d(move(s)) { sort_uniq(); }
91  String_Set(const CharT* s) : d(s) { sort_uniq(); }
92  template <class InputIterator>
93  String_Set(InputIterator first, InputIterator last) : d(first, last)
94  {
95  sort_uniq();
96  }
97  String_Set(std::initializer_list<value_type> il) : d(il)
98  {
99  sort_uniq();
100  }
101 
102  auto& operator=(const StrT& s)
103  {
104  d = s;
105  sort_uniq();
106  return *this;
107  }
108  auto& operator=(StrT&& s)
109  {
110  d = move(s);
111  sort_uniq();
112  return *this;
113  }
114  auto& operator=(std::initializer_list<value_type> il)
115  {
116  d = il;
117  sort_uniq();
118  return *this;
119  }
120  auto& operator=(const CharT* s)
121  {
122  d = s;
123  sort_uniq();
124  return *this;
125  }
126 
127  // non standard underlying storage access:
128  auto& data() const { return d; }
129  operator const StrT&() const noexcept { return d; }
130 
131  // iterators:
132  iterator begin() noexcept { return d.begin(); }
133  const_iterator begin() const noexcept { return d.begin(); }
134  iterator end() noexcept { return d.end(); }
135  const_iterator end() const noexcept { return d.end(); }
136 
137  reverse_iterator rbegin() noexcept { return d.rbegin(); }
138  const_reverse_iterator rbegin() const noexcept { return d.rbegin(); }
139  reverse_iterator rend() noexcept { return d.rend(); }
140  const_reverse_iterator rend() const noexcept { return d.rend(); }
141 
142  const_iterator cbegin() const noexcept { return d.cbegin(); }
143  const_iterator cend() const noexcept { return d.cend(); }
144  const_reverse_iterator crbegin() const noexcept { return d.crbegin(); }
145  const_reverse_iterator crend() const noexcept { return d.crend(); }
146 
147  // capacity:
148  bool empty() const noexcept { return d.empty(); }
149  size_type size() const noexcept { return d.size(); }
150  size_type max_size() const noexcept { return d.max_size(); }
151 
152  // modifiers:
153  std::pair<iterator, bool> insert(const value_type& x)
154  {
155  auto it = lower_bound(x);
156  if (it != end() && *it == x) {
157  return {it, false};
158  }
159  auto ret = d.insert(it, x);
160  return {ret, true};
161  }
162  // std::pair<iterator, bool> insert(value_type&& x);
163  iterator insert(iterator hint, const value_type& x)
164  {
165  if (hint == end() || traits_type::lt(x, *hint)) {
166  if (hint == begin() ||
167  traits_type::lt(*(hint - 1), x)) {
168  return d.insert(hint, x);
169  }
170  }
171  return insert(x).first;
172  }
173 
174  // iterator insert(const_iterator hint, value_type&& x);
175  template <class InputIterator>
176  void insert(InputIterator first, InputIterator last)
177  {
178  d.insert(end(), first, last);
179  sort_uniq();
180  }
181  void insert(std::initializer_list<value_type> il)
182  {
183  d.insert(end(), il);
184  sort_uniq();
185  }
186  template <class... Args>
187  std::pair<iterator, bool> emplace(Args&&... args)
188  {
189  return insert(CharT(args...));
190  }
191 
192  template <class... Args>
193  iterator emplace_hint(iterator hint, Args&&... args)
194  {
195  return insert(hint, CharT(args...));
196  }
197 
198  iterator erase(iterator position) { return d.erase(position); }
199  size_type erase(const key_type& x)
200  {
201  auto i = d.find(x);
202  if (i != d.npos) {
203  d.erase(i, 1);
204  return true;
205  }
206  return false;
207  }
208  iterator erase(iterator first, iterator last)
209  {
210  return d.erase(first, last);
211  }
212  void swap(String_Set& s) { d.swap(s.d); }
213  void clear() noexcept { d.clear(); }
214 
215  // non standrd modifiers:
216  auto insert(const StrT& s) -> void
217  {
218  d += s;
219  sort_uniq();
220  }
221  auto operator+=(const StrT& s) -> String_Set
222  {
223  insert(s);
224  return *this;
225  }
226 
227  // observers:
228  key_compare key_comp() const { return Char_Traits_Less_Than(); }
229  value_compare value_comp() const { return key_comp(); }
230 
231  // set operations:
232  private:
233  auto lookup(const key_type& x) const
234  {
235  auto i = d.find(x);
236  if (i != d.npos)
237  i = d.size();
238  return i;
239  }
240 
241  public:
242  iterator find(const key_type& x) { return begin() + lookup(x); }
243  const_iterator find(const key_type& x) const
244  {
245  return begin() + lookup(x);
246  }
247  size_type count(const key_type& x) const { return d.find(x) != d.npos; }
248 
249  iterator lower_bound(const key_type& x)
250  {
251  return std::lower_bound(begin(), end(), x, traits_type::lt);
252  }
253 
254  const_iterator lower_bound(const key_type& x) const
255  {
256  return std::lower_bound(begin(), end(), x, traits_type::lt);
257  }
258  iterator upper_bound(const key_type& x)
259  {
260  return std::upper_bound(begin(), end(), x, traits_type::lt);
261  }
262 
263  const_iterator upper_bound(const key_type& x) const
264  {
265  return std::upper_bound(begin(), end(), x, traits_type::lt);
266  }
267  std::pair<iterator, iterator> equal_range(const key_type& x)
268  {
269  return std::equal_range(begin(), end(), x, traits_type::lt);
270  }
271 
272  std::pair<const_iterator, const_iterator>
273  equal_range(const key_type& x) const
274  {
275  return std::equal_range(begin(), end(), x, traits_type::lt);
276  }
277 
278  // non standard set operations:
279  bool contains(const key_type& x) const { return count(x); }
280 
281  // compare
282  bool operator<(const String_Set& rhs) const { return d < rhs.d; }
283  bool operator<=(const String_Set& rhs) const { return d <= rhs.d; }
284  bool operator==(const String_Set& rhs) const { return d == rhs.d; }
285  bool operator!=(const String_Set& rhs) const { return d != rhs.d; }
286  bool operator>=(const String_Set& rhs) const { return d >= rhs.d; }
287  bool operator>(const String_Set& rhs) const { return d > rhs.d; }
288 };
289 
290 template <class CharT>
291 auto swap(String_Set<CharT>& a, String_Set<CharT>& b)
292 {
293  a.swap(b);
294 }
295 
297 
298 template <class CharT>
300  public:
301  using StrT = std::basic_string<CharT>;
302  using StrViewT = my_string_view<CharT>;
303  using Pair_StrT = std::pair<StrT, StrT>;
304  using Table_Pairs = std::vector<Pair_StrT>;
305 
306  private:
307  Table_Pairs table;
308  auto sort_uniq() -> void; // implemented in cxx
309  auto find_match(my_string_view<CharT> s) const;
310 
311  public:
312  Substr_Replacer() = default;
313  Substr_Replacer(const Table_Pairs& v) : table(v) { sort_uniq(); }
314  Substr_Replacer(const Table_Pairs&& v) : table(move(v)) { sort_uniq(); }
315 
316  auto& operator=(const Table_Pairs& v)
317  {
318  table = v;
319  sort_uniq();
320  return *this;
321  }
322  auto& operator=(const Table_Pairs&& v)
323  {
324  table = move(v);
325  sort_uniq();
326  return *this;
327  }
328 
329  template <class Range>
330  auto& operator=(const Range& range)
331  {
332  table.assign(std::begin(range), std::end(range));
333  sort_uniq();
334  return *this;
335  }
336 
337  auto replace(StrT& s) const -> StrT&; // implemented in cxx
338  auto replace_copy(StrT s) const -> StrT
339  {
340  replace(s);
341  return s;
342  }
343 };
344 template <class CharT>
346 {
347  auto first = begin(table);
348  auto last = end(table);
349  sort(first, last, [](auto& a, auto& b) { return a.first < b.first; });
350  auto it = unique(first, last,
351  [](auto& a, auto& b) { return a.first == b.first; });
352  table.erase(it, last);
353 
354  // remove empty key ""
355  if (!table.empty() && table.front().first.empty())
356  table.erase(begin(table));
357 }
358 
359 template <class CharT>
360 auto Substr_Replacer<CharT>::find_match(my_string_view<CharT> s) const
361 {
362  auto& t = table;
363  struct Comparer_Str_Rep {
364  auto static cmp_prefix_of(const StrT& p, StrViewT of)
365  {
366  return p.compare(0, p.npos, of.data(),
367  std::min(p.size(), of.size()));
368  }
369  auto operator()(const Pair_StrT& a, StrViewT b)
370  {
371  return cmp_prefix_of(a.first, b) < 0;
372  }
373  auto operator()(StrViewT a, const Pair_StrT& b)
374  {
375  return cmp_prefix_of(b.first, a) > 0;
376  }
377  auto static eq(const Pair_StrT& a, StrViewT b)
378  {
379  return cmp_prefix_of(a.first, b) == 0;
380  }
381  };
382  Comparer_Str_Rep csr;
383  auto it = begin(t);
384  auto last_match = end(t);
385  for (;;) {
386  auto it2 = upper_bound(it, end(t), s, csr);
387  if (it2 == it) {
388  // not found, s is smaller that the range
389  break;
390  }
391  --it2;
392  if (csr.eq(*it2, s)) {
393  // Match found. Try another search maybe for
394  // longer.
395  last_match = it2;
396  it = ++it2;
397  }
398  else {
399  // not found, s is greater that the range
400  break;
401  }
402  }
403  return last_match;
404 }
405 
406 template <class CharT>
407 auto Substr_Replacer<CharT>::replace(StrT& s) const -> StrT&
408 {
409 
410  if (table.empty())
411  return s;
412  for (size_t i = 0; i < s.size(); /*no increment here*/) {
413  auto substr = my_string_view<CharT>(&s[i], s.size() - i);
414  auto it = find_match(substr);
415  if (it != end(table)) {
416  auto& match = *it;
417  // match found. match.first is the found string,
418  // match.second is the replacement.
419  s.replace(i, match.first.size(), match.second);
420  i += match.second.size();
421  continue;
422  }
423  ++i;
424  }
425  return s;
426 }
427 
428 template <class CharT>
429 class Break_Table {
430  public:
431  using StrT = std::basic_string<CharT>;
432  using Table_Str = std::vector<StrT>;
433  using iterator = typename Table_Str::iterator;
434  using const_iterator = typename Table_Str::const_iterator;
435 
436  private:
437  Table_Str table;
438  size_t start_word_breaks_last_idx = 0;
439  size_t end_word_breaks_last_idx = 0;
440 
441  auto order_entries() -> void; // implemented in cxx
442 
443  public:
444  Break_Table() = default;
445  Break_Table(const Table_Str& v) : table(v) { order_entries(); }
446  Break_Table(Table_Str&& v) : table(move(v)) { order_entries(); }
447 
448  auto& operator=(const Table_Str& v)
449  {
450  table = v;
451  order_entries();
452  return *this;
453  }
454 
455  auto& operator=(Table_Str&& v)
456  {
457  table = move(v);
458  order_entries();
459  return *this;
460  }
461 
462  template <class Range>
463  auto& operator=(const Range& range)
464  {
465  table.assign(std::begin(range), std::end(range));
466  order_entries();
467  return *this;
468  }
469 
470  auto start_word_breaks() const -> boost::iterator_range<const_iterator>
471  {
472  return {begin(table),
473  begin(table) + start_word_breaks_last_idx};
474  }
475  auto end_word_breaks() const -> boost::iterator_range<const_iterator>
476  {
477  return {begin(table) + start_word_breaks_last_idx,
478  begin(table) + end_word_breaks_last_idx};
479  }
480  auto middle_word_breaks() const -> boost::iterator_range<const_iterator>
481  {
482  return {begin(table) + end_word_breaks_last_idx, end(table)};
483  }
484 };
485 template <class CharT>
487 {
488  auto it = remove_if(begin(table), end(table), [](auto& s) {
489  return s.empty() ||
490  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
491  });
492  table.erase(it, end(table));
493 
494  auto is_start_word_break = [=](auto& x) { return x[0] == '^'; };
495  auto is_end_word_break = [=](auto& x) { return x.back() == '$'; };
496  auto start_word_breaks_last =
497  partition(begin(table), end(table), is_start_word_break);
498  start_word_breaks_last_idx = start_word_breaks_last - begin(table);
499 
500  for_each(begin(table), start_word_breaks_last,
501  [](auto& e) { e.erase(0, 1); });
502 
503  auto end_word_breaks_last =
504  partition(start_word_breaks_last, end(table), is_end_word_break);
505  end_word_breaks_last_idx = end_word_breaks_last - begin(table);
506 
507  for_each(start_word_breaks_last, end_word_breaks_last,
508  [](auto& e) { e.pop_back(); });
509 }
510 
511 struct identity {
512  template <class T>
513  constexpr auto operator()(T&& t) const noexcept -> T&&
514  {
515  return std::forward<T>(t);
516  }
517 };
518 
519 template <class Value, class Key = Value, class KeyExtract = identity,
520  class Hash = std::hash<Key>, class KeyEqual = std::equal_to<>>
522  private:
523  using bucket_type = boost::container::small_vector<Value, 1>;
524  static constexpr float max_load_fact = 7.0 / 8.0;
525  std::vector<bucket_type> data;
526  size_t sz = 0;
527  size_t max_load_factor_capacity = 0;
528  KeyExtract key_extract = {};
529  Hash hash = {};
530  KeyEqual equal = {};
531 
532  public:
533  using key_type = Key;
534  using value_type = Value;
535  using size_type = std::size_t;
536  using difference_type = std::ptrdiff_t;
537  using hasher = Hash;
538  using reference = value_type&;
539  using const_reference = const value_type&;
540  using pointer = typename bucket_type::pointer;
541  using const_pointer = typename bucket_type::const_pointer;
542  using local_iterator = typename bucket_type::iterator;
543  using local_const_iterator = typename bucket_type::const_iterator;
544 
545  Hash_Multiset() : data(16) {}
546 
547  auto size() const { return sz; }
548  auto empty() const { return size() == 0; }
549 
550  auto rehash(size_t count)
551  {
552  if (empty()) {
553  size_t capacity = 16;
554  while (capacity <= count)
555  capacity <<= 1;
556  data.resize(capacity);
557  max_load_factor_capacity =
558  std::ceil(capacity * max_load_fact);
559  return;
560  }
561  if (count < size() / max_load_fact)
562  count = size() / max_load_fact;
563  auto n = Hash_Multiset();
564  n.rehash(count);
565  for (auto& b : data) {
566  for (auto& x : b) {
567  n.insert(x);
568  }
569  }
570  data.swap(n.data);
571  sz = n.sz;
572  max_load_factor_capacity = n.max_load_factor_capacity;
573  }
574 
575  auto reserve(size_t count) -> void
576  {
577  rehash(std::ceil(count / max_load_fact));
578  }
579 
580  auto insert(const_reference value)
581  {
582  using namespace std;
583  if (sz == max_load_factor_capacity) {
584  reserve(sz + 1);
585  }
586  auto&& key = key_extract(value);
587  auto h = hash(key);
588  auto h_mod = h & (data.size() - 1);
589  auto& bucket = data[h_mod];
590  if (bucket.size() == 0 || bucket.size() == 1 ||
591  equal(key, key_extract(bucket.back()))) {
592  bucket.push_back(value);
593  ++sz;
594  return end(bucket) - 1;
595  }
596  auto last =
597  std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
598  return equal(key, key_extract(x));
599  });
600  if (last != rend(bucket)) {
601  auto ret = bucket.insert(last.base(), value);
602  ++sz;
603  return ret;
604  }
605 
606  bucket.push_back(value);
607  ++sz;
608  return end(bucket) - 1;
609  }
610  template <class... Args>
611  auto emplace(Args&&... a)
612  {
613  return insert(value_type(std::forward<Args>(a)...));
614  }
615 
616  // Note, leaks non-const iterator. do not modify
617  // the key part of the returned value(s).
618  auto equal_range_nonconst_unsafe(const key_type& key)
619  -> std::pair<local_iterator, local_iterator>
620  {
621  using namespace std;
622  if (data.empty())
623  return {};
624  auto h = hash(key);
625  auto h_mod = h & (data.size() - 1);
626  auto& bucket = data[h_mod];
627  if (bucket.empty())
628  return {};
629  if (bucket.size() == 1) {
630  if (equal(key, key_extract(bucket.front())))
631  return {begin(bucket), end(bucket)};
632  return {};
633  }
634  auto first =
635  std::find_if(begin(bucket), end(bucket), [&](auto& x) {
636  return equal(key, key_extract(x));
637  });
638  if (first == end(bucket))
639  return {};
640  auto next = first + 1;
641  if (next == end(bucket) || !equal(key, key_extract(*next)))
642  return {first, next};
643  auto last =
644  std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
645  return equal(key, key_extract(x));
646  });
647  return {first, last.base()};
648  }
649 
650  auto equal_range(const key_type& key) const
651  -> std::pair<local_const_iterator, local_const_iterator>
652  {
653  using namespace std;
654  if (data.empty())
655  return {};
656  auto h = hash(key);
657  auto h_mod = h & (data.size() - 1);
658  auto& bucket = data[h_mod];
659  if (bucket.empty())
660  return {};
661  if (bucket.size() == 1) {
662  if (equal(key, key_extract(bucket.front())))
663  return {begin(bucket), end(bucket)};
664  return {};
665  }
666  auto first =
667  std::find_if(begin(bucket), end(bucket), [&](auto& x) {
668  return equal(key, key_extract(x));
669  });
670  if (first == end(bucket))
671  return {};
672  auto next = first + 1;
673  if (next == end(bucket) || !equal(key, key_extract(*next)))
674  return {first, next};
675  auto last =
676  std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
677  return equal(key, key_extract(x));
678  });
679  return {first, last.base()};
680  }
681 };
682 
689 template <class CharT>
690 class Condition {
691  public:
692  enum Span_Type {
693  NORMAL ,
694  DOT ,
695  ANY_OF ,
696  NONE_OF
697  };
698  using StrT = std::basic_string<CharT>;
699 
700  private:
701  StrT cond;
702  std::vector<std::tuple<size_t, size_t, Span_Type>>
703  spans; // pos, len, type
704  size_t length = 0;
705 
706  auto construct() -> void; // implemented in cxx
707 
708  public:
709  Condition() = default;
710  Condition(const StrT& condition) : cond(condition) { construct(); }
711  Condition(StrT&& condition) : cond(move(condition)) { construct(); }
712  auto match(const StrT& s, size_t pos = 0, size_t len = StrT::npos) const
713  -> bool; // implemented in cxx
714  auto match_prefix(const StrT& s) const { return match(s, 0, length); }
715  auto match_suffix(const StrT& s) const
716  {
717  if (length > s.size())
718  return false;
719  return match(s, s.size() - length, length);
720  }
721 };
722 template <class CharT>
723 auto Condition<CharT>::construct() -> void
724 {
725  size_t i = 0;
726  for (; i != cond.size();) {
727  size_t j = cond.find_first_of(NUSPELL_LITERAL(CharT, "[]."), i);
728  if (i != j) {
729  if (j == cond.npos) {
730  spans.emplace_back(i, cond.size() - i, NORMAL);
731  length += cond.size() - i;
732  break;
733  }
734  spans.emplace_back(i, j - i, NORMAL);
735  length += j - i;
736  i = j;
737  }
738  if (cond[i] == '.') {
739  spans.emplace_back(i, 1, DOT);
740  ++length;
741  ++i;
742  continue;
743  }
744  if (cond[i] == ']') {
745  auto what =
746  "closing bracket has no matching opening bracket";
747  throw std::invalid_argument(what);
748  }
749  if (cond[i] == '[') {
750  ++i;
751  if (i == cond.size()) {
752  auto what = "opening bracket has no matching "
753  "closing bracket";
754  throw std::invalid_argument(what);
755  }
756  Span_Type type;
757  if (cond[i] == '^') {
758  type = NONE_OF;
759  ++i;
760  }
761  else {
762  type = ANY_OF;
763  }
764  j = cond.find(']', i);
765  if (j == i) {
766  auto what = "empty bracket expression";
767  throw std::invalid_argument(what);
768  }
769  if (j == cond.npos) {
770  auto what = "opening bracket has no matching "
771  "closing bracket";
772  throw std::invalid_argument(what);
773  }
774  spans.emplace_back(i, j - i, type);
775  ++length;
776  i = j + 1;
777  }
778  }
779 }
780 
789 template <class CharT>
790 auto Condition<CharT>::match(const StrT& s, size_t pos, size_t len) const
791  -> bool
792 {
793  if (pos > s.size()) {
794  throw std::out_of_range(
795  "position on the string is out of bounds");
796  }
797  if (s.size() - pos < len)
798  len = s.size() - pos;
799  if (len != length)
800  return false;
801 
802  size_t i = pos;
803  for (auto& x : spans) {
804  auto x_pos = std::get<0>(x);
805  auto x_len = std::get<1>(x);
806  auto x_type = std::get<2>(x);
807 
808  using tr = typename StrT::traits_type;
809  switch (x_type) {
810  case NORMAL:
811  if (tr::compare(&s[i], &cond[x_pos], x_len) == 0)
812  i += x_len;
813  else
814  return false;
815  break;
816  case DOT:
817  ++i;
818  break;
819  case ANY_OF:
820  if (tr::find(&cond[x_pos], x_len, s[i]))
821  ++i;
822  else
823  return false;
824  break;
825  case NONE_OF:
826  if (tr::find(&cond[x_pos], x_len, s[i]))
827  return false;
828  else
829  ++i;
830  break;
831  }
832  }
833  return true;
834 }
835 
836 template <class CharT>
837 class Prefix {
838  public:
839  using StrT = std::basic_string<CharT>;
840  using CondT = Condition<CharT>;
841  using value_type = CharT;
842 
843  char16_t flag = 0;
844  bool cross_product = false;
845  StrT stripping;
846  StrT appending;
847  Flag_Set cont_flags;
848  CondT condition;
849 
850  Prefix() = default;
851  Prefix(char16_t flag, bool cross_product, const StrT& strip,
852  const StrT& append, const std::u16string& cont_flags,
853  const StrT& condition)
854  : flag(flag), cross_product(cross_product), stripping(strip),
855  appending(append), cont_flags(cont_flags), condition(condition)
856  {
857  }
858 
859  auto to_root(StrT& word) const -> StrT&
860  {
861  return word.replace(0, appending.size(), stripping);
862  }
863  auto to_root_copy(StrT word) const -> StrT
864  {
865  to_root(word);
866  return word;
867  }
868 
869  auto to_derived(StrT& word) const -> StrT&
870  {
871  return word.replace(0, stripping.size(), appending);
872  }
873  auto to_derived_copy(StrT word) const -> StrT
874  {
875  to_derived(word);
876  return word;
877  }
878 
879  auto check_condition(const StrT& word) const -> bool
880  {
881  return condition.match_prefix(word);
882  }
883 };
884 
885 template <class CharT>
886 class Suffix {
887  public:
888  using StrT = std::basic_string<CharT>;
889  using CondT = Condition<CharT>;
890  using value_type = CharT;
891 
892  char16_t flag = 0;
893  bool cross_product = false;
894  StrT stripping;
895  StrT appending;
896  Flag_Set cont_flags;
897  CondT condition;
898 
899  Suffix() = default;
900  Suffix(char16_t flag, bool cross_product, const StrT& strip,
901  const StrT& append, const std::u16string& cont_flags,
902  const StrT& condition)
903  : flag(flag), cross_product(cross_product), stripping(strip),
904  appending(append), cont_flags(cont_flags), condition(condition)
905  {
906  }
907 
908  auto to_root(StrT& word) const -> StrT&
909  {
910  return word.replace(word.size() - appending.size(),
911  appending.size(), stripping);
912  }
913  auto to_root_copy(StrT word) const -> StrT
914  {
915  to_root(word);
916  return word;
917  }
918 
919  auto to_derived(StrT& word) const -> StrT&
920  {
921  return word.replace(word.size() - stripping.size(),
922  stripping.size(), appending);
923  }
924  auto to_derived_copy(StrT word) const -> StrT
925  {
926  to_derived(word);
927  return word;
928  }
929 
930  auto check_condition(const StrT& word) const -> bool
931  {
932  return condition.match_suffix(word);
933  }
934 };
935 
936 using boost::multi_index::member;
937 
938 template <class CharT, class AffixT>
939 using Affix_Table_Base =
941  member<AffixT, std::basic_string<CharT>, &AffixT::appending>>;
942 
943 template <class CharT, class AffixT>
944 class Affix_Table : private Affix_Table_Base<CharT, AffixT> {
945  private:
946  Flag_Set all_cont_flags;
947 
948  public:
950  using iterator = typename base::local_const_iterator;
951  template <class... Args>
952  auto emplace(Args&&... a)
953  {
954  auto it = base::emplace(std::forward<Args>(a)...);
955  all_cont_flags += it->cont_flags;
956  return it;
957  }
958  auto equal_range(my_string_view<CharT> appending) const
959  {
960  return base::equal_range(appending);
961  }
962  auto has_continuation_flags() const
963  {
964  return all_cont_flags.size() != 0;
965  }
966  auto has_continuation_flag(char16_t flag) const
967  {
968  return all_cont_flags.contains(flag);
969  }
970 };
971 
972 template <class CharT>
974 
975 template <class CharT>
977 
978 template <class CharT>
979 class String_Pair {
980  using StrT = std::basic_string<CharT>;
981  size_t i = 0;
982  StrT s;
983 
984  public:
985  String_Pair() = default;
986  template <class Str1>
997  String_Pair(Str1&& str, size_t i) : i(i), s(std::forward<Str1>(str))
998  {
999  if (i > s.size()) {
1000  throw std::out_of_range("word split is too long");
1001  }
1002  }
1003 
1004  template <
1005  class Str1, class Str2,
1006  class = std::enable_if_t<
1007  std::is_same<std::remove_reference_t<Str1>, StrT>::value &&
1008  std::is_same<std::remove_reference_t<Str2>, StrT>::value>>
1009  String_Pair(Str1&& first, Str2&& second)
1010  : i(first.size()) /* must be init before s, before we move
1011  from first */
1012  ,
1013  s(std::forward<Str1>(first) + std::forward<Str2>(second))
1014 
1015  {
1016  }
1017  auto first() const { return my_string_view<CharT>(s).substr(0, i); }
1018  auto second() const { return my_string_view<CharT>(s).substr(i); }
1019  auto first(my_string_view<CharT> x)
1020  {
1021  s.replace(0, i, x.data(), x.size());
1022  i = x.size();
1023  }
1024  auto second(my_string_view<CharT> x)
1025  {
1026  s.replace(i, s.npos, x.data(), x.size());
1027  }
1028  auto& str() const { return s; }
1029  auto idx() const { return i; }
1030 };
1031 template <class CharT>
1033  using StrT = std::basic_string<CharT>;
1034 
1035  String_Pair<CharT> begin_end_chars;
1036  StrT replacement;
1037  char16_t first_word_flag = 0;
1038  char16_t second_word_flag = 0;
1039  bool match_first_only_unaffixed_or_zero_affixed = false;
1040 };
1041 
1043  std::vector<std::u16string> rules;
1044  Flag_Set all_flags;
1045 
1046  auto fill_all_flags() -> void;
1047 
1048  public:
1049  Compound_Rule_Table() = default;
1050  Compound_Rule_Table(const std::vector<std::u16string>& tbl) : rules(tbl)
1051  {
1052  fill_all_flags();
1053  }
1054  Compound_Rule_Table(std::vector<std::u16string>&& tbl)
1055  : rules(move(tbl))
1056  {
1057  fill_all_flags();
1058  }
1059  auto operator=(const std::vector<std::u16string>& tbl)
1060  {
1061  rules = tbl;
1062  fill_all_flags();
1063  return *this;
1064  }
1065  auto operator=(std::vector<std::u16string>&& tbl)
1066  {
1067  rules = move(tbl);
1068  fill_all_flags();
1069  return *this;
1070  }
1071  auto empty() const { return rules.empty(); }
1072  auto has_any_of_flags(const Flag_Set& f) const -> bool;
1073  auto match_any_rule(const std::vector<const Flag_Set*> data) const
1074  -> bool;
1075 };
1076 auto inline Compound_Rule_Table::fill_all_flags() -> void
1077 {
1078  for (auto& f : rules) {
1079  all_flags += f;
1080  }
1081  all_flags.erase(u'?');
1082  all_flags.erase(u'*');
1083 }
1084 
1085 auto inline Compound_Rule_Table::has_any_of_flags(const Flag_Set& f) const
1086  -> bool
1087 {
1088  using std::begin;
1089  using std::end;
1090  struct Out_Iter_One_Bool {
1091  bool* value = nullptr;
1092  auto& operator++() { return *this; }
1093  auto& operator++(int) { return *this; }
1094  auto& operator*() { return *this; }
1095  auto& operator=(char16_t)
1096  {
1097  *value = true;
1098  return *this;
1099  }
1100  };
1101  auto has_intersection = false;
1102  auto out_it = Out_Iter_One_Bool{&has_intersection};
1103  std::set_intersection(begin(all_flags), end(all_flags), begin(f),
1104  end(f), out_it);
1105  return has_intersection;
1106 }
1107 
1108 auto inline match_compund_rule(const std::vector<const Flag_Set*>& words_data,
1109  const std::u16string& pattern)
1110 {
1111  return match_simple_regex(
1112  words_data, pattern,
1113  [](const Flag_Set* d, char16_t p) { return d->contains(p); });
1114 }
1115 
1116 auto inline Compound_Rule_Table::match_any_rule(
1117  const std::vector<const Flag_Set*> data) const -> bool
1118 {
1119  return any_of(begin(rules), end(rules), [&](const std::u16string& p) {
1120  return match_compund_rule(data, p);
1121  });
1122 }
1123 
1127 template <class CharT>
1129  using VecT = std::vector<std::basic_string<CharT>>;
1130  VecT d;
1131  size_t sz = 0;
1132 
1133  public:
1134  using value_type = typename VecT::value_type;
1135  using allocator_type = typename VecT::allocator_type;
1136  using size_type = typename VecT::size_type;
1137  using difference_type = typename VecT::difference_type;
1138  using reference = typename VecT::reference;
1139  using const_reference = typename VecT::const_reference;
1140  using pointer = typename VecT::pointer;
1141  using const_pointer = typename VecT::const_pointer;
1142  using iterator = typename VecT::iterator;
1143  using const_iterator = typename VecT::const_iterator;
1144  using reverse_iterator = typename VecT::reverse_iterator;
1145  using const_reverse_iterator = typename VecT::const_reverse_iterator;
1146 
1147  List_Basic_Strings() = default;
1148  explicit List_Basic_Strings(size_type n) : d(n), sz(n) {}
1149  List_Basic_Strings(size_type n, const_reference value)
1150  : d(n, value), sz(n)
1151  {
1152  }
1153  template <class InputIterator>
1154  List_Basic_Strings(InputIterator first, InputIterator last)
1155  : d(first, last), sz(d.size())
1156  {
1157  }
1158  List_Basic_Strings(std::initializer_list<value_type> il)
1159  : d(il), sz(d.size())
1160  {
1161  }
1162 
1163  List_Basic_Strings(const List_Basic_Strings& other) = default;
1165  : d(move(other.d)), sz(other.sz)
1166  {
1167  other.sz = 0;
1168  }
1169 
1170  List_Basic_Strings(VecT&& other) : d(other), sz(other.size()) {}
1171 
1172  auto& operator=(const List_Basic_Strings& other)
1173  {
1174  clear();
1175  insert(begin(), other.begin(), other.end());
1176  return *this;
1177  }
1178  auto& operator=(List_Basic_Strings&& other)
1179  {
1180  d = move(other.d);
1181  sz = other.sz;
1182  other.sz = 0;
1183  return *this;
1184  }
1185  auto& operator=(std::initializer_list<value_type> il)
1186  {
1187  clear();
1188  insert(begin(), il);
1189  return *this;
1190  }
1191  template <class InputIterator>
1192  auto assign(InputIterator first, InputIterator last)
1193  {
1194  clear();
1195  insert(begin(), first, last);
1196  }
1197  void assign(size_type n, const_reference value)
1198  {
1199  clear();
1200  insert(begin(), n, value);
1201  }
1202  void assign(std::initializer_list<value_type> il) { *this = il; }
1203  auto get_allocator() const noexcept { return d.get_allocator(); }
1204 
1205  // iterators
1206  auto begin() noexcept -> iterator { return d.begin(); }
1207  auto begin() const noexcept -> const_iterator { return d.begin(); }
1208  auto end() noexcept -> iterator { return begin() + sz; }
1209  auto end() const noexcept -> const_iterator { return begin() + sz; }
1210 
1211  auto rbegin() noexcept { return d.rend() - sz; }
1212  auto rbegin() const noexcept { return d.rend() - sz; }
1213  auto rend() noexcept { return d.rend(); }
1214  auto rend() const noexcept { return d.rend(); }
1215 
1216  auto cbegin() const noexcept { return d.cbegin(); }
1217  auto cend() const noexcept { return cbegin() + sz; }
1218 
1219  auto crbegin() const noexcept { return d.crend() - sz; }
1220  auto crend() const noexcept { return d.crend(); }
1221 
1222  // [vector.capacity], capacity
1223  auto empty() const noexcept { return sz == 0; }
1224  auto size() const noexcept { return sz; }
1225  auto max_size() const noexcept { return d.max_size(); }
1226  auto capacity() const noexcept { return d.size(); }
1227  auto resize(size_type new_sz)
1228  {
1229  if (new_sz <= sz) {
1230  }
1231  else if (new_sz <= d.size()) {
1232  std::for_each(begin() + sz, begin() + new_sz,
1233  [](auto& s) { s.clear(); });
1234  }
1235  else {
1236  std::for_each(d.begin() + sz, d.end(),
1237  [](auto& s) { s.clear(); });
1238  d.resize(new_sz);
1239  }
1240  sz = new_sz;
1241  }
1242  auto resize(size_type new_sz, const_reference c)
1243  {
1244  if (new_sz <= sz) {
1245  }
1246  else if (new_sz <= d.size()) {
1247  std::for_each(begin() + sz, begin() + new_sz,
1248  [&](auto& s) { s = c; });
1249  }
1250  else {
1251  std::for_each(d.begin() + sz, d.end(),
1252  [&](auto& s) { s = c; });
1253  d.resize(new_sz, c);
1254  }
1255  sz = new_sz;
1256  }
1257  void reserve(size_type n)
1258  {
1259  if (n > d.size())
1260  d.resize(n);
1261  }
1262  void shrink_to_fit()
1263  {
1264  d.resize(sz);
1265  d.shrink_to_fit();
1266  for (auto& s : d) {
1267  s.shrink_to_fit();
1268  }
1269  }
1270 
1271  // element access
1272  auto& operator[](size_type n) { return d[n]; }
1273  auto& operator[](size_type n) const { return d[n]; }
1274  auto& at(size_type n)
1275  {
1276  if (n < sz)
1277  return d[n];
1278  else
1279  throw std::out_of_range("index is out of range");
1280  }
1281  auto& at(size_type n) const
1282  {
1283  if (n < sz)
1284  return d[n];
1285  else
1286  throw std::out_of_range("index is out of range");
1287  }
1288  auto& front() { return d.front(); }
1289  auto& front() const { return d.front(); }
1290  auto& back() { return d[sz - 1]; }
1291  auto& back() const { return d[sz - 1]; }
1292 
1293  // [vector.data], data access
1294  auto data() noexcept { return d.data(); }
1295  auto data() const noexcept { return d.data(); }
1296 
1297  // [vector.modifiers], modifiers
1298  template <class... Args>
1299  auto& emplace_back(Args&&... args)
1300  {
1301  if (sz != d.size())
1302  d[sz] = value_type(std::forward<Args>(args)...);
1303  else
1304  d.emplace_back(std::forward<Args>(args)...);
1305  return d[sz++];
1306  }
1307  auto& emplace_back()
1308  {
1309  if (sz != d.size())
1310  d[sz].clear();
1311  else
1312  d.emplace_back();
1313  return d[sz++];
1314  }
1315  auto push_back(const_reference x)
1316  {
1317  if (sz != d.size())
1318  d[sz] = x;
1319  else
1320  d.push_back(x);
1321  ++sz;
1322  }
1323  auto push_back(value_type&& x)
1324  {
1325  if (sz != d.size())
1326  d[sz] = std::move(x);
1327  else
1328  d.push_back(std::move(x));
1329  ++sz;
1330  }
1331  auto pop_back() { --sz; }
1332 
1333  private:
1334  template <class U>
1335  auto insert_priv(const_iterator pos, U&& val)
1336  {
1337  if (sz != d.size()) {
1338  d[sz] = std::forward<U>(val);
1339  }
1340  else {
1341  auto pos_idx = pos - cbegin();
1342  d.push_back(std::forward<U>(val));
1343  pos = cbegin() + pos_idx;
1344  }
1345  auto p = begin() + (pos - cbegin());
1346  std::rotate(p, begin() + sz, begin() + sz + 1);
1347  ++sz;
1348  return p;
1349  }
1350 
1351  public:
1352  template <class... Args>
1353  auto emplace(const_iterator pos, Args&&... args)
1354  {
1355  if (sz != d.size()) {
1356  d[sz] = value_type(std::forward<Args>(args)...);
1357  }
1358  else {
1359  auto pos_idx = pos - cbegin();
1360  d.emplace(std::forward<Args>(args)...);
1361  pos = cbegin() + pos_idx;
1362  }
1363  auto p = begin() + (pos - cbegin());
1364  std::rotate(p, begin() + sz, begin() + sz + 1);
1365  ++sz;
1366  return p;
1367  }
1368  auto insert(const_iterator pos, const_reference x)
1369  {
1370  return insert_priv(pos, x);
1371  }
1372  auto insert(const_iterator pos, value_type&& x)
1373  {
1374  return insert_priv(pos, std::move(x));
1375  }
1376  auto insert(const_iterator pos, size_type n, const_reference x)
1377  -> iterator
1378  {
1379  auto i = sz;
1380  while (n != 0 && i != d.size()) {
1381  d[i] = x;
1382  --n;
1383  ++i;
1384  }
1385  if (n != 0) {
1386  auto pos_idx = pos - cbegin();
1387  d.insert(d.end(), n, x);
1388  pos = cbegin() + pos_idx;
1389  i = d.size();
1390  }
1391  auto p = begin() + (pos - cbegin());
1392  std::rotate(p, begin() + sz, begin() + i);
1393  sz = i;
1394  return p;
1395  }
1396 
1397  template <class InputIterator>
1398  auto insert(const_iterator pos, InputIterator first, InputIterator last)
1399  -> iterator
1400  {
1401  auto i = sz;
1402  while (first != last && i != d.size()) {
1403  d[i] = *first;
1404  ++first;
1405  ++i;
1406  }
1407  if (first != last) {
1408  auto pos_idx = pos - cbegin();
1409  d.insert(d.end(), first, last);
1410  pos = cbegin() + pos_idx;
1411  i = d.size();
1412  }
1413  auto p = begin() + (pos - cbegin());
1414  std::rotate(p, begin() + sz, begin() + i);
1415  sz = i;
1416  return p;
1417  }
1418  auto insert(const_iterator pos, std::initializer_list<value_type> il)
1419  -> iterator
1420  {
1421  return insert(pos, il.begin(), il.end());
1422  }
1423 
1424  auto erase(const_iterator position)
1425  {
1426  auto i0 = begin();
1427  auto i1 = i0 + (position - cbegin());
1428  auto i2 = i1 + 1;
1429  auto i3 = end();
1430  std::rotate(i1, i2, i3);
1431  --sz;
1432  return i1;
1433  }
1434  auto erase(const_iterator first, const_iterator last)
1435  {
1436  auto i0 = begin();
1437  auto i1 = i0 + (first - cbegin());
1438  auto i2 = i0 + (last - cbegin());
1439  auto i3 = end();
1440  std::rotate(i1, i2, i3);
1441  sz -= last - first;
1442  return i1;
1443  }
1444  auto swap(List_Basic_Strings& other)
1445  {
1446  d.swap(other.d);
1447  std::swap(sz, other.sz);
1448  }
1449  auto clear() noexcept -> void { sz = 0; }
1450 
1451  auto operator==(const List_Basic_Strings& other) const
1452  {
1453  return std::equal(begin(), end(), other.begin(), other.end());
1454  }
1455  auto operator!=(const List_Basic_Strings& other) const
1456  {
1457  return !(*this == other);
1458  }
1459  auto operator<(const List_Basic_Strings& other) const
1460  {
1461  return std::lexicographical_compare(begin(), end(),
1462  other.begin(), other.end());
1463  }
1464  auto operator>=(const List_Basic_Strings& other) const
1465  {
1466  return !(*this < other);
1467  }
1468  auto operator>(const List_Basic_Strings& other) const
1469  {
1470  return std::lexicographical_compare(other.begin(), other.end(),
1471  begin(), end());
1472  }
1473  auto operator<=(const List_Basic_Strings& other) const
1474  {
1475  return !(*this > other);
1476  }
1477 
1478  auto extract_sequence() -> VecT
1479  {
1480  d.resize(sz);
1481  sz = 0;
1482  return std::move(d);
1483  }
1484 };
1485 
1486 template <class CharT>
1488 {
1489  a.swap(b);
1490 }
1491 
1494 
1495 template <class CharT>
1497  public:
1498  using StrT = std::basic_string<CharT>;
1499  using Table_Str = std::vector<std::pair<StrT, StrT>>;
1500  using iterator = typename Table_Str::iterator;
1501  using const_iterator = typename Table_Str::const_iterator;
1502 
1503  private:
1504  Table_Str table;
1505  size_t whole_word_reps_last_idx = 0;
1506  size_t start_word_reps_last_idx = 0;
1507  size_t end_word_reps_last_idx = 0;
1508 
1509  auto order_entries() -> void; // implemented in cxx
1510 
1511  public:
1512  Replacement_Table() = default;
1513  Replacement_Table(const Table_Str& v) : table(v) { order_entries(); }
1514  Replacement_Table(Table_Str&& v) : table(move(v)) { order_entries(); }
1515 
1516  auto& operator=(const Table_Str& v)
1517  {
1518  table = v;
1519  order_entries();
1520  return *this;
1521  }
1522 
1523  auto& operator=(Table_Str&& v)
1524  {
1525  table = move(v);
1526  order_entries();
1527  return *this;
1528  }
1529 
1530  template <class Range>
1531  auto& operator=(const Range& range)
1532  {
1533  table.assign(std::begin(range), std::end(range));
1534  order_entries();
1535  return *this;
1536  }
1537 
1538  auto whole_word_replacements() const
1539  -> boost::iterator_range<const_iterator>
1540  {
1541  return {begin(table), begin(table) + whole_word_reps_last_idx};
1542  }
1543  auto start_word_replacements() const
1544  -> boost::iterator_range<const_iterator>
1545  {
1546  return {begin(table) + whole_word_reps_last_idx,
1547  begin(table) + start_word_reps_last_idx};
1548  }
1549  auto end_word_replacements() const
1550  -> boost::iterator_range<const_iterator>
1551  {
1552  return {begin(table) + start_word_reps_last_idx,
1553  begin(table) + end_word_reps_last_idx};
1554  }
1555  auto any_place_replacements() const
1556  -> boost::iterator_range<const_iterator>
1557  {
1558  return {begin(table) + end_word_reps_last_idx, end(table)};
1559  }
1560 };
1561 template <class CharT>
1563 {
1564  auto it = remove_if(begin(table), end(table), [](auto& p) {
1565  auto& s = p.first;
1566  return s.empty() ||
1567  (s.size() == 1 && (s[0] == '^' || s[0] == '$'));
1568  });
1569  table.erase(it, end(table));
1570 
1571  auto is_start_word_pat = [=](auto& x) { return x.first[0] == '^'; };
1572  auto is_end_word_pat = [=](auto& x) { return x.first.back() == '$'; };
1573 
1574  auto start_word_reps_last =
1575  partition(begin(table), end(table), is_start_word_pat);
1576  start_word_reps_last_idx = start_word_reps_last - begin(table);
1577  for_each(begin(table), start_word_reps_last,
1578  [](auto& e) { e.first.erase(0, 1); });
1579 
1580  auto whole_word_reps_last =
1581  partition(begin(table), start_word_reps_last, is_end_word_pat);
1582  whole_word_reps_last_idx = whole_word_reps_last - begin(table);
1583  for_each(begin(table), whole_word_reps_last,
1584  [](auto& e) { e.first.pop_back(); });
1585 
1586  auto end_word_reps_last =
1587  partition(start_word_reps_last, end(table), is_end_word_pat);
1588  end_word_reps_last_idx = end_word_reps_last - begin(table);
1589  for_each(start_word_reps_last, end_word_reps_last,
1590  [](auto& e) { e.first.pop_back(); });
1591 }
1592 
1593 template <class CharT>
1595  using StrT = std::basic_string<CharT>;
1596  StrT chars;
1597  std::vector<StrT> strings;
1598 
1599  auto parse(const StrT& s) -> void;
1600  Similarity_Group() = default;
1601  Similarity_Group(const StrT& s) { parse(s); }
1602  auto& operator=(const StrT& s)
1603  {
1604  parse(s);
1605  return *this;
1606  }
1607 };
1608 template <class CharT>
1609 auto Similarity_Group<CharT>::parse(const StrT& s) -> void
1610 {
1611  auto i = size_t(0);
1612  for (;;) {
1613  auto j = s.find('(', i);
1614  chars.append(s, i, j - i);
1615  if (j == s.npos)
1616  break;
1617  i = j + 1;
1618  j = s.find(')', i);
1619  if (j == s.npos)
1620  break;
1621  auto len = j - i;
1622  if (len == 1)
1623  chars += s[i];
1624  else if (len > 1)
1625  strings.push_back(s.substr(i, len));
1626  i = j + 1;
1627  }
1628 }
1629 
1630 template <class CharT>
1632  using StrT = std::basic_string<CharT>;
1633  using Pair_StrT = std::pair<StrT, StrT>;
1634 
1635  struct Phonet_Match_Result {
1636  size_t count_matched = 0;
1637  size_t go_back_before_replace = 0;
1638  size_t priority = 5;
1639  bool go_back_after_replace = false;
1640  bool treat_next_as_begin = false;
1641  operator bool() { return count_matched; }
1642  };
1643 
1644  std::vector<std::pair<StrT, StrT>> table;
1645  auto order() -> void;
1646  auto static match(const StrT& data, size_t i, const StrT& pattern,
1647  bool at_begin) -> Phonet_Match_Result;
1648 
1649  public:
1650  Phonetic_Table() = default;
1651  Phonetic_Table(const std::vector<Pair_StrT>& v) : table(v) { order(); }
1652  Phonetic_Table(std::vector<Pair_StrT>&& v) : table(move(v)) { order(); }
1653  auto& operator=(const std::vector<Pair_StrT>& v)
1654  {
1655  table = v;
1656  order();
1657  return *this;
1658  }
1659  auto& operator=(std::vector<Pair_StrT>&& v)
1660  {
1661  table = move(v);
1662  order();
1663  return *this;
1664  }
1665  template <class Range>
1666  auto& operator=(const Range& range)
1667  {
1668  table.assign(std::begin(range), std::end(range));
1669  order();
1670  return *this;
1671  }
1672  auto replace(StrT& word) const -> bool;
1673 };
1674 
1675 template <class CharT>
1676 auto Phonetic_Table<CharT>::order() -> void
1677 {
1678  stable_sort(begin(table), end(table), [](auto& pair1, auto& pair2) {
1679  if (pair2.first.empty())
1680  return false;
1681  if (pair1.first.empty())
1682  return true;
1683  return pair1.first[0] < pair2.first[0];
1684  });
1685  auto it = find_if_not(begin(table), end(table),
1686  [](auto& p) { return p.first.empty(); });
1687  table.erase(begin(table), it);
1688  for (auto& r : table) {
1689  if (r.second == NUSPELL_LITERAL(CharT, "_"))
1690  r.second.clear();
1691  }
1692 }
1693 
1694 template <class CharT>
1695 auto Phonetic_Table<CharT>::match(const StrT& data, size_t i,
1696  const StrT& pattern, bool at_begin)
1697  -> Phonet_Match_Result
1698 {
1699  auto ret = Phonet_Match_Result();
1700  auto j =
1701  pattern.find_first_of(NUSPELL_LITERAL(CharT, "(<-0123456789^$"));
1702  if (j == pattern.npos)
1703  j = pattern.size();
1704  if (data.compare(i, j, pattern, 0, j) == 0)
1705  ret.count_matched = j;
1706  else
1707  return {};
1708  if (j == pattern.size())
1709  return ret;
1710  if (pattern[j] == '(') {
1711  auto k = pattern.find(')', j);
1712  if (k == pattern.npos)
1713  return {}; // bad rule
1714  auto x = std::char_traits<CharT>::find(
1715  &pattern[j + 1], k - (j + 1), data[i + j]);
1716  if (!x)
1717  return {};
1718  j = k + 1;
1719  ret.count_matched += 1;
1720  }
1721  if (j == pattern.size())
1722  return ret;
1723  if (pattern[j] == '<') {
1724  ret.go_back_after_replace = true;
1725  ++j;
1726  }
1727  auto k = pattern.find_first_not_of('-', j);
1728  if (k == pattern.npos) {
1729  k = pattern.size();
1730  ret.go_back_before_replace = k - j;
1731  if (ret.go_back_before_replace >= ret.count_matched)
1732  return {}; // bad rule
1733  return ret;
1734  }
1735  else {
1736  ret.go_back_before_replace = k - j;
1737  if (ret.go_back_before_replace >= ret.count_matched)
1738  return {}; // bad rule
1739  }
1740  j = k;
1741  if (pattern[j] >= '0' && pattern[j] <= '9') {
1742  ret.priority = pattern[j] - '0';
1743  ++j;
1744  }
1745  if (j == pattern.size())
1746  return ret;
1747  if (pattern[j] == '^') {
1748  if (!at_begin)
1749  return {};
1750  ++j;
1751  }
1752  if (j == pattern.size())
1753  return ret;
1754  if (pattern[j] == '^') {
1755  ret.treat_next_as_begin = true;
1756  ++j;
1757  }
1758  if (j == pattern.size())
1759  return ret;
1760  if (pattern[j] != '$')
1761  return {}; // bad rule, no other char is allowed at this point
1762  if (i + ret.count_matched == data.size())
1763  return ret;
1764  return {};
1765 }
1766 
1767 template <class CharT>
1768 auto Phonetic_Table<CharT>::replace(StrT& word) const -> bool
1769 {
1770  using boost::make_iterator_range;
1771  struct Cmp {
1772  auto operator()(CharT c, const Pair_StrT& s)
1773  {
1774  return c < s.first[0];
1775  }
1776  auto operator()(const Pair_StrT& s, CharT c)
1777  {
1778  return s.first[0] < c;
1779  }
1780  };
1781  if (table.empty())
1782  return false;
1783  auto ret = false;
1784  auto treat_next_as_begin = true;
1785  size_t count_go_backs_after_replace = 0; // avoid infinite loop
1786  for (size_t i = 0; i != word.size(); ++i) {
1787  auto rules =
1788  equal_range(begin(table), end(table), word[i], Cmp());
1789  for (auto& r : make_iterator_range(rules)) {
1790  auto rule = &r;
1791  auto m1 = match(word, i, r.first, treat_next_as_begin);
1792  if (!m1)
1793  continue;
1794  if (!m1.go_back_before_replace) {
1795  auto j = i + m1.count_matched - 1;
1796  auto rules2 = equal_range(
1797  begin(table), end(table), word[j], Cmp());
1798  for (auto& r2 : make_iterator_range(rules2)) {
1799  auto m2 =
1800  match(word, j, r2.first, false);
1801  if (m2 && m2.priority >= m1.priority) {
1802  i = j;
1803  rule = &r2;
1804  m1 = m2;
1805  break;
1806  }
1807  }
1808  }
1809  word.replace(
1810  i, m1.count_matched - m1.go_back_before_replace,
1811  rule->second);
1812  treat_next_as_begin = m1.treat_next_as_begin;
1813  if (m1.go_back_after_replace &&
1814  count_go_backs_after_replace < 100) {
1815  count_go_backs_after_replace++;
1816  }
1817  else {
1818  i += rule->second.size();
1819  }
1820  --i;
1821  ret = true;
1822  break;
1823  }
1824  }
1825  return ret;
1826 }
1827 } // namespace nuspell
1828 #endif // NUSPELL_STRUCTURES_HXX
Definition: structures.hxx:299
Definition: structures.hxx:1042
Definition: structures.hxx:1631
Definition: structures.hxx:511
auto match(const StrT &s, size_t pos=0, size_t len=StrT::npos) const -> bool
Checks if provided string matched the condition.
Definition: structures.hxx:790
Definition: structures.hxx:1496
String algorithms, private header.
Vector of strings that recycles erased strings.
Definition: structures.hxx:1128
Definition: structures.hxx:521
Definition: main.cxx:516
Definition: structures.hxx:837
String_Pair(Str1 &&str, size_t i)
Construct string pair.
Definition: structures.hxx:997
Library main namespace.
Definition: aff_data.cxx:78
Definition: structures.hxx:429
Definition: structures.hxx:1032
A Set class backed by a string.
Definition: structures.hxx:50
Span_Type
Definition: structures.hxx:692
Definition: structures.hxx:1594
Definition: structures.hxx:886
Definition: structures.hxx:944
Definition: structures.hxx:979
Limited regular expression matching used in affix entries.
Definition: structures.hxx:690