diff options
| | | |
|---|---|---|
| author | Carl Hetherington <cth@carlh.net> | 2021-02-03 22:02:18 +0100 |
| committer | Carl Hetherington <cth@carlh.net> | 2021-02-03 22:02:18 +0100 |
| commit | b81f1396528f01bead8c1faf128a277820980025 (patch) | |
| tree | c61a2ef4936ecc3b2c83430a3a334884488a79fe /src/lib | |
| parent | 369821c41e62d4cce506cd4206f9db0d91b4f643 (diff) | |
Use icu to remove diacritics from strings.
This replaces some ad-hoc code and extends it to work with more
characters (#1904).
Diffstat (limited to 'src/lib')
| -rw-r--r-- | src/lib/util.cc | 29 |
1 file changed, 12 insertions, 17 deletions
```diff
diff --git a/src/lib/util.cc b/src/lib/util.cc
index 2a14f00c7..2c0167829 100644
--- a/src/lib/util.cc
+++ b/src/lib/util.cc
@@ -62,6 +62,9 @@ extern "C" {
 #include <curl/curl.h>
 #include <glib.h>
 #include <pangomm/init.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
 #include <boost/algorithm/string.hpp>
 #include <boost/range/algorithm/replace_if.hpp>
 #include <boost/thread.hpp>
@@ -771,28 +774,20 @@ careful_string_filter (string s)
 	   Safety first and all that.
 	*/
 
-	wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
+	/* First transliterate using libicu to try to remove accents in a "nice" way */
+	auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+	auto status = U_ZERO_ERROR;
+	auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
+	transliterator->transliterate(icu_utf16);
+	s.clear ();
+	icu_utf16.toUTF8String(s);
 
+	/* Then remove anything that's not in a very limited character set */
+	wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
 	string out;
 	string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
 	for (size_t i = 0; i < ws.size(); ++i) {
-
 		wchar_t c = ws[i];
-
-		/* Remove some accents */
-		if (wstring(L"áàâ").find(c) != string::npos) {
-			c = 'a';
-		}
-		if (wstring(L"éèêë").find(c) != string::npos) {
-			c = 'e';
-		}
-		if (wstring(L"ö").find(c) != string::npos) {
-			c = 'o';
-		}
-		if (wstring(L"ü").find(c) != string::npos) {
-			c = 'u';
-		}
-
 		if (allowed.find(c) != string::npos) {
 			out += c;
 		}
```
