From b81f1396528f01bead8c1faf128a277820980025 Mon Sep 17 00:00:00 2001 From: Carl Hetherington Date: Wed, 3 Feb 2021 22:02:18 +0100 Subject: [PATCH] Use icu to remove diacritics from strings. This replaces some ad-hoc code and extends it to work with more characters (#1904). --- src/lib/util.cc | 29 ++++++++++++----------------- test/util_test.cc | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/lib/util.cc b/src/lib/util.cc index 2a14f00c7..2c0167829 100644 --- a/src/lib/util.cc +++ b/src/lib/util.cc @@ -62,6 +62,9 @@ extern "C" { #include #include #include +#include +#include +#include #include #include #include @@ -771,28 +774,20 @@ careful_string_filter (string s) Safety first and all that. */ - wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s); + /* First transliterate using libicu to try to remove accents in a "nice" way */ + auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s)); + auto status = U_ZERO_ERROR; + auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status); + transliterator->transliterate(icu_utf16); + s.clear (); + icu_utf16.toUTF8String(s); + /* Then remove anything that's not in a very limited character set */ + wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s); string out; string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+"; for (size_t i = 0; i < ws.size(); ++i) { - wchar_t c = ws[i]; - - /* Remove some accents */ - if (wstring(L"áàâ").find(c) != string::npos) { - c = 'a'; - } - if (wstring(L"éèêë").find(c) != string::npos) { - c = 'e'; - } - if (wstring(L"ö").find(c) != string::npos) { - c = 'o'; - } - if (wstring(L"ü").find(c) != string::npos) { - c = 'u'; - } - if (allowed.find(c) != string::npos) { out += c; } diff --git a/test/util_test.cc b/test/util_test.cc index 56de057a6..a46337ab7 100644 --- a/test/util_test.cc +++ b/test/util_test.cc @@ -113,6 +113,7 @@ BOOST_AUTO_TEST_CASE (careful_string_filter_test) 
BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld")); BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld")); BOOST_CHECK_EQUAL ("hello_world_a", careful_string_filter("héllo_wörld_à")); + BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUu", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜü")); } static list progress_values; -- 2.30.2