From 1a721b82d4094c00ee89574e17c58c23c0de8cdd Mon Sep 17 00:00:00 2001 From: Carl Hetherington Date: Wed, 11 May 2022 20:28:56 +0200 Subject: [PATCH] Tidy up careful_string_filter and add some extra transliterations. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit ł seems to be ignored by the ICU transliterator for some reason. --- src/lib/util.cc | 31 ++++++++++++++++++++++--------- test/util_test.cc | 2 +- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/lib/util.cc b/src/lib/util.cc index d020ba13d..7d0d2bf60 100644 --- a/src/lib/util.cc +++ b/src/lib/util.cc @@ -767,19 +767,32 @@ careful_string_filter (string s) */ /* First transliterate using libicu to try to remove accents in a "nice" way */ - auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s)); + auto transliterated = icu::UnicodeString::fromUTF8(icu::StringPiece(s)); auto status = U_ZERO_ERROR; auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status); - transliterator->transliterate(icu_utf16); - s.clear (); - icu_utf16.toUTF8String(s); + transliterator->transliterate(transliterated); + + /* Some things are missed by ICU's transliterator */ + std::map<wchar_t, wchar_t> replacements = { + { L'ł', L'l' }, + { L'Ł', L'L' } + }; + + icu::UnicodeString transliterated_more; + for (int i = 0; i < transliterated.length(); ++i) { + auto replacement = replacements.find(transliterated[i]); + if (replacement != replacements.end()) { + transliterated_more += replacement->second; + } else { + transliterated_more += transliterated[i]; + } + } /* Then remove anything that's not in a very limited character set */ - wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s); - string out; - string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+"; - for (size_t i = 0; i < ws.size(); ++i) { - wchar_t c = ws[i]; + wstring out; + wstring const allowed = 
L"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+"; + for (size_t i = 0; i < transliterated_more.length(); ++i) { + wchar_t c = transliterated_more[i]; if (allowed.find(c) != string::npos) { out += c; } diff --git a/test/util_test.cc b/test/util_test.cc index b0abe04c9..a45c144b1 100644 --- a/test/util_test.cc +++ b/test/util_test.cc @@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE (careful_string_filter_test) BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld")); BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld")); BOOST_CHECK_EQUAL ("hello_world_a", careful_string_filter("héllo_wörld_à")); - BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUu", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜü")); + BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUuLl", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜüŁł")); } -- 2.30.2