Use icu to remove diacritics from strings.
author Carl Hetherington <cth@carlh.net>
Wed, 3 Feb 2021 21:02:18 +0000 (22:02 +0100)
committer Carl Hetherington <cth@carlh.net>
Wed, 3 Feb 2021 21:02:18 +0000 (22:02 +0100)
This replaces some ad-hoc code and extends it to work with more
characters (#1904).

src/lib/util.cc
test/util_test.cc

index 2a14f00c78aada548ac010dfb21a9197926de5e1..2c01678296f489c8038c39160f83ebba6dfc7baf 100644 (file)
@@ -62,6 +62,9 @@ extern "C" {
 #include <curl/curl.h>
 #include <glib.h>
 #include <pangomm/init.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
 #include <boost/algorithm/string.hpp>
 #include <boost/range/algorithm/replace_if.hpp>
 #include <boost/thread.hpp>
@@ -771,28 +774,20 @@ careful_string_filter (string s)
           Safety first and all that.
        */
 
-       wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
+       /* First transliterate using libicu to try to remove accents in a "nice" way */
+       auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+       auto status = U_ZERO_ERROR;
+       auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
+       transliterator->transliterate(icu_utf16);
+       s.clear ();
+       icu_utf16.toUTF8String(s);
 
+       /* Then remove anything that's not in a very limited character set */
+       wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
        string out;
        string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
        for (size_t i = 0; i < ws.size(); ++i) {
-
                wchar_t c = ws[i];
-
-               /* Remove some accents */
-               if (wstring(L"áàâ").find(c) != string::npos) {
-                       c = 'a';
-               }
-               if (wstring(L"éèêë").find(c) != string::npos) {
-                       c = 'e';
-               }
-               if (wstring(L"ö").find(c) != string::npos) {
-                       c = 'o';
-               }
-               if (wstring(L"ü").find(c) != string::npos) {
-                       c = 'u';
-               }
-
                if (allowed.find(c) != string::npos) {
                        out += c;
                }
index 56de057a68bc76997e5bb4a6a1289ecc480ed023..a46337ab7366190835f530892d893ad2b486f72c 100644 (file)
@@ -113,6 +113,7 @@ BOOST_AUTO_TEST_CASE (careful_string_filter_test)
        BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
        BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
        BOOST_CHECK_EQUAL ("hello_world_a", careful_string_filter("héllo_wörld_à"));
+       BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUu", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜü"));
 }
 
 static list<float> progress_values;