Tidy up careful_string_filter and add some extra transliterations.
author Carl Hetherington <cth@carlh.net>
Wed, 11 May 2022 18:28:56 +0000 (20:28 +0200)
committer Carl Hetherington <cth@carlh.net>
Wed, 11 May 2022 18:38:01 +0000 (20:38 +0200)
ł seems to be ignored by the ICU transliterator for some reason.

src/lib/util.cc
test/util_test.cc

index d020ba13d45250c00b009c251aaa95144a6cb607..7d0d2bf606c698d3a2a4881f2336c4e156456899 100644 (file)
@@ -767,19 +767,32 @@ careful_string_filter (string s)
        */
 
        /* First transliterate using libicu to try to remove accents in a "nice" way */
-       auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+       auto transliterated = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
        auto status = U_ZERO_ERROR;
        auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
-       transliterator->transliterate(icu_utf16);
-       s.clear ();
-       icu_utf16.toUTF8String(s);
+       transliterator->transliterate(transliterated);
+
+       /* Some things are missed by ICU's transliterator */
+       std::map<wchar_t, wchar_t> replacements = {
+               { L'ł',         L'l' },
+               { L'Ł',         L'L' }
+       };
+
+       icu::UnicodeString transliterated_more;
+       for (int i = 0; i < transliterated.length(); ++i) {
+               auto replacement = replacements.find(transliterated[i]);
+               if (replacement != replacements.end()) {
+                       transliterated_more += replacement->second;
+               } else {
+                       transliterated_more += transliterated[i];
+               }
+       }
 
        /* Then remove anything that's not in a very limited character set */
-       wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
-       string out;
-       string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
-       for (size_t i = 0; i < ws.size(); ++i) {
-               wchar_t c = ws[i];
+       wstring out;
+       wstring const allowed = L"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
+       for (size_t i = 0; i < transliterated_more.length(); ++i) {
+               wchar_t c = transliterated_more[i];
                if (allowed.find(c) != string::npos) {
                        out += c;
                }
index b0abe04c9f47495261ede1985640aca0aedeb44c..a45c144b1c5056fe48e466b36ba1d04b6f89aff4 100644 (file)
@@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE (careful_string_filter_test)
        BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
        BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
        BOOST_CHECK_EQUAL ("hello_world_a", careful_string_filter("héllo_wörld_à"));
-       BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUu", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜü"));
+       BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUuLl", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜüŁł"));
 }