Use icu to remove diacritics from strings.
[dcpomatic.git] / src / lib / util.cc
index 2a14f00c78aada548ac010dfb21a9197926de5e1..2c01678296f489c8038c39160f83ebba6dfc7baf 100644 (file)
@@ -62,6 +62,9 @@ extern "C" {
 #include <curl/curl.h>
 #include <glib.h>
 #include <pangomm/init.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
 #include <boost/algorithm/string.hpp>
 #include <boost/range/algorithm/replace_if.hpp>
 #include <boost/thread.hpp>
@@ -771,28 +774,20 @@ careful_string_filter (string s)
           Safety first and all that.
        */
 
-       wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
+       /* First transliterate using libicu to try to remove accents in a "nice" way */
+       auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+       auto status = U_ZERO_ERROR;
+       auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
+       transliterator->transliterate(icu_utf16);
+       s.clear ();
+       icu_utf16.toUTF8String(s);
 
+       /* Then remove anything that's not in a very limited character set */
+       wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
        string out;
        string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
        for (size_t i = 0; i < ws.size(); ++i) {
-
                wchar_t c = ws[i];
-
-               /* Remove some accents */
-               if (wstring(L"áàâ").find(c) != string::npos) {
-                       c = 'a';
-               }
-               if (wstring(L"éèêë").find(c) != string::npos) {
-                       c = 'e';
-               }
-               if (wstring(L"ö").find(c) != string::npos) {
-                       c = 'o';
-               }
-               if (wstring(L"ü").find(c) != string::npos) {
-                       c = 'u';
-               }
-
                if (allowed.find(c) != string::npos) {
                        out += c;
                }