#include <curl/curl.h>
#include <glib.h>
#include <pangomm/init.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
#include <boost/algorithm/string.hpp>
#include <boost/range/algorithm/replace_if.hpp>
#include <boost/thread.hpp>
Safety first and all that.
*/
- wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
+ /* First transliterate using libicu to try to remove accents in a "nice" way */
+ auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+ auto status = U_ZERO_ERROR;
+ auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
+ transliterator->transliterate(icu_utf16);
+ s.clear ();
+ icu_utf16.toUTF8String(s);
+ /* Then remove anything that's not in a very limited character set */
+ wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
string out;
string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
for (size_t i = 0; i < ws.size(); ++i) {
-
wchar_t c = ws[i];
-
- /* Remove some accents */
- if (wstring(L"áàâ").find(c) != string::npos) {
- c = 'a';
- }
- if (wstring(L"éèêë").find(c) != string::npos) {
- c = 'e';
- }
- if (wstring(L"ö").find(c) != string::npos) {
- c = 'o';
- }
- if (wstring(L"ü").find(c) != string::npos) {
- c = 'u';
- }
-
if (allowed.find(c) != string::npos) {
out += c;
}
BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
BOOST_CHECK_EQUAL ("hello_world", careful_string_filter("héllo_wörld"));
BOOST_CHECK_EQUAL ("hello_world_a", careful_string_filter("héllo_wörld_à"));
+ BOOST_CHECK_EQUAL ("hello_world_CcGgIOoSsUu", careful_string_filter("hello_world_ÇçĞğİÖöŞşÜü"));
}
static list<float> progress_values;