*/
/* First transliterate using libicu to try to remove accents in a "nice" way */
- auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+ auto transliterated = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
auto status = U_ZERO_ERROR;
auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
- transliterator->transliterate(icu_utf16);
- s.clear ();
- icu_utf16.toUTF8String(s);
+ transliterator->transliterate(transliterated);
+
+ /* Some things are missed by ICU's transliterator */
+ std::map<wchar_t, wchar_t> replacements = {
+ { L'ł', L'l' },
+ { L'Ł', L'L' }
+ };
+
+ icu::UnicodeString transliterated_more;
+ for (int i = 0; i < transliterated.length(); ++i) {
+ auto replacement = replacements.find(transliterated[i]);
+ if (replacement != replacements.end()) {
+ transliterated_more += replacement->second;
+ } else {
+ transliterated_more += transliterated[i];
+ }
+ }
/* Then remove anything that's not in a very limited character set */
- wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
- string out;
- string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
- for (size_t i = 0; i < ws.size(); ++i) {
- wchar_t c = ws[i];
+ wstring out;
+ wstring const allowed = L"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
+ for (size_t i = 0; i < transliterated_more.length(); ++i) {
+ wchar_t c = transliterated_more[i];
if (allowed.find(c) != string::npos) {
out += c;
}