summary | refs | log | tree | commit | diff
path: root/src/lib
diff options
context:
space:
mode:
author Carl Hetherington <cth@carlh.net> 2021-02-03 22:02:18 +0100
committer Carl Hetherington <cth@carlh.net> 2021-02-03 22:02:18 +0100
commit b81f1396528f01bead8c1faf128a277820980025 (patch)
tree c61a2ef4936ecc3b2c83430a3a334884488a79fe /src/lib
parent 369821c41e62d4cce506cd4206f9db0d91b4f643 (diff)
Use icu to remove diacritics from strings.
This replaces some ad-hoc code and extends it to work with more characters (#1904).
Diffstat (limited to 'src/lib')
-rw-r--r-- src/lib/util.cc 29
1 file changed, 12 insertions, 17 deletions
diff --git a/src/lib/util.cc b/src/lib/util.cc
index 2a14f00c7..2c0167829 100644
--- a/src/lib/util.cc
+++ b/src/lib/util.cc
@@ -62,6 +62,9 @@ extern "C" {
#include <curl/curl.h>
#include <glib.h>
#include <pangomm/init.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
#include <boost/algorithm/string.hpp>
#include <boost/range/algorithm/replace_if.hpp>
#include <boost/thread.hpp>
@@ -771,28 +774,20 @@ careful_string_filter (string s)
Safety first and all that.
*/
- wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
+ /* First transliterate using libicu to try to remove accents in a "nice" way */
+ auto icu_utf16 = icu::UnicodeString::fromUTF8(icu::StringPiece(s));
+ auto status = U_ZERO_ERROR;
+ auto transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status);
+ transliterator->transliterate(icu_utf16);
+ s.clear ();
+ icu_utf16.toUTF8String(s);
+ /* Then remove anything that's not in a very limited character set */
+ wstring ws = boost::locale::conv::utf_to_utf<wchar_t>(s);
string out;
string const allowed = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_%.+";
for (size_t i = 0; i < ws.size(); ++i) {
-
wchar_t c = ws[i];
-
- /* Remove some accents */
- if (wstring(L"áàâ").find(c) != string::npos) {
- c = 'a';
- }
- if (wstring(L"éèêë").find(c) != string::npos) {
- c = 'e';
- }
- if (wstring(L"ö").find(c) != string::npos) {
- c = 'o';
- }
- if (wstring(L"ü").find(c) != string::npos) {
- c = 'u';
- }
-
if (allowed.find(c) != string::npos) {
out += c;
}