diff options
| author | Carl Hetherington <cth@carlh.net> | 2014-05-30 16:17:40 +0100 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2014-05-30 16:17:40 +0100 |
| commit | 2a85e711df07e8a707cfc50667bd0a29d8a09519 (patch) | |
| tree | b89200eeda155787d91e5cc0ab8044c93b3225cb | |
| parent | a4114c49aeec1e24e0607814a88f0f6a2d6111f5 (diff) | |
Encode to ISO6937 on the way into STL binary.
| -rw-r--r-- | src/iso6937.cc | 92 | ||||
| -rw-r--r-- | src/iso6937.h | 1 | ||||
| -rw-r--r-- | src/iso6937.py | 36 | ||||
| -rw-r--r-- | src/iso6937_tables.cc | 15 | ||||
| -rw-r--r-- | src/iso6937_tables.h | 1 | ||||
| -rw-r--r-- | src/stl_binary_writer.cc | 5 | ||||
| -rw-r--r-- | test/iso6937_test.cc | 11 |
7 files changed, 101 insertions, 60 deletions
diff --git a/src/iso6937.cc b/src/iso6937.cc index 048fd84..47ce458 100644 --- a/src/iso6937.cc +++ b/src/iso6937.cc @@ -19,18 +19,23 @@ #include <string> #include <boost/optional.hpp> +#include <boost/locale.hpp> #include "iso6937_tables.h" #include "iso6937.h" using std::string; using std::cout; +using std::wcout; using std::wstring; +using std::map; +using boost::optional; +using boost::locale::conv::utf_to_utf; using namespace sub; wstring sub::iso6937_to_utf16 (string s) { - if (iso6937::grave.empty ()) { + if (iso6937::diacriticals.empty ()) { make_iso6937_tables (); } @@ -44,48 +49,7 @@ sub::iso6937_to_utf16 (string s) if (u >= 0xc1 && u <= 0xcf) { diacritical = u; } else if (diacritical) { - switch (diacritical.get ()) { - case 0xC1: - o += iso6937::grave[u]; - break; - case 0xC2: - o += iso6937::acute[u]; - break; - case 0xC3: - o += iso6937::circumflex[u]; - break; - case 0xC4: - o += iso6937::tilde[u]; - break; - case 0xC5: - o += iso6937::macron[u]; - break; - case 0xC6: - o += iso6937::breve[u]; - break; - case 0xC7: - o += iso6937::dot[u]; - break; - case 0xC8: - o += iso6937::diaeresis[u]; - break; - case 0xCA: - o += iso6937::ring[u]; - break; - case 0xCB: - o += iso6937::cedilla[u]; - break; - case 0xCD: - o += iso6937::double_acute[u]; - break; - case 0xCE: - o += iso6937::ogonek[u]; - break; - case 0xCF: - o += iso6937::caron[u]; - break; - } - + o += (*iso6937::diacriticals[diacritical.get()])[u]; diacritical.reset (); } else { o += iso6937::main[u]; @@ -96,3 +60,45 @@ sub::iso6937_to_utf16 (string s) return o; } + +static optional<char> +find (map<char, wchar_t> const & m, wchar_t c) +{ + for (map<char, wchar_t>::const_iterator i = m.begin(); i != m.end(); ++i) { + if (i->second == c) { + return i->first; + } + } + + return optional<char> (); +} + +string +sub::utf16_to_iso6937 (wstring s) +{ + if (iso6937::diacriticals.empty ()) { + make_iso6937_tables (); + } + + /* XXX: slow */ + + string o; + for (size_t i = 0; i < s.size(); ++i) { + 
optional<char> c = find (iso6937::main, s[i]); + if (c) { + o += c.get (); + } else { + for (map<char, map<char, wchar_t> *>::const_iterator j = iso6937::diacriticals.begin(); j != iso6937::diacriticals.end(); ++j) { + c = find (*(j->second), s[i]); + if (c) { + o += j->first; + o += c.get (); + break; + } + } + } + } + + return o; +} + diff --git a/src/iso6937.h b/src/iso6937.h index 7b85edf..d994987 100644 --- a/src/iso6937.h +++ b/src/iso6937.h @@ -20,5 +20,6 @@ namespace sub { extern std::wstring iso6937_to_utf16 (std::string); +extern std::string utf16_to_iso6937 (std::wstring); }; diff --git a/src/iso6937.py b/src/iso6937.py index 4719b07..ecce4dc 100644 --- a/src/iso6937.py +++ b/src/iso6937.py @@ -94,26 +94,28 @@ namespace iso6937 { """ groups = [ - ('GRAVE', 'grave', 'AEIOUaeiou'), - ('ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'), - ('CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'), - ('TILDE', 'tilde', 'AINOUainou'), - ('MACRON', 'macron', 'AEIOUaeiou'), - ('BREVE', 'breve', 'AGUagu'), - ('DOT ABOVE', 'dot', 'CEGIZcegz'), - ('DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'), - ('RING ABOVE', 'ring', 'AUau'), - ('CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'), - ('DOUBLE ACUTE', 'double_acute', 'OUou'), - ('OGONEK', 'ogonek', 'AEIUaeui'), - ('CARON', 'caron', 'CDELNRSTZcdelnrstz') + (0xC1, 'GRAVE', 'grave', 'AEIOUaeiou'), + (0xC2, 'ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'), + (0xC3, 'CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'), + (0xC4, 'TILDE', 'tilde', 'AINOUainou'), + (0xC5, 'MACRON', 'macron', 'AEIOUaeiou'), + (0xC6, 'BREVE', 'breve', 'AGUagu'), + (0xC7, 'DOT ABOVE', 'dot', 'CEGIZcegz'), + (0xC8, 'DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'), + (0xCA, 'RING ABOVE', 'ring', 'AUau'), + (0xCB, 'CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'), + (0xCD, 'DOUBLE ACUTE', 'double_acute', 'OUou'), + (0xCE, 'OGONEK', 'ogonek', 'AEIUaeui'), + (0xCF, 'CARON', 'caron', 'CDELNRSTZcdelnrstz') ] for g in groups: - setup(g[1]) + setup(g[2]) print>>output_c,"map<char, 
wchar_t> sub::iso6937::main;" +print>>output_c,"map<char, map<char, wchar_t> *> sub::iso6937::diacriticals;" print>>output_h,"extern std::map<char, wchar_t> main;" +print>>output_h,"extern std::map<char, std::map<char, wchar_t> *> diacriticals;" print>>output_c,""" void @@ -123,7 +125,7 @@ sub::make_iso6937_tables () """ for g in groups: - fill(g[0], g[1], g[2]) + fill(g[1], g[2], g[3]) print>>output_c,"\tmain[10] = 0x000A;" @@ -220,6 +222,10 @@ print>>output_c,"\tmain[252] = 0x00FE;" print>>output_c,"\tmain[253] = 0x0167;" print>>output_c,"\tmain[254] = 0x014B;" print>>output_c,"\tmain[255] = 0x00AD;" +print>>output_c,"" + +for g in groups: + print>>output_c,"\tdiacriticals[%s] = &%s;" % (hex(g[0]), g[2]) print>>output_c,"}" print>>output_h,"" diff --git a/src/iso6937_tables.cc b/src/iso6937_tables.cc index 07174c4..b534d4c 100644 --- a/src/iso6937_tables.cc +++ b/src/iso6937_tables.cc @@ -38,6 +38,7 @@ map<char, wchar_t> sub::iso6937::double_acute; map<char, wchar_t> sub::iso6937::ogonek; map<char, wchar_t> sub::iso6937::caron; map<char, wchar_t> sub::iso6937::main; +map<char, map<char, wchar_t> *> sub::iso6937::diacriticals; void sub::make_iso6937_tables () @@ -393,4 +394,18 @@ sub::make_iso6937_tables () main[253] = 0x0167; main[254] = 0x014B; main[255] = 0x00AD; + + diacriticals[0xc1] = &grave; + diacriticals[0xc2] = &acute; + diacriticals[0xc3] = &circumflex; + diacriticals[0xc4] = &tilde; + diacriticals[0xc5] = &macron; + diacriticals[0xc6] = &breve; + diacriticals[0xc7] = &dot; + diacriticals[0xc8] = &diaeresis; + diacriticals[0xca] = &ring; + diacriticals[0xcb] = &cedilla; + diacriticals[0xcd] = &double_acute; + diacriticals[0xce] = &ogonek; + diacriticals[0xcf] = &caron; } diff --git a/src/iso6937_tables.h b/src/iso6937_tables.h index 58c8c4c..feb13c4 100644 --- a/src/iso6937_tables.h +++ b/src/iso6937_tables.h @@ -28,6 +28,7 @@ extern void make_iso6937_tables (); namespace iso6937 { extern std::map<char, wchar_t> main; +extern std::map<char, std::map<char, wchar_t> *> diacriticals; extern 
std::map<char, wchar_t> grave; extern std::map<char, wchar_t> acute; extern std::map<char, wchar_t> circumflex; diff --git a/src/stl_binary_writer.cc b/src/stl_binary_writer.cc index 334a5bb..f8d2263 100644 --- a/src/stl_binary_writer.cc +++ b/src/stl_binary_writer.cc @@ -19,7 +19,9 @@ #include "stl_binary_writer.h" #include "subtitle.h" +#include "iso6937.h" #include "compose.hpp" +#include <boost/locale.hpp> #include <list> #include <cmath> #include <fstream> @@ -34,6 +36,7 @@ using std::setw; using std::setfill; using std::max; using std::cout; +using boost::locale::conv::utf_to_utf; using namespace sub; static void @@ -247,7 +250,7 @@ sub::write_stl_binary ( italic = false; } - text += k->text; + text += utf16_to_iso6937 (utf_to_utf<wchar_t> (k->text)); } text += "\x8A"; diff --git a/test/iso6937_test.cc b/test/iso6937_test.cc index e8563b8..f537230 100644 --- a/test/iso6937_test.cc +++ b/test/iso6937_test.cc @@ -24,7 +24,7 @@ using std::cout; using boost::locale::conv::utf_to_utf; -BOOST_AUTO_TEST_CASE (iso6937_test) +BOOST_AUTO_TEST_CASE (iso6937_to_utf16_test) { BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Hello world")), "Hello world"); BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Testing \xA9testing\xB9")), "Testing ‘testing’"); @@ -32,3 +32,12 @@ BOOST_AUTO_TEST_CASE (iso6937_test) BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("M\xC8otorhead")), "Mötorhead"); BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough"); } + +BOOST_AUTO_TEST_CASE (utf16_to_iso6937_test) +{ + BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Hello world")), "Hello world"); + BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Testing ‘testing’")), "Testing \xA9testing\xB9"); + BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("All must have çedillas")), "All must have \xCB""cedillas"); + BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 
(utf_to_utf<wchar_t> ("Mötorhead")), "M\xC8otorhead"); + BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough"); +} |
