summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Carl Hetherington <cth@carlh.net> 2014-05-30 16:17:40 +0100
committer Carl Hetherington <cth@carlh.net> 2014-05-30 16:17:40 +0100
commit 2a85e711df07e8a707cfc50667bd0a29d8a09519 (patch)
tree b89200eeda155787d91e5cc0ab8044c93b3225cb
parent a4114c49aeec1e24e0607814a88f0f6a2d6111f5 (diff)
Encode to ISO6937 on the way into STL binary.
-rw-r--r-- src/iso6937.cc 92
-rw-r--r-- src/iso6937.h 1
-rw-r--r-- src/iso6937.py 36
-rw-r--r-- src/iso6937_tables.cc 15
-rw-r--r-- src/iso6937_tables.h 1
-rw-r--r-- src/stl_binary_writer.cc 5
-rw-r--r-- test/iso6937_test.cc 11
7 files changed, 101 insertions, 60 deletions
diff --git a/src/iso6937.cc b/src/iso6937.cc
index 048fd84..47ce458 100644
--- a/src/iso6937.cc
+++ b/src/iso6937.cc
@@ -19,18 +19,23 @@
#include <string>
#include <boost/optional.hpp>
+#include <boost/locale.hpp>
#include "iso6937_tables.h"
#include "iso6937.h"
using std::string;
using std::cout;
+using std::wcout;
using std::wstring;
+using std::map;
+using boost::optional;
+using boost::locale::conv::utf_to_utf;
using namespace sub;
wstring
sub::iso6937_to_utf16 (string s)
{
- if (iso6937::grave.empty ()) {
+ if (iso6937::diacriticals.empty ()) {
make_iso6937_tables ();
}
@@ -44,48 +49,7 @@ sub::iso6937_to_utf16 (string s)
if (u >= 0xc1 && u <= 0xcf) {
diacritical = u;
} else if (diacritical) {
- switch (diacritical.get ()) {
- case 0xC1:
- o += iso6937::grave[u];
- break;
- case 0xC2:
- o += iso6937::acute[u];
- break;
- case 0xC3:
- o += iso6937::circumflex[u];
- break;
- case 0xC4:
- o += iso6937::tilde[u];
- break;
- case 0xC5:
- o += iso6937::macron[u];
- break;
- case 0xC6:
- o += iso6937::breve[u];
- break;
- case 0xC7:
- o += iso6937::dot[u];
- break;
- case 0xC8:
- o += iso6937::diaeresis[u];
- break;
- case 0xCA:
- o += iso6937::ring[u];
- break;
- case 0xCB:
- o += iso6937::cedilla[u];
- break;
- case 0xCD:
- o += iso6937::double_acute[u];
- break;
- case 0xCE:
- o += iso6937::ogonek[u];
- break;
- case 0xCF:
- o += iso6937::caron[u];
- break;
- }
-
+			/* Bytes 0xC9 and 0xCC fall inside the diacritical range tested above
+			   but have no table registered.  operator[] would insert a null
+			   pointer for them and then dereference it, so look the table up
+			   without inserting and emit nothing when it is missing.
+			*/
+			map<char, map<char, wchar_t> *>::const_iterator table = iso6937::diacriticals.find (diacritical.get ());
+			if (table != iso6937::diacriticals.end ()) {
+				o += (*table->second)[u];
+			}
diacritical.reset ();
} else {
o += iso6937::main[u];
@@ -96,3 +60,45 @@ sub::iso6937_to_utf16 (string s)
return o;
}
+
+/** Reverse lookup in a single ISO 6937 table.
+ *  @param m Table mapping an ISO 6937 byte to a wide character.
+ *  @param c Wide character to search for.
+ *  @return Key of the first entry, in the map's iteration order, whose value
+ *  is c; an empty optional if no entry has that value.
+ */
+static optional<char>
+find (map<char, wchar_t> const & m, wchar_t c)
+{
+	typedef map<char, wchar_t>::const_iterator Entry;
+
+	/* Walk forward until we hit a matching value or run off the end */
+	Entry e = m.begin ();
+	while (e != m.end() && e->second != c) {
+		++e;
+	}
+
+	optional<char> key;
+	if (e != m.end()) {
+		key = e->first;
+	}
+
+	return key;
+}
+
+/** Encode a wide-character string as ISO 6937.
+ *
+ *  Each character is first searched for in the main table, which gives a
+ *  single output byte.  Failing that, the diacritical tables are searched in
+ *  the map's iteration order, and the first hit is written as that table's key
+ *  byte followed by the byte found in the table.  A character found in no
+ *  table contributes nothing to the output.
+ *
+ *  @param s String to encode.
+ *  @return ISO 6937 bytes for s.
+ */
+string
+sub::utf16_to_iso6937 (wstring s)
+{
+	typedef map<char, map<char, wchar_t> *> DiacriticalTables;
+
+	/* Build the lookup tables if that has not been done yet */
+	if (iso6937::diacriticals.empty ()) {
+		make_iso6937_tables ();
+	}
+
+	/* XXX: slow; each character is found by scanning table values linearly */
+
+	string encoded;
+	for (wstring::const_iterator ch = s.begin(); ch != s.end(); ++ch) {
+		optional<char> const plain = find (iso6937::main, *ch);
+		if (plain) {
+			encoded += plain.get ();
+			continue;
+		}
+
+		for (DiacriticalTables::const_iterator t = iso6937::diacriticals.begin(); t != iso6937::diacriticals.end(); ++t) {
+			optional<char> const base = find (*(t->second), *ch);
+			if (!base) {
+				continue;
+			}
+
+			/* Diacritical byte first, then the character it modifies */
+			encoded += t->first;
+			encoded += base.get ();
+			break;
+		}
+	}
+
+	return encoded;
+}
+
diff --git a/src/iso6937.h b/src/iso6937.h
index 7b85edf..d994987 100644
--- a/src/iso6937.h
+++ b/src/iso6937.h
@@ -20,5 +20,6 @@
namespace sub {
+/** Decode a string of ISO 6937 bytes into a wide-character string */
extern std::wstring iso6937_to_utf16 (std::string);
+/** Encode a wide-character string as ISO 6937 bytes; characters with no
+ *  entry in the ISO 6937 tables are left out of the result.
+ */
+extern std::string utf16_to_iso6937 (std::wstring);
};
diff --git a/src/iso6937.py b/src/iso6937.py
index 4719b07..ecce4dc 100644
--- a/src/iso6937.py
+++ b/src/iso6937.py
@@ -94,26 +94,28 @@ namespace iso6937 {
"""
groups = [
- ('GRAVE', 'grave', 'AEIOUaeiou'),
- ('ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'),
- ('CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'),
- ('TILDE', 'tilde', 'AINOUainou'),
- ('MACRON', 'macron', 'AEIOUaeiou'),
- ('BREVE', 'breve', 'AGUagu'),
- ('DOT ABOVE', 'dot', 'CEGIZcegz'),
- ('DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'),
- ('RING ABOVE', 'ring', 'AUau'),
- ('CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'),
- ('DOUBLE ACUTE', 'double_acute', 'OUou'),
- ('OGONEK', 'ogonek', 'AEIUaeui'),
- ('CARON', 'caron', 'CDELNRSTZcdelnrstz')
+ (0xC1, 'GRAVE', 'grave', 'AEIOUaeiou'),
+ (0xC2, 'ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'),
+ (0xC3, 'CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'),
+ (0xC4, 'TILDE', 'tilde', 'AINOUainou'),
+ (0xC5, 'MACRON', 'macron', 'AEIOUaeiou'),
+ (0xC6, 'BREVE', 'breve', 'AGUagu'),
+ (0xC7, 'DOT ABOVE', 'dot', 'CEGIZcegz'),
+ (0xC8, 'DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'),
+ (0xCA, 'RING ABOVE', 'ring', 'AUau'),
+ (0xCB, 'CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'),
+ (0xCD, 'DOUBLE ACUTE', 'double_acute', 'OUou'),
+ (0xCE, 'OGONEK', 'ogonek', 'AEIUaeui'),
+ (0xCF, 'CARON', 'caron', 'CDELNRSTZcdelnrstz')
]
for g in groups:
- setup(g[1])
+ setup(g[2])
print>>output_c,"map<char, wchar_t> sub::iso6937::main;"
+print>>output_c,"map<char, map<char, wchar_t> *> sub::iso6937::diacriticals;"
print>>output_h,"extern std::map<char, wchar_t> main;"
+print>>output_h,"extern std::map<char, std::map<char, wchar_t> *> diacriticals;"
print>>output_c,"""
void
@@ -123,7 +125,7 @@ sub::make_iso6937_tables ()
"""
for g in groups:
- fill(g[0], g[1], g[2])
+ fill(g[1], g[2], g[3])
print>>output_c,"\tmain[10] = 0x000A;"
@@ -220,6 +222,10 @@ print>>output_c,"\tmain[252] = 0x00FE;"
print>>output_c,"\tmain[253] = 0x0167;"
print>>output_c,"\tmain[254] = 0x014B;"
print>>output_c,"\tmain[255] = 0x00AD;"
+print>>output_c,""
+
+# Emit one assignment per group into the generated make_iso6937_tables(),
+# registering the group's table (g[2]) under its ISO 6937 byte (g[0]).
+for g in groups:
+ print>>output_c,"\tdiacriticals[%s] = &%s;" % (hex(g[0]), g[2])
print>>output_c,"}"
print>>output_h,""
diff --git a/src/iso6937_tables.cc b/src/iso6937_tables.cc
index 07174c4..b534d4c 100644
--- a/src/iso6937_tables.cc
+++ b/src/iso6937_tables.cc
@@ -38,6 +38,7 @@ map<char, wchar_t> sub::iso6937::double_acute;
map<char, wchar_t> sub::iso6937::ogonek;
map<char, wchar_t> sub::iso6937::caron;
map<char, wchar_t> sub::iso6937::main;
+map<char, map<char, wchar_t> *> sub::iso6937::diacriticals;
void
sub::make_iso6937_tables ()
@@ -393,4 +394,18 @@ sub::make_iso6937_tables ()
main[253] = 0x0167;
main[254] = 0x014B;
main[255] = 0x00AD;
+
+ diacriticals[0xc1] = &grave;
+ diacriticals[0xc2] = &acute;
+ diacriticals[0xc3] = &circumflex;
+ diacriticals[0xc4] = &tilde;
+ diacriticals[0xc5] = &macron;
+ diacriticals[0xc6] = &breve;
+ diacriticals[0xc7] = &dot;
+ diacriticals[0xc8] = &diaeresis;
+ diacriticals[0xca] = &ring;
+ diacriticals[0xcb] = &cedilla;
+ diacriticals[0xcd] = &double_acute;
+ diacriticals[0xce] = &ogonek;
+ diacriticals[0xcf] = &caron;
}
diff --git a/src/iso6937_tables.h b/src/iso6937_tables.h
index 58c8c4c..feb13c4 100644
--- a/src/iso6937_tables.h
+++ b/src/iso6937_tables.h
@@ -28,6 +28,7 @@ extern void make_iso6937_tables ();
namespace iso6937 {
extern std::map<char, wchar_t> main;
+extern std::map<char, std::map<char, wchar_t> *> diacriticals;
extern std::map<char, wchar_t> grave;
extern std::map<char, wchar_t> acute;
extern std::map<char, wchar_t> circumflex;
diff --git a/src/stl_binary_writer.cc b/src/stl_binary_writer.cc
index 334a5bb..f8d2263 100644
--- a/src/stl_binary_writer.cc
+++ b/src/stl_binary_writer.cc
@@ -19,7 +19,9 @@
#include "stl_binary_writer.h"
#include "subtitle.h"
+#include "iso6937.h"
#include "compose.hpp"
+#include <boost/locale.hpp>
#include <list>
#include <cmath>
#include <fstream>
@@ -34,6 +36,7 @@ using std::setw;
using std::setfill;
using std::max;
using std::cout;
+using boost::locale::conv::utf_to_utf;
using namespace sub;
static void
@@ -247,7 +250,7 @@ sub::write_stl_binary (
italic = false;
}
- text += k->text;
+ text += utf16_to_iso6937 (utf_to_utf<wchar_t> (k->text));
}
text += "\x8A";
diff --git a/test/iso6937_test.cc b/test/iso6937_test.cc
index e8563b8..f537230 100644
--- a/test/iso6937_test.cc
+++ b/test/iso6937_test.cc
@@ -24,7 +24,7 @@
using std::cout;
using boost::locale::conv::utf_to_utf;
-BOOST_AUTO_TEST_CASE (iso6937_test)
+BOOST_AUTO_TEST_CASE (iso6937_to_utf16_test)
{
BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Hello world")), "Hello world");
BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Testing \xA9testing\xB9")), "Testing ‘testing’");
@@ -32,3 +32,12 @@ BOOST_AUTO_TEST_CASE (iso6937_test)
BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("M\xC8otorhead")), "Mötorhead");
BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough");
}
+
+/** Check that utf16_to_iso6937 turns UTF-8 literals (widened with utf_to_utf)
+ *  into the expected ISO 6937 byte strings.
+ */
+BOOST_AUTO_TEST_CASE (utf16_to_iso6937_test)
+{
+ /* Plain ASCII is expected to come out unchanged */
+ BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Hello world")), "Hello world");
+ /* Curly single quotes are expected as the single bytes 0xA9 and 0xB9 */
+ BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Testing ‘testing’")), "Testing \xA9testing\xB9");
+ /* c-cedilla is expected as 0xCB followed by 'c'; the literal is split because
+    "\xCBc" would be read as one longer hex escape.
+ */
+ BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("All must have çedillas")), "All must have \xCB""cedillas");
+ /* o-diaeresis is expected as 0xC8 followed by 'o' */
+ BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Mötorhead")), "M\xC8otorhead");
+ /* Newlines are expected to survive encoding */
+ BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough");
+}