Encode to ISO6937 on the way into STL binary.

author: Carl Hetherington <cth@carlh.net> 2014-05-30 16:17:40 +0100
committer: Carl Hetherington <cth@carlh.net> 2014-05-30 16:17:40 +0100
commit: 2a85e711df07e8a707cfc50667bd0a29d8a09519 (patch)
tree: b89200eeda155787d91e5cc0ab8044c93b3225cb
parent: a4114c49aeec1e24e0607814a88f0f6a2d6111f5 (diff)
7 files changed, 101 insertions, 60 deletions
diff --git a/src/iso6937.cc b/src/iso6937.cc
index 048fd84..47ce458 100644
--- a/src/iso6937.cc
+++ b/src/iso6937.cc
@@ -19,18 +19,23 @@
 
 #include <string>
 #include <boost/optional.hpp>
+#include <boost/locale.hpp>
 #include "iso6937_tables.h"
 #include "iso6937.h"
 
 using std::string;
 using std::cout;
+using std::wcout;
 using std::wstring;
+using std::map;
+using boost::optional;
+using boost::locale::conv::utf_to_utf;
 using namespace sub;
 
 wstring
 sub::iso6937_to_utf16 (string s)
 {
-	if (iso6937::grave.empty ()) {
+	if (iso6937::diacriticals.empty ()) {
 		make_iso6937_tables ();
 	}
 	
@@ -44,48 +49,7 @@ sub::iso6937_to_utf16 (string s)
 		if (u >= 0xc1 && u <= 0xcf) {
 			diacritical = u;
 		} else if (diacritical) {
-			switch (diacritical.get ()) {
-			case 0xC1:
-				o += iso6937::grave[u];
-				break;
-			case 0xC2:
-				o += iso6937::acute[u];
-				break;
-			case 0xC3:
-				o += iso6937::circumflex[u];
-				break;
-			case 0xC4:
-				o += iso6937::tilde[u];
-				break;
-			case 0xC5:
-				o += iso6937::macron[u];
-				break;
-			case 0xC6:
-				o += iso6937::breve[u];
-				break;
-			case 0xC7:
-				o += iso6937::dot[u];
-				break;
-			case 0xC8:
-				o += iso6937::diaeresis[u];
-				break;
-			case 0xCA:
-				o += iso6937::ring[u];
-				break;
-			case 0xCB:
-				o += iso6937::cedilla[u];
-				break;
-			case 0xCD:
-				o += iso6937::double_acute[u];
-				break;
-			case 0xCE:
-				o += iso6937::ogonek[u];
-				break;
-			case 0xCF:
-				o += iso6937::caron[u];
-				break;
-			}
-
+			o += (*iso6937::diacriticals[diacritical.get()])[u];
 			diacritical.reset ();
 		} else {
 			o += iso6937::main[u];
@@ -96,3 +60,45 @@ sub::iso6937_to_utf16 (string s)
 
 	return o;
 }
+
+/** @return the key of the first entry of m (in key order) whose value is c, or nothing if there is none */
+static optional<char>
+find (map<char, wchar_t> const & m, wchar_t c)
+{
+	map<char, wchar_t>::const_iterator i = m.begin ();
+	while (i != m.end() && i->second != c) {
+		++i;
+	}
+
+	return i != m.end() ? optional<char> (i->first) : optional<char> ();
+}
+
+/** Encode s as ISO 6937; wide characters found in none of the tables are left out */
+string
+sub::utf16_to_iso6937 (wstring s)
+{
+	if (iso6937::diacriticals.empty ()) {
+		make_iso6937_tables ();
+	}
+	/* XXX: slow; each character is looked up by linear scans of the tables */
+	typedef map<char, map<char, wchar_t> *>::const_iterator Group;
+	string encoded;
+	for (wstring::const_iterator w = s.begin(); w != s.end(); ++w) {
+		optional<char> const plain = find (iso6937::main, *w);
+		if (plain) {
+			encoded += plain.get ();
+			continue;
+		}
+		for (Group g = iso6937::diacriticals.begin(); g != iso6937::diacriticals.end(); ++g) {
+			optional<char> const base = find (*(g->second), *w);
+			if (base) {
+				/* Diacritical byte first, then the letter it modifies */
+				encoded += g->first;
+				encoded += base.get ();
+				break;
+			}
+		}
+	}
+	return encoded;
+}
+
diff --git a/src/iso6937.h b/src/iso6937.h
index 7b85edf..d994987 100644
--- a/src/iso6937.h
+++ b/src/iso6937.h
@@ -20,5 +20,6 @@
 namespace sub {
 
 extern std::wstring iso6937_to_utf16 (std::string);
+extern std::string utf16_to_iso6937 (std::wstring);
 
 };
diff --git a/src/iso6937.py b/src/iso6937.py
index 4719b07..ecce4dc 100644
--- a/src/iso6937.py
+++ b/src/iso6937.py
@@ -94,26 +94,28 @@ namespace iso6937 {
 """
 
 groups = [
-    ('GRAVE', 'grave', 'AEIOUaeiou'),
-    ('ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'),
-    ('CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'),
-    ('TILDE', 'tilde', 'AINOUainou'),
-    ('MACRON', 'macron', 'AEIOUaeiou'),
-    ('BREVE', 'breve', 'AGUagu'),
-    ('DOT ABOVE', 'dot', 'CEGIZcegz'),
-    ('DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'),
-    ('RING ABOVE', 'ring', 'AUau'),
-    ('CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'),
-    ('DOUBLE ACUTE', 'double_acute', 'OUou'),
-    ('OGONEK', 'ogonek', 'AEIUaeui'),
-    ('CARON', 'caron', 'CDELNRSTZcdelnrstz')
+    (0xC1, 'GRAVE', 'grave', 'AEIOUaeiou'),
+    (0xC2, 'ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'),
+    (0xC3, 'CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'),
+    (0xC4, 'TILDE', 'tilde', 'AINOUainou'),
+    (0xC5, 'MACRON', 'macron', 'AEIOUaeiou'),
+    (0xC6, 'BREVE', 'breve', 'AGUagu'),
+    (0xC7, 'DOT ABOVE', 'dot', 'CEGIZcegz'),
+    (0xC8, 'DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'),
+    (0xCA, 'RING ABOVE', 'ring', 'AUau'),
+    (0xCB, 'CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'),
+    (0xCD, 'DOUBLE ACUTE', 'double_acute', 'OUou'),
+    (0xCE, 'OGONEK', 'ogonek', 'AEIUaeui'),
+    (0xCF, 'CARON', 'caron', 'CDELNRSTZcdelnrstz')
 ]
 
 for g in groups:
-    setup(g[1])
+    setup(g[2])
 
 print>>output_c,"map<char, wchar_t> sub::iso6937::main;"
+print>>output_c,"map<char, map<char, wchar_t> *> sub::iso6937::diacriticals;"
 print>>output_h,"extern std::map<char, wchar_t> main;"
+print>>output_h,"extern std::map<char, std::map<char, wchar_t> *> diacriticals;"
 
 print>>output_c,"""
 void
@@ -123,7 +125,7 @@ sub::make_iso6937_tables ()
 """
 
 for g in groups:
-    fill(g[0], g[1], g[2])
+    fill(g[1], g[2], g[3])
 
 print>>output_c,"\tmain[10] = 0x000A;"
 
@@ -220,6 +222,10 @@ print>>output_c,"\tmain[252] = 0x00FE;"
 print>>output_c,"\tmain[253] = 0x0167;"
 print>>output_c,"\tmain[254] = 0x014B;"
 print>>output_c,"\tmain[255] = 0x00AD;"
+print>>output_c,""
+
+for g in groups:
+    print>>output_c,"\tdiacriticals[%s] = &%s;" % (hex(g[0]), g[2])
 
 print>>output_c,"}"
 print>>output_h,""
diff --git a/src/iso6937_tables.cc b/src/iso6937_tables.cc
index 07174c4..b534d4c 100644
--- a/src/iso6937_tables.cc
+++ b/src/iso6937_tables.cc
@@ -38,6 +38,7 @@ map<char, wchar_t> sub::iso6937::double_acute;
 map<char, wchar_t> sub::iso6937::ogonek;
 map<char, wchar_t> sub::iso6937::caron;
 map<char, wchar_t> sub::iso6937::main;
+map<char, map<char, wchar_t> *> sub::iso6937::diacriticals;
 
 void
 sub::make_iso6937_tables ()
@@ -393,4 +394,18 @@ sub::make_iso6937_tables ()
 	main[253] = 0x0167;
 	main[254] = 0x014B;
 	main[255] = 0x00AD;
+
+	diacriticals[0xc1] = &grave;
+	diacriticals[0xc2] = &acute;
+	diacriticals[0xc3] = &circumflex;
+	diacriticals[0xc4] = &tilde;
+	diacriticals[0xc5] = &macron;
+	diacriticals[0xc6] = &breve;
+	diacriticals[0xc7] = &dot;
+	diacriticals[0xc8] = &diaeresis;
+	diacriticals[0xca] = &ring;
+	diacriticals[0xcb] = &cedilla;
+	diacriticals[0xcd] = &double_acute;
+	diacriticals[0xce] = &ogonek;
+	diacriticals[0xcf] = &caron;
 }
diff --git a/src/iso6937_tables.h b/src/iso6937_tables.h
index 58c8c4c..feb13c4 100644
--- a/src/iso6937_tables.h
+++ b/src/iso6937_tables.h
@@ -28,6 +28,7 @@ extern void make_iso6937_tables ();
 namespace iso6937 {
 
 extern std::map<char, wchar_t> main;
+extern std::map<char, std::map<char, wchar_t> *> diacriticals;
 extern std::map<char, wchar_t> grave;
 extern std::map<char, wchar_t> acute;
 extern std::map<char, wchar_t> circumflex;
diff --git a/src/stl_binary_writer.cc b/src/stl_binary_writer.cc
index 334a5bb..f8d2263 100644
--- a/src/stl_binary_writer.cc
+++ b/src/stl_binary_writer.cc
@@ -19,7 +19,9 @@
 
 #include "stl_binary_writer.h"
 #include "subtitle.h"
+#include "iso6937.h"
 #include "compose.hpp"
+#include <boost/locale.hpp>
 #include <list>
 #include <cmath>
 #include <fstream>
@@ -34,6 +36,7 @@ using std::setw;
 using std::setfill;
 using std::max;
 using std::cout;
+using boost::locale::conv::utf_to_utf;
 using namespace sub;
 
 static void
@@ -247,7 +250,7 @@ sub::write_stl_binary (
 					italic = false;
 				}
 
-				text += k->text;
+				text += utf16_to_iso6937 (utf_to_utf<wchar_t> (k->text));
 			}
 
 			text += "\x8A";
diff --git a/test/iso6937_test.cc b/test/iso6937_test.cc
index e8563b8..f537230 100644
--- a/test/iso6937_test.cc
+++ b/test/iso6937_test.cc
@@ -24,7 +24,7 @@
 using std::cout;
 using boost::locale::conv::utf_to_utf;
 
-BOOST_AUTO_TEST_CASE (iso6937_test)
+BOOST_AUTO_TEST_CASE (iso6937_to_utf16_test)
 {
 	BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Hello world")), "Hello world");
 	BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Testing \xA9testing\xB9")), "Testing ‘testing’");
@@ -32,3 +32,12 @@ BOOST_AUTO_TEST_CASE (iso6937_test)
 	BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("M\xC8otorhead")), "Mötorhead");
 	BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough");
 }
+
+BOOST_AUTO_TEST_CASE (utf16_to_iso6937_test)
+{
+	BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Hello world")), "Hello world");	/* plain ASCII is expected back unchanged */
+	BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Testing ‘testing’")), "Testing \xA9testing\xB9");	/* curly quotes are expected as the single bytes 0xA9 and 0xB9 */
+	BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("All must have çedillas")), "All must have \xCB""cedillas");	/* ç is expected as 0xCB followed by c */
+	BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Mötorhead")), "M\xC8otorhead");	/* ö is expected as 0xC8 followed by o */
+	BOOST_CHECK_EQUAL (sub::utf16_to_iso6937 (utf_to_utf<wchar_t> ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough");	/* newlines are expected to pass through unchanged */
+}
author	Carl Hetherington <cth@carlh.net>	2014-05-30 16:17:40 +0100
committer	Carl Hetherington <cth@carlh.net>	2014-05-30 16:17:40 +0100
commit	2a85e711df07e8a707cfc50667bd0a29d8a09519 (patch)
tree	b89200eeda155787d91e5cc0ab8044c93b3225cb
parent	a4114c49aeec1e24e0607814a88f0f6a2d6111f5 (diff)