summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Hetherington <cth@carlh.net>2019-01-23 21:44:20 +0000
committerCarl Hetherington <cth@carlh.net>2019-01-23 21:44:20 +0000
commit014bb19876f5b26b9802fa42b573c333ebc09139 (patch)
tree70e0d2f118fe64cb7f1e79037b251b69f0dd5f40
parent1e1836a2010a9cf421736b25aa0a9f30a268a68b (diff)
Strip Unicode U+202B (right-to-left-embedding) code; it looks like DoM does RTL (at least) partially correctly without this.
-rw-r--r--src/subrip_reader.cc6
-rw-r--r--test/subrip_reader_test.cc24
2 files changed, 30 insertions, 0 deletions
diff --git a/src/subrip_reader.cc b/src/subrip_reader.cc
index 02ee20a..b5d0446 100644
--- a/src/subrip_reader.cc
+++ b/src/subrip_reader.cc
@@ -42,6 +42,7 @@ using boost::lexical_cast;
using boost::to_upper;
using boost::optional;
using boost::function;
+using boost::algorithm::replace_all;
using namespace sub;
/** @param s Subtitle string encoded in UTF-8 */
@@ -233,6 +234,11 @@ SubripReader::convert_line (string t, RawSubtitle& p)
}
}
+ /* Strip Unicode U+202B (right-to-left embedding) as sometimes it is rendered
+ as a missing character. This may be a hack.
+ */
+ replace_all (p.text, "\xe2\x80\xab", "");
+
maybe_content (p);
}
diff --git a/test/subrip_reader_test.cc b/test/subrip_reader_test.cc
index f323f6a..c2702c4 100644
--- a/test/subrip_reader_test.cc
+++ b/test/subrip_reader_test.cc
@@ -501,3 +501,27 @@ BOOST_AUTO_TEST_CASE (subrip_reader_test3)
BLOCK ("Both lines are bold AND italic", "Arial", 30, true, true, false);
SUB_END ();
}
+
+/** Test reading of a .srt file with RTL text: U+202B must be stripped from the first block */
+BOOST_AUTO_TEST_CASE (subrip_reader_test4)
+{
+	boost::filesystem::path p = private_test / "rtl.srt";
+	FILE* f = fopen (p.string().c_str(), "r");
+	/* Stop here if the file could not be opened rather than handing a null FILE* on */
+	BOOST_REQUIRE (f != 0);
+	sub::SubripReader reader (f);
+	fclose (f);
+	list<sub::Subtitle> subs = sub::collect<std::list<sub::Subtitle> >(reader.subtitles());
+
+	/* front() on an empty container is undefined, so require content at every level */
+	BOOST_REQUIRE (!subs.empty());
+	BOOST_REQUIRE (!subs.front().lines.empty());
+	BOOST_REQUIRE (!subs.front().lines.front().blocks.empty());
+
+	std::string const t = subs.front().lines.front().blocks.front().text;
+	std::cout << t << "\n";
+
+	/* U+202B (right-to-left embedding) is e2 80 ab in UTF-8; find() is safe on short strings */
+	BOOST_CHECK (t.find ("\xe2\x80\xab") == std::string::npos);
+	BOOST_CHECK (t == "- \"(دريه فابينار)\"");
+}