diff options
| author | Carl Hetherington <cth@carlh.net> | 2019-01-23 21:44:20 +0000 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2019-01-23 21:44:20 +0000 |
| commit | 014bb19876f5b26b9802fa42b573c333ebc09139 (patch) | |
| tree | 70e0d2f118fe64cb7f1e79037b251b69f0dd5f40 | |
| parent | 1e1836a2010a9cf421736b25aa0a9f30a268a68b (diff) | |
Strip the Unicode U+202B (right-to-left embedding) control character; it looks like DoM handles RTL text correctly, at least partially, without it.
| -rw-r--r-- | src/subrip_reader.cc | 6 | ||||
| -rw-r--r-- | test/subrip_reader_test.cc | 24 |
2 files changed, 30 insertions, 0 deletions
diff --git a/src/subrip_reader.cc b/src/subrip_reader.cc index 02ee20a..b5d0446 100644 --- a/src/subrip_reader.cc +++ b/src/subrip_reader.cc @@ -42,6 +42,7 @@ using boost::lexical_cast; using boost::to_upper; using boost::optional; using boost::function; +using boost::algorithm::replace_all; using namespace sub; /** @param s Subtitle string encoded in UTF-8 */ @@ -233,6 +234,11 @@ SubripReader::convert_line (string t, RawSubtitle& p) } } + /* Strip Unicode U+202B (right-to-left embedding) as sometimes it is rendered + as a missing character. This may be a hack. + */ + replace_all (p.text, "\xe2\x80\xab", ""); + maybe_content (p); } diff --git a/test/subrip_reader_test.cc b/test/subrip_reader_test.cc index f323f6a..c2702c4 100644 --- a/test/subrip_reader_test.cc +++ b/test/subrip_reader_test.cc @@ -501,3 +501,27 @@ BOOST_AUTO_TEST_CASE (subrip_reader_test3) BLOCK ("Both lines are bold AND italic", "Arial", 30, true, true, false); SUB_END (); } + +/** Test reading of a .srt file with RTL text */ +BOOST_AUTO_TEST_CASE (subrip_reader_test4) +{ + boost::filesystem::path p = private_test / "rtl.srt"; + FILE* f = fopen (p.string().c_str(), "r"); + sub::SubripReader reader (f); + fclose (f); + list<sub::Subtitle> subs = sub::collect<std::list<sub::Subtitle> >(reader.subtitles()); + + list<sub::Subtitle>::iterator i = subs.begin (); + std::cout << i->lines.front().blocks.front().text << "\n"; + + std::string const t = i->lines.front().blocks.front().text; + for (size_t i = 0; i < t.length() - 2; ++i) { + /* Check that unicode U+202B (right-to-left embedding) has been stripped */ + unsigned char const a = t[i]; + unsigned char const b = t[i+1]; + unsigned char const c = t[i+2]; + BOOST_CHECK ((a != 0xe2 || b != 0x80 || c != 0xab)); + } + + BOOST_CHECK (t == "- \"(دريه فابينار)\""); +} |
