From 15fe326c723b88f779d69fe0ae737d27d69e6e6f Mon Sep 17 00:00:00 2001
From: Carl Hetherington <cth@carlh.net>
Date: Mon, 27 Feb 2023 23:20:14 +0100
Subject: [PATCH] Strip disallowed characters from subtitles before they get into the XML.

---
 src/lib/text_decoder.cc   | 64 +++++++++++++++++++++++++++++++++++++++--
 src/lib/text_decoder.h    |  6 ++++
 test/srt_subtitle_test.cc | 24 ++++++++++++++++
 test/text_decoder_test.cc | 36 +++++++++++++++++++++++
 test/wscript              |  1 +
 5 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 test/text_decoder_test.cc

diff --git a/src/lib/text_decoder.cc b/src/lib/text_decoder.cc
index 58f631e59..1ecdcd347 100644
--- a/src/lib/text_decoder.cc
+++ b/src/lib/text_decoder.cc
@@ -87,6 +87,66 @@ set_forced_appearance(shared_ptr<const TextContent> content, StringText& subtitl
 }
 
 
+string
+TextDecoder::remove_invalid_characters_for_xml(string text)
+{
+	string output;
+	output.reserve(text.length());
+
+	/* https://www.w3.org/TR/REC-xml/#charsets says that XML may only contain 0x9, 0xa, 0xd below 0x20.
+	 * We treat the text as UTF-8 and copy multi-byte sequences through unchanged, but only when they
+	 * are complete (every continuation byte is present and of the form 10xxxxxx).  That way we never
+	 * read past the end of the string, and a control code cannot get through as part of a broken sequence.
+	 */
+	auto is_continuation = [&text](size_t index) {
+		return index < text.length() && (static_cast<unsigned char>(text[index]) & 0xc0) == 0x80;
+	};
+
+	for (size_t i = 0; i < text.length(); ++i) {
+		/* Use unsigned char so that the comparisons below do not depend on whether char is signed */
+		auto const c = static_cast<unsigned char>(text[i]);
+
+		if (c < 0x80) {
+			if (c >= 0x20 || c == 0x9 || c == 0xa || c == 0xd) {
+				output += text[i];
+			}
+			continue;
+		}
+
+		size_t continuation_bytes = 0;
+		if ((c & 0xe0) == 0xc0) {
+			// start of 2-byte code point
+			continuation_bytes = 1;
+		} else if ((c & 0xf0) == 0xe0) {
+			// start of 3-byte code point
+			continuation_bytes = 2;
+		} else if ((c & 0xf8) == 0xf0) {
+			// start of 4-byte code point
+			continuation_bytes = 3;
+		} else {
+			// stray continuation byte or invalid lead byte: this is not UTF-8, so drop it
+			continue;
+		}
+
+		bool complete = true;
+		for (size_t j = 1; j <= continuation_bytes; ++j) {
+			if (!is_continuation(i + j)) {
+				complete = false;
+				break;
+			}
+		}
+
+		if (complete) {
+			output.append(text, i, continuation_bytes + 1);
+			i += continuation_bytes;
+		}
+		/* Otherwise drop only the lead byte; the bytes after it are examined by the following iterations */
+	}
+
+	return output;
+}
+
+
 void
 TextDecoder::emit_plain_start(ContentTime from, vector<dcp::SubtitleString> subtitles, dcp::SubtitleStandard valign_standard)
 {
@@ -99,7 +159,7 @@ TextDecoder::emit_plain_start(ContentTime from, vector<dcp::SubtitleString> subt
 			content()->get_font(subtitle.font().get_value_or("")),
 			valign_standard
 			);
-		string_text.set_text(string_text.text());
+		string_text.set_text(remove_invalid_characters_for_xml(string_text.text()));
 		set_forced_appearance(content(), string_text);
 		string_texts.push_back(string_text);
 	}
@@ -254,7 +314,7 @@ TextDecoder::emit_plain_start (ContentTime from, sub::Subtitle const & sub_subti
 				v_align,
 				0,
 				dcp::Direction::LTR,
-				block.text,
+				remove_invalid_characters_for_xml(block.text),
 				dcp::Effect::NONE,
 				dcp_colour(block.effect_colour.get_value_or(sub::Colour(0, 0, 0))),
 				/* Hack: we should use subtitle.fade_up and subtitle.fade_down here
diff --git a/src/lib/text_decoder.h b/src/lib/text_decoder.h
index 3b25e54cb..1a7632fd8 100644
--- a/src/lib/text_decoder.h
+++ b/src/lib/text_decoder.h
@@ -66,6 +66,12 @@ public:
 		return _content;
 	}
 
+	/** @param text UTF-8 text.
+	 *  @return text with everything that may not appear in XML removed: control codes other than
+	 *  tab, line feed and carriage return, and incomplete or malformed UTF-8 sequences.
+	 */
+	static std::string remove_invalid_characters_for_xml(std::string text);
+
 	boost::signals2::signal<void (ContentBitmapText)> BitmapStart;
 	boost::signals2::signal<void (ContentStringText)> PlainStart;
 	boost::signals2::signal<void (dcpomatic::ContentTime)> Stop;
diff --git a/test/srt_subtitle_test.cc b/test/srt_subtitle_test.cc
index 935462867..63d508b76 100644
--- a/test/srt_subtitle_test.cc
+++ b/test/srt_subtitle_test.cc
@@ -248,6 +248,30 @@ BOOST_AUTO_TEST_CASE(srt_subtitle_entity)
 }
 
 
+/** A control code in a .srt file should not make it into the XML */
+BOOST_AUTO_TEST_CASE(srt_subtitle_control_code)
+{
+	std::ofstream srt("build/test/srt_subtitle_control_code.srt");
+	srt << "1\n";
+	srt << "00:00:01,000 -> 00:00:10,000\n";
+	srt << "Hello \x0c world\n";
+	srt.close();
+
+	auto content = make_shared<StringTextFileContent>("build/test/srt_subtitle_control_code.srt");
+	auto film = new_test_film2("srt_subtitle_control_code", { content });
+	film->set_interop(false);
+	content->only_text()->set_use(true);
+	content->only_text()->set_burn(false);
+	make_and_verify_dcp (
+		film,
+		{
+			dcp::VerificationNote::Code::MISSING_SUBTITLE_LANGUAGE,
+			dcp::VerificationNote::Code::INVALID_SUBTITLE_FIRST_TEXT_TIME,
+			dcp::VerificationNote::Code::MISSING_CPL_METADATA,
+		});
+}
+
+
 #if 0
 /* XXX: this is disabled; there is some difference in font rendering
    between the test machine and others.
diff --git a/test/text_decoder_test.cc b/test/text_decoder_test.cc
new file mode 100644
index 000000000..d6cbd4ce1
--- /dev/null
+++ b/test/text_decoder_test.cc
@@ -0,0 +1,36 @@
+/*
+    Copyright (C) 2023 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "lib/text_decoder.h"
+#include <boost/test/unit_test.hpp>
+
+
+BOOST_AUTO_TEST_CASE(strip_invalid_characters_for_xml_test)
+{
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("hello world"), "hello world");
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("hello\x0cworld"), "helloworld");
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("𒀖hello\x02worl𒁝d"), "𒀖helloworl𒁝d");
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("😀œ´®†¥¨ˆø\x09π¬˚∆\x1a˙©ƒ∂ßåΩ≈ç√∫\x02˜µ≤ユーザーコードa"), "😀œ´®†¥¨ˆø\x09π¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa");
+
+	/* Truncated or malformed UTF-8 must be dropped, not read past the end of the string or used to hide a control code */
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("hello\xe2\x82"), "hello");
+	BOOST_CHECK_EQUAL(TextDecoder::remove_invalid_characters_for_xml("caf\xe9\x0c!"), "caf!");
+}
diff --git a/test/wscript b/test/wscript
index 5e06ed5e2..949f69019 100644
--- a/test/wscript
+++ b/test/wscript
@@ -149,6 +149,7 @@ def build(bld):
                   subtitle_timing_test.cc
                   subtitle_trim_test.cc
                   test.cc
+                  text_decoder_test.cc
                   threed_test.cc
                   time_calculation_test.cc
                   torture_test.cc
-- 
2.30.2