Strip disallowed characters from subtitles before they get into the XML.
author: Carl Hetherington <cth@carlh.net>
Mon, 27 Feb 2023 22:20:14 +0000 (23:20 +0100)
committer: Carl Hetherington <cth@carlh.net>
Mon, 27 Feb 2023 22:20:14 +0000 (23:20 +0100)
src/lib/text_decoder.cc
src/lib/text_decoder.h
test/srt_subtitle_test.cc
test/text_decoder_test.cc [new file with mode: 0644]
test/wscript

index 58f631e59124855aae836432d4eac5f7ff67e03c..1ecdcd34783ae8050fbb6c282d27a511aabcef44 100644 (file)
@@ -87,6 +87,45 @@ set_forced_appearance(shared_ptr<const TextContent> content, StringText& subtitl
 }
 
 
+string
+TextDecoder::remove_invalid_characters_for_xml(string text)
+{
+       string output;
+
+       /* https://www.w3.org/TR/REC-xml/#charsets says that XML may only contain 0x9, 0xa, 0xd below 0x32.
+        * Not sure if we should be doing direct UTF-8 manipulation here.
+        */
+       for (size_t i = 0; i < text.length(); ++i) {
+               auto const c = text[i];
+               if ((c & 0xe0) == 0xc0) {
+                       // start of 2-byte code point
+                       output += c;
+                       output += text[i + 1];
+                       ++i;
+               } else if ((c & 0xf0) == 0xe0) {
+                       // start of 3-byte code point
+                       output += c;
+                       output += text[i + 1];
+                       output += text[i + 2];
+                       i += 2;
+               } else if ((c & 0xf8) == 0xf0) {
+                       // start of 4-byte code point
+                       output += c;
+                       output += text[i + 1];
+                       output += text[i + 2];
+                       output += text[i + 3];
+                       i += 3;
+               } else {
+                       if (c >= 0x20 || c == 0x9 || c == 0xa || c == 0xd) {
+                               output += c;
+                       }
+               }
+       }
+
+       return output;
+}
+
+
 void
 TextDecoder::emit_plain_start(ContentTime from, vector<dcp::SubtitleString> subtitles, dcp::SubtitleStandard valign_standard)
 {
@@ -99,7 +138,7 @@ TextDecoder::emit_plain_start(ContentTime from, vector<dcp::SubtitleString> subt
                        content()->get_font(subtitle.font().get_value_or("")),
                        valign_standard
                        );
-               string_text.set_text(string_text.text());
+               string_text.set_text(remove_invalid_characters_for_xml(string_text.text()));
                set_forced_appearance(content(), string_text);
                string_texts.push_back(string_text);
        }
@@ -254,7 +293,7 @@ TextDecoder::emit_plain_start (ContentTime from, sub::Subtitle const & sub_subti
                                v_align,
                                0,
                                dcp::Direction::LTR,
-                               block.text,
+                               remove_invalid_characters_for_xml(block.text),
                                dcp::Effect::NONE,
                                dcp_colour(block.effect_colour.get_value_or(sub::Colour(0, 0, 0))),
                                /* Hack: we should use subtitle.fade_up and subtitle.fade_down here
index 3b25e54cbe1a6c701d7f6407e77042d51c71eee4..1a7632fd82b8448366dddc0676531c8b04fd9a27 100644 (file)
@@ -66,6 +66,8 @@ public:
                return _content;
        }
 
+       static std::string remove_invalid_characters_for_xml(std::string text); ///< @return copy of text with the control codes below 0x20 (other than 0x9, 0xa and 0xd) removed
+
        boost::signals2::signal<void (ContentBitmapText)> BitmapStart;
        boost::signals2::signal<void (ContentStringText)> PlainStart;
        boost::signals2::signal<void (dcpomatic::ContentTime)> Stop;
index 9354628678742c259606a5594f0f9701aaf0cd69..63d508b7654ba4093c6492261dd1bada2b98ebae 100644 (file)
@@ -248,6 +248,30 @@ BOOST_AUTO_TEST_CASE(srt_subtitle_entity)
 }
 
 
+/** A control code in a .srt file should not make it into the XML */
+BOOST_AUTO_TEST_CASE(srt_subtitle_control_code)
+{
+       std::ofstream srt("build/test/srt_subtitle_control_code.srt");
+       srt << "1\n";
+       srt << "00:00:01,000 -> 00:00:10,000\n";
+       srt << "Hello \x0c world\n";
+       srt.close();
+
+       auto content = make_shared<StringTextFileContent>("build/test/srt_subtitle_control_code.srt");
+       auto film = new_test_film2("srt_subtitle_control_code", { content });
+       film->set_interop(false);
+       content->only_text()->set_use(true);
+       content->only_text()->set_burn(false);
+       make_and_verify_dcp (
+               film,
+               {
+                       dcp::VerificationNote::Code::MISSING_SUBTITLE_LANGUAGE,
+                       dcp::VerificationNote::Code::INVALID_SUBTITLE_FIRST_TEXT_TIME,
+                       dcp::VerificationNote::Code::MISSING_CPL_METADATA,
+               });
+}
+
+
 #if 0
 /* XXX: this is disabled; there is some difference in font rendering
    between the test machine and others.
diff --git a/test/text_decoder_test.cc b/test/text_decoder_test.cc
new file mode 100644 (file)
index 0000000..d6cbd4c
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+    Copyright (C) 2023 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "lib/text_decoder.h"
+#include <boost/test/unit_test.hpp>
+
+
+BOOST_AUTO_TEST_CASE(strip_invalid_characters_for_xml_test)
+{
+       /* Shorthand for the function under test */
+       auto strip = [](char const* input) {
+               return TextDecoder::remove_invalid_characters_for_xml(input);
+       };
+
+       /* Plain ASCII comes back untouched */
+       BOOST_CHECK_EQUAL(strip("hello world"), "hello world");
+
+       /* A lone control code (0x0c) is removed */
+       BOOST_CHECK_EQUAL(strip("hello\x0cworld"), "helloworld");
+
+       /* Multi-byte characters around a control code (0x02) are kept */
+       BOOST_CHECK_EQUAL(strip("𒀖hello\x02worl𒁝d"), "𒀖helloworl𒁝d");
+
+       /* A mixture of multi-byte characters: the tab (0x09) stays, 0x1a and 0x02 go */
+       BOOST_CHECK_EQUAL(
+               strip("😀œ´®†¥¨ˆø\x09π¬˚∆\x1a˙©ƒ∂ßåΩ≈ç√∫\x02˜µ≤ユーザーコードa"),
+               "😀œ´®†¥¨ˆø\x09π¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa"
+               );
+}
index 5e06ed5e21131d5da406bcdbf1f5738995655114..949f69019eb458a2011053a3d82b54491935316e 100644 (file)
@@ -149,6 +149,7 @@ def build(bld):
                  subtitle_timing_test.cc
                  subtitle_trim_test.cc
                  test.cc
+                 text_decoder_test.cc
                  threed_test.cc
                  time_calculation_test.cc
                  torture_test.cc