/*
- Copyright (C) 2014-2015 Carl Hetherington <cth@carlh.net>
+ Copyright (C) 2014-2020 Carl Hetherington <cth@carlh.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
* @brief SubripReader class.
*/
-#include "subrip_reader.h"
+
+#include "compose.hpp"
#include "exceptions.h"
+#include "raw_convert.h"
+#include "ssa_reader.h"
+#include "sub_assert.h"
+#include "subrip_reader.h"
#include "util.h"
-#include <locked_sstream.h>
#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string_regex.hpp>
+#include <boost/bind.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>
-#include <boost/bind.hpp>
#include <cstdio>
-#include <vector>
#include <iostream>
+#include <vector>
+
using std::string;
using std::vector;
-using std::list;
using std::cout;
using std::hex;
using boost::lexical_cast;
using boost::to_upper;
using boost::optional;
using boost::function;
+using boost::algorithm::replace_all;
using namespace sub;
/** @param s Subtitle string encoded in UTF-8 */
-SubripReader::SubripReader (string const & s)
+SubripReader::SubripReader (string s)
{
- locked_stringstream str (s);
- this->read (boost::bind (&get_line_stringstream, &str));
+ this->read (boost::bind(&get_line_string, &s)); /* bound to the address of the by-value parameter s, which stays alive until read() returns */
}
/** @param f Subtitle file encoded in UTF-8 */
RawSubtitle rs;
- /* This reader extracts no information about where the subtitle
- should be on screen, so its reference is TOP_OF_SUBTITLE.
- */
rs.vertical_position.line = 0;
rs.vertical_position.reference = TOP_OF_SUBTITLE;
while (true) {
- optional<string> line = get_line ();
+ auto line = get_line ();
if (!line) {
break;
}
rs.italic = false;
rs.underline = false;
rs.vertical_position.line = 0;
+ rs.vertical_position.reference = TOP_OF_SUBTITLE;
}
break;
case METADATA:
throw SubripError (*line, "a time/position line", _context);
}
- rs.from = convert_time (p[0]);
- rs.to = convert_time (p[2]);
+ string expected;
+ auto from = convert_time(p[0], ",", &expected);
+ if (!from) {
+ throw SubripError(p[0], expected, _context);
+ }
+ rs.from = *from;
+
+ auto to = convert_time(p[2], ",", &expected);
+ if (!to) {
+ throw SubripError(p[2], expected, _context);
+ }
+ rs.to = *to;
/* XXX: should not ignore coordinate specifications */
if (line->empty ()) {
state = COUNTER;
} else {
- convert_line (*line, rs);
- rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+ vector<string> sub_lines;
+ /* Split up this line on unicode "LINE SEPARATOR". This feels hacky but also
+ * the least unpleasant place to do it.
+ */
+ boost::algorithm::split_regex(sub_lines, *line, boost::regex("\xe2\x80\xa8"));
+ for (auto sub_line: sub_lines) {
+ convert_line(sub_line, rs);
+ rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+ rs.text.clear();
+ }
}
break;
}
}
}
-Time
-SubripReader::convert_time (string t)
+/** Parse a time written as h:m:s<sep>ms.
+ * @param t String to parse.
+ * @param milliseconds_separator Characters accepted between the seconds and the
+ * milliseconds; the seconds field is split on any one of them.
+ * @param expected If non-null, set to a description of what was expected when
+ * parsing fails; not written to on success.
+ * @return The parsed time, or an empty optional if t is not in the expected format
+ * or one of its fields is not an integer.
+ */
+optional<Time>
+SubripReader::convert_time(string t, string milliseconds_separator, string* expected)
{
+ auto report_expected = [expected](string const& s) {
+ if (expected) {
+ *expected = s;
+ }
+ };
+
vector<string> a;
boost::algorithm::split (a, t, boost::is_any_of (":"));
if (a.size() != 3) {
- throw SubripError (t, "time in the format h:m:s,ms", _context);
+ /* Name the separator we were asked to accept, not a hard-coded comma */
+ report_expected(String::compose("time in the format h:m:s%1ms", milliseconds_separator));
+ return {};
}
vector<string> b;
- boost::algorithm::split (b, a[2], boost::is_any_of (","));
-
- return Time::from_hms (
- lexical_cast<int> (a[0]),
- lexical_cast<int> (a[1]),
- lexical_cast<int> (b[0]),
- lexical_cast<int> (b[1])
- );
+ boost::algorithm::split(b, a[2], boost::is_any_of(milliseconds_separator));
+ if (b.size() != 2) {
+ report_expected(String::compose("time in the format h:m:s%1ms", milliseconds_separator));
+ return {};
+ }
+
+ /* Convert one field to an int; on failure report what was expected and yield nothing */
+ auto parse_field = [&report_expected](string const& field, char const* what) -> optional<int> {
+ try {
+ return lexical_cast<int>(field);
+ } catch (boost::bad_lexical_cast const&) {
+ report_expected(what);
+ return optional<int>();
+ }
+ };
+
+ /* Fields are parsed in order so that the first bad one is the one reported */
+ auto const h = parse_field(a[0], "integer hour value");
+ if (!h) {
+ return {};
+ }
+
+ auto const m = parse_field(a[1], "integer minute value");
+ if (!m) {
+ return {};
+ }
+
+ auto const s = parse_field(b[0], "integer second value");
+ if (!s) {
+ return {};
+ }
+
+ auto const ms = parse_field(b[1], "integer millisecond value");
+ if (!ms) {
+ return {};
+ }
+
+ return Time::from_hms (*h, *m, *s, *ms);
}
+/** Parse one line of subtitle text, handling inline markup.
+ *
+ * Plain characters are appended to p.text. On <b>, <i>, <u>, <font ...> and their
+ * closing forms (also {b}, {i}, {u} and closers) maybe_content(p) is called and then
+ * the corresponding style in p is changed. Blocks starting with {\ are passed to
+ * SSAReader::parse_style. Tags are matched case-insensitively. maybe_content(p) is
+ * called once more at the end of the line.
+ *
+ * @param t Line of text to parse.
+ * @param p Subtitle state to update.
+ * @throws SubripError if a <font tag has no colour in a recognised format.
+ */
void
SubripReader::convert_line (string t, RawSubtitle& p)
{
- enum {
- TEXT,
- TAG
- } state = TEXT;
-
- string tag;
-
- list<Colour> colours;
+ /* Stack of active colours; the first entry is the default and is never popped */
+ vector<Colour> colours;
colours.push_back (Colour (1, 1, 1));
- /* XXX: missing <font> support */
- /* XXX: nesting of tags e.g. <b>foo<i>bar<b>baz</b>fred</i>jim</b> might
- not work, I think.
- */
+ /* Case-insensitively test whether `line` contains `s` at `index`; if it does,
+ advance `index` past it. `line` is taken by reference as this is called
+ several times per character.
+ */
+ auto has_next = [](string const& line, size_t& index, string s) {
+ boost::to_lower(s);
+ auto next = line.substr(index, s.size());
+ boost::to_lower(next);
+ if (next != s) {
+ return false;
+ }
- for (size_t i = 0; i < t.size(); ++i) {
- switch (state) {
- case TEXT:
- if (t[i] == '<' || t[i] == '{') {
- state = TAG;
- } else {
- p.text += t[i];
+ index += s.size();
+ return true;
+ };
+
+ size_t i = 0;
+ while (i < t.size()) {
+ if (has_next(t, i, "<b>") || has_next(t, i, "{b}")) {
+ maybe_content (p);
+ p.bold = true;
+ } else if (has_next(t, i, "</b>") || has_next(t, i, "{/b}")) {
+ maybe_content (p);
+ p.bold = false;
+ } else if (has_next(t, i, "<i>") || has_next(t, i, "{i}")) {
+ maybe_content (p);
+ p.italic = true;
+ } else if (has_next(t, i, "</i>") || has_next(t, i, "{/i}")) {
+ maybe_content (p);
+ p.italic = false;
+ } else if (has_next(t, i, "<u>") || has_next(t, i, "{u}")) {
+ maybe_content (p);
+ p.underline = true;
+ } else if (has_next(t, i, "</u>") || has_next(t, i, "{/u}")) {
+ maybe_content (p);
+ p.underline = false;
+ } else if (has_next(t, i, "<font")) {
+ maybe_content (p);
+ boost::regex re (".*color=\"?#([[:xdigit:]]+)\"?");
+ boost::smatch match;
+ /* Collect the rest of the tag, up to but not including the closing > */
+ string tag;
+ while (i < t.size() && t[i] != '>') {
+ tag += t[i];
+ ++i;
}
- break;
- case TAG:
- if (t[i] == '>' || t[i] == '}') {
- if (tag == "b") {
- maybe_content (p);
- p.bold = true;
- } else if (tag == "/b") {
- maybe_content (p);
- p.bold = false;
- } else if (tag == "i") {
- maybe_content (p);
- p.italic = true;
- } else if (tag == "/i") {
- maybe_content (p);
- p.italic = false;
- } else if (tag == "u") {
- maybe_content (p);
- p.underline = true;
- } else if (tag == "/u") {
- maybe_content (p);
- p.underline = false;
- } else if (boost::starts_with (tag, "font")) {
- maybe_content (p);
- boost::regex re (".*color=\"#([0123456789abcdef]+)\"");
- boost::smatch match;
- if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) {
- p.colour = Colour::from_rgb_hex (match[1]);
- colours.push_back (p.colour);
- }
- } else if (tag == "/font") {
- maybe_content (p);
- colours.pop_back ();
- p.colour = colours.back ();
- }
- tag.clear ();
- state = TEXT;
+ /* Step over the closing > */
+ ++i;
+ if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) {
+ p.colour = Colour::from_rgb_hex (match[1]);
+ colours.push_back (p.colour);
} else {
- tag += t[i];
+ re = boost::regex (
+ ".*color=\"rgba\\("
+ "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+ "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+ "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+ "[[:space:]]*([[:digit:]]+)[[:space:]]*"
+ "\\)\""
+ );
+ if (boost::regex_search (tag, match, re) && match.size() == 5) {
+ /* The fourth (alpha) value is matched but not used */
+ p.colour.r = raw_convert<int>(string(match[1])) / 255.0;
+ p.colour.g = raw_convert<int>(string(match[2])) / 255.0;
+ p.colour.b = raw_convert<int>(string(match[3])) / 255.0;
+ colours.push_back (p.colour);
+ } else {
+ throw SubripError (tag, "a colour in the format #rrggbb or rgba(rr,gg,bb,aa)", _context);
+ }
}
- break;
+ } else if (has_next(t, i, "</font>")) {
+ maybe_content (p);
+ /* Only pop a colour that a <font> pushed: an unmatched </font> must not
+ remove the default entry, otherwise back() below would be called on
+ an empty vector.
+ */
+ if (colours.size() > 1) {
+ colours.pop_back ();
+ }
+ SUB_ASSERT (!colours.empty());
+ p.colour = colours.back ();
+ } else if (has_next(t, i, "{\\")) {
+ /* Collect everything up to the closing } and hand it to the SSA style parser */
+ string ssa = "\\";
+ while (i < t.size() && t[i] != '}') {
+ ssa += t[i];
+ ++i;
+ }
+ /* Step over the closing } */
+ ++i;
+ SSAReader::parse_style (p, ssa, 288, 288, Colour(1, 1, 1));
+ } else {
+ p.text += t[i];
+ ++i;
}
}
+ /* Strip Unicode U+202B (right-to-left embedding) as sometimes it is rendered
+ as a missing character. This may be a hack.
+ NOTE(review): this only touches what is in p.text at the end of the line;
+ confirm whether text already handed to maybe_content() above needs the same.
+ */
+ replace_all (p.text, "\xe2\x80\xab", "");
+
maybe_content (p);
}