diff options
| -rw-r--r-- | src/web_vtt_reader.cc | 157 | ||||
| -rw-r--r-- | src/web_vtt_reader.h | 49 | ||||
| -rw-r--r-- | src/wscript | 2 | ||||
| -rw-r--r-- | test/data/test.vtt | 11 | ||||
| -rw-r--r-- | test/webvtt_reader_test.cc | 108 | ||||
| -rw-r--r-- | test/wscript | 1 |
6 files changed, 328 insertions, 0 deletions
diff --git a/src/web_vtt_reader.cc b/src/web_vtt_reader.cc
new file mode 100644
index 0000000..2781654
--- /dev/null
+++ b/src/web_vtt_reader.cc
@@ -0,0 +1,178 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+
+#include "exceptions.h"
+#include "subrip_reader.h"
+#include "util.h"
+#include "web_vtt_reader.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string_regex.hpp>
+#include <boost/bind.hpp>
+#include <boost/regex.hpp>
+#include <iostream>
+#include <vector>
+
+
+using std::function;
+using std::string;
+using std::vector;
+using boost::optional;
+using namespace sub;
+
+
+/** Read WebVTT subtitles from an open file.
+ *  @param file Stream to read from; it is passed to get_line_file() and is not closed here.
+ */
+WebVTTReader::WebVTTReader(FILE* file)
+{
+    this->read(boost::bind(&get_line_file, file));
+}
+
+
+/** Read WebVTT subtitles from a string.
+ *  @param subs WebVTT text to parse; lines are pulled from it with get_line_string().
+ */
+WebVTTReader::WebVTTReader(string subs)
+{
+    this->read(boost::bind(&get_line_string, &subs));
+}
+
+
+/** Parse WebVTT one line at a time, adding a RawSubtitle to _subs for each line of cue text.
+ *  @param get_line Callable which returns the next line of input, or an empty optional at the end.
+ *  @throws WebVTTError if the first line does not start with WEBVTT, if a timing line cannot be
+ *  parsed, or if a line between cues is neither a note nor a timing line.
+ */
+void
+WebVTTReader::read(std::function<optional<string> ()> get_line)
+{
+    enum class State {
+        /* expecting WEBVTT */
+        HEADER,
+        /* awaiting a NOTE or a subtitle timing line */
+        DATA,
+        /* reading the text of a subtitle */
+        SUBTITLE,
+        /* reading a note */
+        NOTE
+    } state = State::HEADER;
+
+    /* UTF-8 encoding of unicode "LINE SEPARATOR"; built once here rather than for every line of text */
+    boost::regex const line_separator("\xe2\x80\xa8");
+
+    RawSubtitle rs;
+
+    rs.vertical_position.line = 0;
+    rs.vertical_position.reference = TOP_OF_SUBTITLE;
+
+    while (true) {
+        auto line = get_line();
+        if (!line) {
+            break;
+        }
+
+        trim_right_if(*line, boost::is_any_of("\n\r"));
+        remove_unicode_bom(line);
+
+        /* Keep some history in case there is an error to report */
+        _context.push_back(*line);
+        if (_context.size() > 5) {
+            _context.pop_front();
+        }
+
+        switch (state) {
+        case State::HEADER:
+            if (!boost::starts_with(*line, "WEBVTT")) {
+                throw WebVTTError("No WEBVTT header found");
+            }
+            state = State::DATA;
+            break;
+        case State::DATA:
+            if (boost::starts_with(*line, "NOTE")) {
+                state = State::NOTE;
+            } else if (line->find("-->") != string::npos) {
+                /* Further trim this line, removing spaces and tabs from the end */
+                trim_right_if(*line, boost::is_any_of(" \t"));
+
+                vector<string> parts;
+                boost::algorithm::split(parts, *line, boost::algorithm::is_any_of(" \t"), boost::token_compress_on);
+
+                /* A timing line is "<from> --> <to>", optionally followed by any number of
+                 * cue settings (which are ignored here).
+                 */
+                if (parts.size() < 3 || parts[1] != "-->") {
+                    for (int i = 0; i < 2; ++i) {
+                        auto ex = get_line();
+                        if (ex) {
+                            _context.push_back(*ex);
+                        }
+                    }
+                    throw WebVTTError(*line, "a time line", _context);
+                }
+
+                string expected;
+                auto from = SubripReader::convert_time(parts[0], &expected);
+                if (!from) {
+                    throw WebVTTError(parts[0], expected, _context);
+                }
+                rs.from = *from;
+
+                auto to = SubripReader::convert_time(parts[2], &expected);
+                if (!to) {
+                    throw WebVTTError(parts[2], expected, _context);
+                }
+                rs.to = *to;
+
+                /* Lines of text are numbered from 0 within each subtitle */
+                rs.vertical_position.line = 0;
+                state = State::SUBTITLE;
+            } else if (!line->empty()) {
+                /* NOTE(review): WebVTT also allows cue identifiers and STYLE / REGION blocks
+                 * here; this reader rejects them.
+                 */
+                throw WebVTTError(*line, "a note or time", _context);
+            }
+            break;
+        case State::SUBTITLE:
+            if (line->empty()) {
+                state = State::DATA;
+            } else {
+                /* Split up this line on unicode "LINE SEPARATOR". This feels hacky but also
+                 * the least unpleasant place to do it.
+                 */
+                vector<string> sub_lines;
+                boost::algorithm::split_regex(sub_lines, *line, line_separator);
+                for (auto const& sub_line: sub_lines) {
+                    rs.text = sub_line;
+                    _subs.push_back(rs);
+                    rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+                }
+            }
+            break;
+        case State::NOTE:
+            if (line->empty()) {
+                state = State::DATA;
+            }
+            break;
+        }
+
+    }
+}
+
diff --git a/src/web_vtt_reader.h b/src/web_vtt_reader.h
new file mode 100644
index 0000000..495e2bc
--- /dev/null
+++ b/src/web_vtt_reader.h
@@ -0,0 +1,59 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+
+#ifndef LIBSUB_WEB_VTT_READER_H
+#define LIBSUB_WEB_VTT_READER_H
+
+
+#include "reader.h"
+#include <boost/optional.hpp>
+#include <cstdio>
+#include <functional>
+#include <list>
+#include <string>
+
+
+namespace sub {
+
+
+/** Reader for subtitles in WebVTT format */
+class WebVTTReader : public Reader
+{
+public:
+    /** Read subtitles from an already-open file */
+    WebVTTReader(FILE* file);
+    /** Read subtitles from a string containing WebVTT text */
+    WebVTTReader(std::string subs);
+
+private:
+    /** Parse the lines obtained by calling get_line repeatedly.
+     *  @param get_line Callable returning the next line of input, or an empty optional at the end.
+     */
+    void read(std::function<boost::optional<std::string> ()> get_line);
+
+    /** The most recently read lines, kept for use in error reports */
+    std::list<std::string> _context;
+};
+
+
+}
+
+#endif
+
diff --git a/src/wscript b/src/wscript
index 4911117..ff029a0 100644
--- a/src/wscript
+++ b/src/wscript
@@ -38,6 +38,7 @@ def build(bld): util.cc vertical_reference.cc vertical_position.cc + web_vtt_reader.cc """ headers = """
@@ -63,6 +64,7 @@ def build(bld): subtitle.h vertical_position.h vertical_reference.h + web_vtt_reader.h """ bld.install_files('${PREFIX}/include/libsub%s/sub' % bld.env.API_VERSION, headers)
diff --git a/test/data/test.vtt b/test/data/test.vtt
new file mode 100644
index 0000000..461c8a2
--- /dev/null
+++ b/test/data/test.vtt
@@ -0,0 +1,11 @@
+WEBVTT - you can put something here
+
+NOTE You can have notes
+That span multiple lines
+
+00:00:41,090 --> 00:00:42,210
+This is a subtitle
+and that's a line break
+
+00:01:01,010 --> 00:01:02,100
+This is some stuff.
diff --git a/test/webvtt_reader_test.cc b/test/webvtt_reader_test.cc
new file mode 100644
index 0000000..024f89f
--- /dev/null
+++ b/test/webvtt_reader_test.cc
@@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include "web_vtt_reader.h"
+#include "subtitle.h"
+#include "test.h"
+#include "exceptions.h"
+#include "collect.h"
+#include <boost/test/unit_test.hpp>
+#include <boost/filesystem.hpp>
+#include <cmath>
+#include <iostream>
+#include <cstdio>
+
+
+using std::cerr;
+using std::vector;
+using std::fabs;
+
+
+/* Test reading of a VTT file: check the times, text, styling and line positions of both subtitles in test/data/test.vtt */
+BOOST_AUTO_TEST_CASE(vtt_reader_test)
+{
+    auto f = fopen("test/data/test.vtt", "r");
+    BOOST_REQUIRE(f); /* stop here rather than hand a null FILE* to the reader */
+    sub::WebVTTReader reader(f);
+    fclose(f);
+    auto subs = sub::collect<std::vector<sub::Subtitle>>(reader.subtitles());
+    auto i = subs.begin();
+
+
+    /* First subtitle */
+
+    BOOST_REQUIRE(i != subs.end());
+    BOOST_CHECK_EQUAL(i->from, sub::Time::from_hms(0, 0, 41, 90));
+    BOOST_CHECK_EQUAL(i->to, sub::Time::from_hms(0, 0, 42, 210));
+
+    auto j = i->lines.begin();
+    BOOST_REQUIRE(j != i->lines.end()); /* j is dereferenced below */
+    BOOST_REQUIRE_EQUAL(j->blocks.size(), 1U);
+    auto b = j->blocks[0];
+    BOOST_CHECK_EQUAL(b.text, "This is a subtitle");
+    /* No font is specified by WebVTT, so none should be seen here */
+    BOOST_CHECK(!b.font);
+    BOOST_CHECK(!b.font_size.specified());
+    BOOST_CHECK_EQUAL(b.bold, false);
+    BOOST_CHECK_EQUAL(b.italic, false);
+    BOOST_CHECK_EQUAL(b.underline, false);
+    BOOST_REQUIRE(j->vertical_position.line);
+    BOOST_CHECK_EQUAL(j->vertical_position.line.get(), 0);
+    BOOST_CHECK_EQUAL(j->vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+    ++j;
+
+    BOOST_REQUIRE(j != i->lines.end()); /* j is dereferenced below */
+    BOOST_REQUIRE_EQUAL(j->blocks.size(), 1U);
+    b = j->blocks[0];
+    BOOST_CHECK_EQUAL(b.text, "and that's a line break");
+    /* No font is specified by WebVTT, so none should be seen here */
+    BOOST_CHECK(!b.font);
+    BOOST_CHECK(!b.font_size.specified());
+    BOOST_CHECK_EQUAL(b.bold, false);
+    BOOST_CHECK_EQUAL(b.italic, false);
+    BOOST_CHECK_EQUAL(b.underline, false);
+    BOOST_REQUIRE(j->vertical_position.line);
+    BOOST_CHECK_EQUAL(j->vertical_position.line.get(), 1);
+    BOOST_CHECK_EQUAL(j->vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+    ++i;
+
+
+    /* Second subtitle */
+
+    BOOST_REQUIRE(i != subs.end());
+    BOOST_CHECK_EQUAL(i->from, sub::Time::from_hms(0, 1, 1, 10));
+    BOOST_CHECK_EQUAL(i->to, sub::Time::from_hms(0, 1, 2, 100));
+
+    BOOST_REQUIRE_EQUAL(i->lines.size(), 1U); /* lines[0] is read below */
+    sub::Line l = i->lines[0];
+    BOOST_REQUIRE(l.vertical_position.line);
+    BOOST_CHECK_EQUAL(l.vertical_position.line.get(), 0);
+    BOOST_CHECK_EQUAL(l.vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+
+    BOOST_REQUIRE_EQUAL(l.blocks.size(), 1U);
+    b = l.blocks[0];
+    BOOST_CHECK_EQUAL(b.text, "This is some stuff.");
+    /* No font is specified by WebVTT, so none should be seen here */
+    BOOST_CHECK(!b.font);
+    BOOST_CHECK(!b.font_size.specified());
+    BOOST_CHECK_EQUAL(b.bold, false);
+    BOOST_CHECK_EQUAL(b.italic, false);
+    BOOST_CHECK_EQUAL(b.underline, false);
+}
+
diff --git a/test/wscript b/test/wscript
index b7d91cb..4653bd0 100644
--- a/test/wscript
+++ b/test/wscript
@@ -31,6 +31,7 @@ def build(bld): time_test.cc test.cc vertical_position_test.cc + webvtt_reader_test.cc """ obj.target = 'tests' obj.install_path = '' |
