Fix format of versions in .pc files.
[libsub.git] / src / web_vtt_reader.cc
1 /*
2     Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18 */
19
20
21 #include "exceptions.h"
22 #include "subrip_reader.h"
23 #include "util.h"
24 #include "web_vtt_reader.h"
25 #include <boost/algorithm/string.hpp>
26 #include <boost/algorithm/string_regex.hpp>
27 #include <boost/bind.hpp>
28 #include <boost/regex.hpp>
29 #include <iostream>
30 #include <vector>
31
32
33 using std::function;
34 using std::string;
35 using std::vector;
36 using boost::optional;
37 using namespace sub;
38
39
40 WebVTTReader::WebVTTReader(FILE* file)
41 {
42         this->read(boost::bind(&get_line_file, file));
43 }
44
45
46 WebVTTReader::WebVTTReader(string subs)
47 {
48         this->read(boost::bind(&get_line_string, &subs));
49 }
50
51
52 void
53 WebVTTReader::read(std::function<optional<string> ()> get_line)
54 {
55         enum class State {
56                 /* expecting WEBVTT */
57                 HEADER,
58                 /* awaiting a NOTE, some other metadata, or a subtitle timing line */
59                 DATA,
60                 /* reading the text of a subtitle */
61                 SUBTITLE,
62                 /* reading a note */
63                 NOTE
64         } state = State::HEADER;
65
66         RawSubtitle rs;
67
68         rs.vertical_position.line = 0;
69         rs.vertical_position.reference = TOP_OF_SUBTITLE;
70
71         while (true) {
72                 auto line = get_line();
73                 if (!line) {
74                         break;
75                 }
76
77                 trim_right_if(*line, boost::is_any_of("\n\r"));
78                 remove_unicode_bom(line);
79
80                 /* Keep some history in case there is an error to report */
81                 _context.push_back(*line);
82                 if (_context.size() > 5) {
83                         _context.pop_front();
84                 }
85
86                 switch (state) {
87                 case State::HEADER:
88                         if (!boost::starts_with(*line, "WEBVTT")) {
89                                 throw WebVTTError("No WEBVTT header found");
90                         }
91                         state = State::DATA;
92                         break;
93                 case State::DATA:
94                         if (boost::starts_with(*line, "NOTE")) {
95                                 state = State::NOTE;
96                         } else if (line->find("-->") != string::npos) {
97                                 /* Further trim this line, removing spaces from the end */
98                                 trim_right_if(*line, boost::is_any_of(" "));
99
100                                 vector<string> parts;
101                                 boost::algorithm::split(parts, *line, boost::algorithm::is_any_of(" "), boost::token_compress_on);
102
103                                 if (parts.size() != 3 && parts.size() != 7) {
104                                         for (int i = 0; i < 2; ++i) {
105                                                 auto ex = get_line();
106                                                 if (ex) {
107                                                         _context.push_back(*ex);
108                                                 }
109                                         }
110                                         throw WebVTTError(*line, "a time line", _context);
111                                 }
112
113                                 string expected;
114                                 auto from = SubripReader::convert_time(parts[0], ".", &expected);
115                                 if (!from) {
116                                         throw WebVTTError(parts[0], expected, _context);
117                                 }
118                                 rs.from = *from;
119
120                                 auto to = SubripReader::convert_time(parts[2], ".", &expected);
121                                 if (!to) {
122                                         throw WebVTTError(parts[2], expected, _context);
123                                 }
124                                 rs.to = *to;
125
126                                 rs.vertical_position.line = 0;
127                                 state = State::SUBTITLE;
128                         }
129                         break;
130                 case State::SUBTITLE:
131                         if (line->empty()) {
132                                 state = State::DATA;
133                         } else {
134                                 /* Split up this line on unicode "LINE SEPARATOR".  This feels hacky but also
135                                  * the least unpleasant place to do it.
136                                  */
137                                 vector<string> sub_lines;
138                                 boost::algorithm::split_regex(sub_lines, *line, boost::regex("\xe2\x80\xa8"));
139                                 for (auto sub_line: sub_lines) {
140                                         rs.text = sub_line;
141                                         _subs.push_back(rs);
142                                         rs.vertical_position.line = rs.vertical_position.line.get() + 1;
143                                 }
144                         }
145                         break;
146                 case State::NOTE:
147                         if (line->empty()) {
148                                 state = State::DATA;
149                         }
150                         break;
151                 }
152
153         }
154 }
155