Skip Unicode BOM at the start of subrip files.
[libsub.git] / src / subrip_reader.cc
1 /*
2     Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18 */
19
20 #include "subrip_reader.h"
21 #include "exceptions.h"
22 #include <boost/algorithm/string.hpp>
23 #include <boost/lexical_cast.hpp>
24 #include <cstdio>
25 #include <vector>
26
27 using std::string;
28 using std::vector;
29 using boost::lexical_cast;
30 using namespace sub;
31
32 SubripReader::SubripReader (FILE* f)
33 {
34         enum {
35                 COUNTER,
36                 METADATA,
37                 CONTENT
38         } state = COUNTER;
39
40         char buffer[256];
41
42         Time from;
43         Time to;
44
45         string line;
46         int line_number = 0;
47
48         while (!feof (f)) {
49                 char* r = fgets (buffer, sizeof (buffer), f);
50                 if (r == 0 || feof (f)) {
51                         break;
52                 }
53
54                 line = string (buffer);
55                 trim_right_if (line, boost::is_any_of ("\n\r"));
56
57                 if (
58                         line.length() >= 3 &&
59                         static_cast<unsigned char> (line[0]) == 0xef &&
60                         static_cast<unsigned char> (line[1]) == 0xbb &&
61                         static_cast<unsigned char> (line[2]) == 0xbf
62                         ) {
63                         
64                         /* Skip Unicode byte order mark */
65                         line = line.substr (3);
66                 }
67
68                 switch (state) {
69                 case COUNTER:
70                 {
71                         if (line.empty ()) {
72                                 /* a blank line at the start is ok */
73                                 break;
74                         }
75
76                         state = METADATA;
77                 }
78                 break;
79                 case METADATA:
80                 {
81                         vector<string> p;
82                         boost::algorithm::split (p, line, boost::algorithm::is_any_of (" "));
83                         if (p.size() != 3 && p.size() != 7) {
84                                 throw SubripError (line, "a time/position line");
85                         }
86
87                         from = convert_time (p[0]);
88                         to = convert_time (p[2]);
89
90                         /* XXX: should not ignore coordinate specifications */
91                         
92                         state = CONTENT;
93                         break;
94                 }
95                 case CONTENT:
96                         if (line.empty ()) {
97                                 state = COUNTER;
98                                 line_number = 0;
99                         } else {
100                                 convert_line (line, line_number, from, to);
101                                 line_number++;
102                         }
103                         break;
104                 }
105         }
106 }
107
108 Time
109 SubripReader::convert_time (string t)
110 {
111         vector<string> a;
112         boost::algorithm::split (a, t, boost::is_any_of (":"));
113         if (a.size() != 3) {
114                 throw SubripError (t, "time in the format h:m:s,ms");
115         }
116
117         vector<string> b;
118         boost::algorithm::split (b, a[2], boost::is_any_of (","));
119
120         return Time::from_hms (
121                 lexical_cast<int> (a[0]),
122                 lexical_cast<int> (a[1]),
123                 lexical_cast<int> (b[0]),
124                 lexical_cast<int> (b[1])
125                 );
126 }
127
128 void
129 SubripReader::convert_line (string t, int line_number, Time from, Time to)
130 {
131         enum {
132                 TEXT,
133                 TAG
134         } state = TEXT;
135         
136         string tag;
137
138         RawSubtitle p;
139         p.font = "Arial";
140         p.font_size.set_points (48);
141         p.from = from;
142         p.to = to;
143         p.vertical_position.line = line_number;
144         /* XXX: arbitrary */
145         p.vertical_position.lines = 32;
146         p.vertical_position.reference = TOP_OF_SUBTITLE;
147         
148         /* XXX: missing <font> support */
149         /* XXX: nesting of tags e.g. <b>foo<i>bar<b>baz</b>fred</i>jim</b> might
150            not work, I think.
151         */
152
153         for (size_t i = 0; i < t.size(); ++i) {
154                 switch (state) {
155                 case TEXT:
156                         if (t[i] == '<' || t[i] == '{') {
157                                 state = TAG;
158                         } else {
159                                 p.text += t[i];
160                         }
161                         break;
162                 case TAG:
163                         if (t[i] == '>' || t[i] == '}') {
164                                 if (tag == "b") {
165                                         maybe_content (p);
166                                         p.bold = true;
167                                 } else if (tag == "/b") {
168                                         maybe_content (p);
169                                         p.bold = false;
170                                 } else if (tag == "i") {
171                                         maybe_content (p);
172                                         p.italic = true;
173                                 } else if (tag == "/i") {
174                                         maybe_content (p);
175                                         p.italic = false;
176                                 } else if (tag == "u") {
177                                         maybe_content (p);
178                                         p.underline = true;
179                                 } else if (tag == "/u") {
180                                         maybe_content (p);
181                                         p.underline = false;
182                                 }
183                                 tag.clear ();
184                                 state = TEXT;
185                         } else {
186                                 tag += t[i];
187                         }
188                         break;
189                 }
190         }
191
192         maybe_content (p);
193 }
194
195 void
196 SubripReader::maybe_content (RawSubtitle& p)
197 {
198         if (!p.text.empty ()) {
199                 _subs.push_back (p);
200                 p.text.clear ();
201         }
202 }