summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCarl Hetherington <cth@carlh.net>2015-09-22 16:15:08 +0100
committerCarl Hetherington <cth@carlh.net>2015-09-22 16:15:08 +0100
commitbdbe925a467f9b7149322ad8d1c090d4c1e6d5c3 (patch)
tree007f81fe339a08e11afea6b567b79d75a11ad41f /src
parenta9370ddf1d55ca01307c086950d2294611d9e6a6 (diff)
Use uchardet to guess encoding of subtitle files and reject non-UTF-8.
Diffstat (limited to 'src')
-rw-r--r--src/lib/exceptions.h8
-rw-r--r--src/lib/subrip.cc23
-rw-r--r--src/tools/wscript1
3 files changed, 32 insertions, 0 deletions
diff --git a/src/lib/exceptions.h b/src/lib/exceptions.h
index 7240611ee..6939f81a3 100644
--- a/src/lib/exceptions.h
+++ b/src/lib/exceptions.h
@@ -263,4 +263,12 @@ public:
ProgrammingError (std::string file, int line);
};
+class TextEncodingError : public StringError /* Exception type for text-encoding problems.
+   The constructor takes a message `s' and forwards it unchanged to the
+   StringError base class; the class adds no state or behaviour of its own. */
+{
+public:
+ TextEncodingError (std::string s)
+ : StringError (s)
+ {}
+};
+
#endif
diff --git a/src/lib/subrip.cc b/src/lib/subrip.cc
index f19867952..d4adee428 100644
--- a/src/lib/subrip.cc
+++ b/src/lib/subrip.cc
@@ -23,10 +23,14 @@
#include "subrip_content.h"
#include <sub/subrip_reader.h>
#include <sub/collect.h>
+#include <uchardet/uchardet.h>
+#include <iostream>
#include "i18n.h"
using std::vector;
+using std::cout;
+using std::string;
using boost::shared_ptr;
SubRip::SubRip (shared_ptr<const SubRipContent> content)
@@ -36,6 +40,25 @@ SubRip::SubRip (shared_ptr<const SubRipContent> content)
throw OpenFileError (content->path (0));
}
+ /* Guess the encoding: feed the already-open stream `f' to uchardet in chunks
+    of up to 1024 bytes, stopping early if uchardet_handle_data returns
+    non-zero.  The charset name reported by uchardet is then compared with
+    "UTF-8"; any other name causes a TextEncodingError to be thrown.  On
+    success the stream is rewound to the start.
+
+    NOTE(review): the loop exits only on feof() or a non-zero result from
+    uchardet_handle_data; confirm it cannot spin if fread fails without
+    reaching end-of-file (ferror is not checked here).
+    NOTE(review): no fclose (f) is visible on the throw path in these lines;
+    confirm the stream is released elsewhere.
+    NOTE(review): only the exact name "UTF-8" is accepted; confirm what
+    uchardet reports for pure-ASCII or undetectable input, and whether such
+    files should really be rejected -- TODO verify against uchardet docs.
+ */
+ uchardet_t det = uchardet_new ();
+ char buffer[1024];
+ while (!feof (f)) {
+ int const n = fread (buffer, 1, sizeof (buffer), f);
+ if (uchardet_handle_data (det, buffer, n)) {
+ break;
+ }
+ }
+
+ uchardet_data_end (det);
+ string charset = uchardet_get_charset (det);
+ uchardet_delete (det);
+
+ if (charset != "UTF-8") {
+ throw TextEncodingError (_("unrecognised character set; please use files encoded in UTF-8"));
+ }
+
+ rewind (f);
sub::SubripReader reader (f);
_subtitles = sub::collect<vector<sub::Subtitle> > (reader.subtitles ());
}
diff --git a/src/tools/wscript b/src/tools/wscript
index 33a631e6e..b01eee7ca 100644
--- a/src/tools/wscript
+++ b/src/tools/wscript
@@ -29,6 +29,7 @@ def configure(conf):
def build(bld):
uselib = 'BOOST_THREAD BOOST_DATETIME OPENJPEG DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC '
uselib += 'AVUTIL SWSCALE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB MAGICK SNDFILE SAMPLERATE BOOST_REGEX '
+ uselib += 'UCHARDET '
if bld.env.TARGET_WINDOWS:
uselib += 'WINSOCK2'