summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Carl Hetherington <cth@carlh.net> 2015-09-22 16:15:08 +0100
committer: Carl Hetherington <cth@carlh.net> 2015-09-22 16:15:08 +0100
commit: bdbe925a467f9b7149322ad8d1c090d4c1e6d5c3 (patch)
tree: 007f81fe339a08e11afea6b567b79d75a11ad41f
parent: a9370ddf1d55ca01307c086950d2294611d9e6a6 (diff)
Use uchardet to guess encoding of subtitle files and reject non-UTF-8.
-rw-r--r-- src/lib/exceptions.h 8
-rw-r--r-- src/lib/subrip.cc 23
-rw-r--r-- src/tools/wscript 1
-rw-r--r-- test/wscript 2
-rw-r--r-- wscript 2
5 files changed, 35 insertions, 1 deletions
diff --git a/src/lib/exceptions.h b/src/lib/exceptions.h
index 7240611ee..6939f81a3 100644
--- a/src/lib/exceptions.h
+++ b/src/lib/exceptions.h
@@ -263,4 +263,12 @@ public:
ProgrammingError (std::string file, int line);
};
+class TextEncodingError : public StringError /* error raised for text in an unsupported character encoding */
+{
+public:
+ TextEncodingError (std::string s) /* s: message describing the encoding problem */
+ : StringError (s) /* hand the message to the StringError base */
+ {}
+};
+
#endif
diff --git a/src/lib/subrip.cc b/src/lib/subrip.cc
index f19867952..d4adee428 100644
--- a/src/lib/subrip.cc
+++ b/src/lib/subrip.cc
@@ -23,10 +23,14 @@
#include "subrip_content.h"
#include <sub/subrip_reader.h>
#include <sub/collect.h>
+#include <uchardet/uchardet.h>
+#include <iostream>
#include "i18n.h"
using std::vector;
+using std::cout;
+using std::string;
using boost::shared_ptr;
SubRip::SubRip (shared_ptr<const SubRipContent> content)
@@ -36,6 +40,25 @@ SubRip::SubRip (shared_ptr<const SubRipContent> content)
throw OpenFileError (content->path (0));
}
+ /* Guess the encoding */
+ uchardet_t det = uchardet_new ();
+ char buffer[1024];
+ while (!feof (f)) {
+ int const n = fread (buffer, 1, sizeof (buffer), f);
+ if (uchardet_handle_data (det, buffer, n)) {
+ break;
+ }
+ }
+
+ uchardet_data_end (det);
+ string charset = uchardet_get_charset (det);
+ uchardet_delete (det);
+
+ if (charset != "UTF-8") {
+ throw TextEncodingError (_("unrecognised character set; please use files encoded in UTF-8"));
+ }
+
+ rewind (f);
sub::SubripReader reader (f);
_subtitles = sub::collect<vector<sub::Subtitle> > (reader.subtitles ());
}
diff --git a/src/tools/wscript b/src/tools/wscript
index 33a631e6e..b01eee7ca 100644
--- a/src/tools/wscript
+++ b/src/tools/wscript
@@ -29,6 +29,7 @@ def configure(conf):
def build(bld):
uselib = 'BOOST_THREAD BOOST_DATETIME OPENJPEG DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC '
uselib += 'AVUTIL SWSCALE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB MAGICK SNDFILE SAMPLERATE BOOST_REGEX '
+ uselib += 'UCHARDET '
if bld.env.TARGET_WINDOWS:
uselib += 'WINSOCK2'
diff --git a/test/wscript b/test/wscript
index 1a1038e8d..a92e344eb 100644
--- a/test/wscript
+++ b/test/wscript
@@ -31,7 +31,7 @@ def build(bld):
obj = bld(features='cxx cxxprogram')
obj.name = 'unit-tests'
obj.uselib = 'BOOST_TEST BOOST_THREAD BOOST_FILESYSTEM BOOST_DATETIME SNDFILE SAMPLERATE DCP OPENJPEG FONTCONFIG CAIROMM PANGOMM XMLPP '
- obj.uselib += 'AVFORMAT AVFILTER AVCODEC AVUTIL SWSCALE POSTPROC CXML MAGICK SUB GLIB CURL SSH XMLSEC BOOST_REGEX '
+ obj.uselib += 'AVFORMAT AVFILTER AVCODEC AVUTIL SWSCALE POSTPROC CXML MAGICK SUB GLIB CURL SSH XMLSEC BOOST_REGEX UCHARDET '
if bld.env.TARGET_WINDOWS:
obj.uselib += 'WINSOCK2'
obj.use = 'libdcpomatic2'
diff --git a/wscript b/wscript
index 3e84dcac5..150e9cb16 100644
--- a/wscript
+++ b/wscript
@@ -159,6 +159,8 @@ def configure(conf):
else:
conf.check_cfg(package='libcurl', args='--cflags --libs', uselib_store='CURL', mandatory=True)
+ # uchardet
+ conf.check_cfg(package='uchardet', args='--cflags --libs', uselib_store='UCHARDET', mandatory=True)
# libsndfile
conf.check_cfg(package='sndfile', args='--cflags --libs', uselib_store='SNDFILE', mandatory=True)