diff options
| author | Carl Hetherington <cth@carlh.net> | 2015-09-25 12:45:48 +0100 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2015-09-25 12:45:48 +0100 |
| commit | effc88be7dcf3e0848ed9dab8010e8c20cf4bb38 (patch) | |
| tree | 2e1571225d0835176077054da8a4055d6a9f869b | |
| parent | 86b6bee8957d6ec010d235f1ae83f1ec33a646c1 (diff) | |
Use libicu instead of uchardet and convert subrip files to UTF-8.
| -rw-r--r-- | ChangeLog | 5 | ||||
| -rw-r--r-- | cscript | 12 | ||||
| -rw-r--r-- | src/lib/subrip.cc | 46 | ||||
| -rw-r--r-- | wscript | 3 |
4 files changed, 35 insertions, 31 deletions
@@ -1,3 +1,8 @@ +2015-09-25 Carl Hetherington <cth@carlh.net> + + * Detect and convert from non-UTF-8 + subtitle encodings. + 2015-09-21 Carl Hetherington <cth@carlh.net> * Version 2.3.5 released. @@ -46,7 +46,7 @@ deb_depends['12.04'] = {'libc6': '2.15', 'libcurl3': '7.22.0-3ubuntu4', 'libzip2': '0.10-1ubuntu1', 'libsamplerate0': '0.1.8-4', - 'libuchardet0': '0.0.1-1'} + 'libicu48': '4.8.1.1-3'} deb_depends['14.04'] = {'libc6': '2.19-0ubuntu6', 'libssh-4': '0.6.1-0ubuntu3', @@ -63,7 +63,7 @@ deb_depends['14.04'] = {'libc6': '2.19-0ubuntu6', 'libcurl3': '7.35.0-1ubuntu2', 'libzip2': '0.10.1-1.2', 'libsamplerate0': '0.1.8-7', - 'libuchardet0': '0.0.1-1ubuntu1'} + 'libicu52': '52.1-3'} deb_depends['15.04'] = {'libc6': '2.21-0ubuntu4', 'libssh-4': '0.6.3-3ubuntu3', @@ -81,7 +81,7 @@ deb_depends['15.04'] = {'libc6': '2.21-0ubuntu4', 'libzip2': '0.11.2-1.2', 'libwxgtk3.0-0': '3.0.2-1', 'libsamplerate0': '0.1.8-8', - 'libuchardet0': '0.0.1-1ubuntu1'} + 'libicu52': '52.1-8'} deb_depends['7'] = {'libc6': '2.13', 'libssh-4': '0.5.4', @@ -100,7 +100,7 @@ deb_depends['7'] = {'libc6': '2.13', 'libcairomm-1.0-1': '1.10.0-1', 'libpangomm-1.4-1': '2.28.4-1', 'libsamplerate0': '0.1.8-5', - 'libuchardet': '0.0.1-1'} + 'libicu48': '4.8.1.1-12+deb7u3'} deb_depends['8'] = {'libc6': '2.19-18', 'libssh-4': '0.6.3-4', @@ -120,7 +120,7 @@ deb_depends['8'] = {'libc6': '2.19-18', 'libxcb-xfixes0': '1.10', 'libxcb-shape0': '1.10', 'libsamplerate0': '0.1.8-8', - 'libuchardet': '0.0.1-1'} + 'libicu52': '52.1-8+deb8u2'} deb_depends['unstable'] = {'libc6': '2.13', 'libssh-4': '0.5.4', @@ -137,7 +137,7 @@ deb_depends['unstable'] = {'libc6': '2.13', 'libcurl3': '7.26.0', 'libzip2': '0.10.1', 'libsamplerate0': '0.1.8-8', - 'libuchardet': '0.0.1-1'} + 'libicu52': '52.1-9'} def packages(name, packages, f): s = '%s: ' % name diff --git a/src/lib/subrip.cc b/src/lib/subrip.cc index d4adee428..6df8b236b 100644 --- a/src/lib/subrip.cc +++ b/src/lib/subrip.cc @@ -21,9 +21,11 @@ #include "cross.h" #include "exceptions.h" #include "subrip_content.h" +#include "data.h" #include <sub/subrip_reader.h> #include <sub/collect.h> -#include <uchardet/uchardet.h> +#include <unicode/ucsdet.h> +#include <unicode/ucnv.h> #include <iostream> #include "i18n.h" @@ -32,34 +34,34 @@ using std::vector; using std::cout; using std::string; using boost::shared_ptr; +using boost::scoped_array; SubRip::SubRip (shared_ptr<const SubRipContent> content) { - FILE* f = fopen_boost (content->path (0), "r"); - if (!f) { - throw OpenFileError (content->path (0)); - } + Data in (content->path (0)); - /* Guess the encoding */ - uchardet_t det = uchardet_new (); - char buffer[1024]; - while (!feof (f)) { - int const n = fread (buffer, 1, sizeof (buffer), f); - if (uchardet_handle_data (det, buffer, n)) { - break; - } - } + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open (&status); + ucsdet_setText (detector, reinterpret_cast<const char *> (in.data().get()), in.size(), &status); - uchardet_data_end (det); - string charset = uchardet_get_charset (det); - uchardet_delete (det); + UCharsetMatch const * match = ucsdet_detect (detector, &status); + char const * in_charset = ucsdet_getName (match, &status); - if (charset != "UTF-8") { - throw TextEncodingError (_("unrecognised character set; please use files encoded in UTF-8")); - } + UConverter* to_utf16 = ucnv_open (in_charset, &status); + /* This is a guess; I think we should be able to encode any input in 4 times its input size */ + scoped_array<uint16_t> utf16 (new uint16_t[in.size() * 2]); + int const utf16_len = ucnv_toUChars (to_utf16, utf16.get(), in.size() * 2, reinterpret_cast<const char *> (in.data().get()), in.size(), &status); + + UConverter* to_utf8 = ucnv_open ("UTF-8", &status); + /* Another guess */ + scoped_array<char> utf8 (new char[utf16_len * 2]); + ucnv_fromUChars (to_utf8, utf8.get(), utf16_len * 2, utf16.get(), utf16_len, &status); + + ucsdet_close (detector); + ucnv_close (to_utf16); + ucnv_close (to_utf8); - rewind (f); - sub::SubripReader reader (f); + sub::SubripReader reader (utf8.get()); _subtitles = sub::collect<vector<sub::Subtitle> > (reader.subtitles ()); } @@ -159,9 +159,6 @@ def configure(conf): else: conf.check_cfg(package='libcurl', args='--cflags --libs', uselib_store='CURL', mandatory=True) - # uchardet - conf.check_cfg(package='uchardet', args='--cflags --libs', uselib_store='UCHARDET', mandatory=True) - # libsndfile conf.check_cfg(package='sndfile', args='--cflags --libs', uselib_store='SNDFILE', mandatory=True) |
