Skip to content

Commit

Permalink
When converting non-English texts to English, use the user's current c…
Browse files Browse the repository at this point in the history
…alibre interface language. This allows Japanese/Korean/Vietnamese characters to be correctly converted. Previously they were assumed to be Chinese. Fixes #7622 (Calibre need to switch logic when converting Unicode filename into ASCII)
  • Loading branch information
kovidgoyal committed Feb 14, 2011
2 parents 99fffae + 411adb4 commit c4f06e3
Show file tree
Hide file tree
Showing 27 changed files with 146,309 additions and 3,292 deletions.
27 changes: 27 additions & 0 deletions COPYRIGHT
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,33 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.

Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <[email protected]>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.

Files: resources/kanwadict2.db
Files: resources/itaijidict2.pickle
Copyright: 2011, Hiroshi Miura <[email protected]>
Copyright: 1992 1993 1994, Hironobu Takahashi ([email protected]),
Copyright: 1992 1993 1994, Masahiko Sato ([email protected]),
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.

Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <[email protected]>
Copyright: 2009, John Schember
Copyright: 2007, Russell Norris
Copyright: 2001, Sean M. Burke
License: GPL-3, Perl
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.

Files: src/encutils/__init__.py
Copyright: 2005-2008: Christof Hoeke
License: LGPL-3+, CC-BY-3.0
Expand Down
110 changes: 108 additions & 2 deletions setup/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
__copyright__ = '2009, Kovid Goyal <[email protected]>'
__docformat__ = 'restructuredtext en'

import os, cPickle
import os, cPickle, re, anydbm, shutil
from zlib import compress

from setup import Command, basenames
from setup import Command, basenames, __appname__

def get_opts_from_parser(parser):
def do_opt(opt):
Expand All @@ -26,6 +27,9 @@ class Resources(Command):

description = 'Compile various needed calibre resources'

KAKASI_PATH = os.path.join(Command.SRC, __appname__,
'ebooks', 'unihandecode', 'pykakasi')

def run(self, opts):
scripts = {}
for x in ('console', 'gui'):
Expand Down Expand Up @@ -101,11 +105,113 @@ def run(self, opts):
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)

self.run_kakasi(opts)

def run_kakasi(self, opts):
    """Build the pykakasi dictionary resources (kanwadict, itaijidict,
    kanadict) from their UTF-8 source files, regenerating each one only
    when its source is newer than the compiled resource."""
    self.records = {}

    # All three compiled dictionaries live under resources/localization/pykakasi
    loc = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if not os.path.exists(loc):
        os.makedirs(loc)

    # kanji -> reading database (anydbm, built via parsekdict/kanwaout)
    kanwa_src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
    kanwa_dest = self.j(loc, 'kanwadict2.db')
    if self.newer(kanwa_dest, kanwa_src):
        self.info('\tGenerating Kanwadict')
        for line in open(kanwa_src, 'r'):
            self.parsekdict(line)
        self.kanwaout(kanwa_dest)
    else:
        self.info('\tKanwadict is up to date')

    # variant-character (itaiji) -> canonical-character pickle
    itaiji_src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
    itaiji_dest = self.j(loc, 'itaijidict2.pickle')
    if self.newer(itaiji_dest, itaiji_src):
        self.info('\tGenerating Itaijidict')
        self.mkitaiji(itaiji_src, itaiji_dest)
    else:
        self.info('\tItaijidict is up to date')

    # kana -> romaji pickle
    kana_src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
    kana_dest = self.j(loc, 'kanadict2.pickle')
    if self.newer(kana_dest, kana_src):
        self.info('\tGenerating kanadict')
        self.mkkanadict(kana_src, kana_dest)
    else:
        self.info('\tKanadict is up to date')


def mkitaiji(self, src, dst):
    """Compile the itaiji (variant kanji) table from *src* into a pickled
    dict at *dst* mapping variant character -> canonical character.

    Each data line of the source consists of two \\uXXXX escapes: the
    variant glyph followed by its canonical replacement.
    """
    dic = {}
    for line in open(src, 'r'):
        line = line.decode('utf-8').strip()
        # Skip ;; comments and blank lines
        if not line or line.startswith(';;'):
            continue
        # Decode the two \uXXXX escapes into a two-character string
        pair = re.sub(r'\\u([0-9a-fA-F]{4})',
                lambda x: unichr(int(x.group(1), 16)), line)
        dic[pair[0]] = pair[1]
    # Protocol -1 is a binary pickle: the file MUST be opened in 'wb'
    # (the previous 'w' text mode corrupts the stream on Windows), and
    # using a with-block guarantees the handle is closed/flushed.
    with open(dst, 'wb') as f:
        cPickle.dump(dic, f, protocol=-1)

def mkkanadict(self, src, dst):
    """Compile the kana table from *src* into a pickled dict at *dst*
    mapping kana character -> romaji transliteration.

    Each data line of the source is 'alpha kana' separated by a single
    space.
    """
    dic = {}
    for line in open(src, 'r'):
        line = line.decode('utf-8').strip()
        # Skip ;; comments and blank lines
        if not line or line.startswith(';;'):
            continue
        alpha, kana = line.split(' ')
        dic[kana] = alpha
    # Protocol -1 is a binary pickle: open in 'wb' (the previous 'w'
    # text mode corrupts the stream on Windows) and close the handle
    # deterministically via the with-block.
    with open(dst, 'wb') as f:
        cPickle.dump(dic, f, protocol=-1)

def parsekdict(self, line):
    """Parse one line of kakasidict.utf8 ('yomi kanji', with an optional
    trailing ASCII context marker on the yomi) and record it via
    updaterec. Comment lines starting with ';;' are ignored."""
    text = line.decode('utf-8').strip()
    if text.startswith(';;'):  # comment line
        return
    yomi, kanji = text.split(' ')
    # A final character in the ASCII range is a context marker ("tail"),
    # not part of the reading itself.
    last = yomi[-1:]
    tail = ''
    if ord(last) <= ord('z'):
        tail = last
        yomi = yomi[:-1]
    self.updaterec(kanji, yomi, tail)

def updaterec(self, kanji, yomi, tail):
    """Append the (yomi, tail) reading for *kanji* to self.records,
    bucketed by the 4-digit hex code point of the kanji's first
    character (the key later used in the kanwadict database)."""
    key = '%04x' % ord(kanji[0])
    # setdefault creates the bucket dict / reading list on first sight;
    # the original's explicit re-insertion of the aliased list was a no-op.
    bucket = self.records.setdefault(key, {})
    bucket.setdefault(kanji, []).append((yomi, tail))

def kanwaout(self, out):
    """Write self.records to the anydbm database at *out*: one entry per
    hex-code-point key, each value a zlib-compressed binary pickle of
    that key's {kanji: [(yomi, tail), ...]} mapping."""
    db = anydbm.open(out, 'c')  # 'c': create the database if missing
    for key in self.records:
        db[key] = compress(cPickle.dumps(self.records[key], -1))
    db.close()


def clean(self):
    """Remove the generated resource files: the top-level pickles and
    the compiled pykakasi dictionary directory."""
    pickles = [self.j(self.RESOURCES, name + '.pickle')
            for name in ('scripts', 'recipes', 'ebook-convert-complete')]
    for path in pickles:
        if os.path.exists(path):
            os.remove(path)
    kakasi_dir = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if os.path.exists(kakasi_dir):
        shutil.rmtree(kakasi_dir)



Expand Down
4 changes: 2 additions & 2 deletions src/calibre/ebooks/conversion/plumber.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,8 @@ def __init__(self, input, output, log, report_progress=DummyReporter(),
'with "Mikhail Gorbachiov". Also, note that in '
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation used by the largest number of people will be '
'used (Chinese in the previous example).')%\
'representation based on the current calibre interface language will be '
'used.')%\
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)
Expand Down
12 changes: 6 additions & 6 deletions src/calibre/ebooks/conversion/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,9 +543,9 @@ def dump(raw, where):
html = XMLDECL_RE.sub('', html)

if getattr(self.extra_opts, 'asciiize', False):
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
html = unidecoder.decode(html)
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
html = unihandecoder.decode(html)

if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
Expand All @@ -557,10 +557,10 @@ def dump(raw, where):

unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)

return html
Expand Down
Empty file.
Loading

0 comments on commit c4f06e3

Please sign in to comment.