Skip to content

Commit

Permalink
When converting non-English texts to English, use the user's current c…
Browse files Browse the repository at this point in the history
…alibre interface language. This allows Japanese/Korean/Vietnamese characters to be correctly converted. Previously they were assumed to be Chinese. Fixes #7622 (Calibre need to switch logic when converting Unicode filename into ASCII)
  • Loading branch information
kovidgoyal committed Feb 14, 2011
2 parents 99fffae + 411adb4 commit c4f06e3
Show file tree
Hide file tree
Showing 27 changed files with 146,309 additions and 3,292 deletions.
27 changes: 27 additions & 0 deletions COPYRIGHT
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,33 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.

Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <[email protected]>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.

Files: resources/kanwadict2.db
Files: resources/itaijidict2.pickle
Copyright: 2011, Hiroshi Miura <[email protected]>
Copyright: 1992 1993 1994, Hironobu Takahashi ([email protected]),
Copyright: 1992 1993 1994, Masahiko Sato ([email protected]),
Copyright: 1992 1993 1994, Yukiyoshi Kameyama, Miki Inooka, Akihiko Sasaki, Dai Ando, Junichi Okukawa,
Copyright: 1992 1993 1994, Katsushi Sato and Nobuhiro Yamagishi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.

Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <[email protected]>
Copyright: 2009, John Schember
Copyright: 2007, Russell Norris
Copyright: 2001, Sean M. Burke
License: GPL-3, Perl
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.

Files: src/encutils/__init__.py
Copyright: 2005-2008: Christof Hoeke
License: LGPL-3+, CC-BY-3.0
Expand Down
110 changes: 108 additions & 2 deletions setup/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
__copyright__ = '2009, Kovid Goyal <[email protected]>'
__docformat__ = 'restructuredtext en'

import os, cPickle
import os, cPickle, re, anydbm, shutil
from zlib import compress

from setup import Command, basenames
from setup import Command, basenames, __appname__

def get_opts_from_parser(parser):
def do_opt(opt):
Expand All @@ -26,6 +27,9 @@ class Resources(Command):

description = 'Compile various needed calibre resources'

KAKASI_PATH = os.path.join(Command.SRC, __appname__,
'ebooks', 'unihandecode', 'pykakasi')

def run(self, opts):
scripts = {}
for x in ('console', 'gui'):
Expand Down Expand Up @@ -101,11 +105,113 @@ def run(self, opts):
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)

self.run_kakasi(opts)

def run_kakasi(self, opts):
    """Build the pykakasi dictionary resources (kanwadict, itaijidict,
    kanadict) from their UTF-8 source files, regenerating each one only
    when its source is newer than the compiled resource."""
    self.records = {}

    # All three compiled dictionaries live under resources/localization/pykakasi
    loc = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if not os.path.exists(loc):
        os.makedirs(loc)

    # kanji -> reading database (anydbm, built via parsekdict/kanwaout)
    kanwa_src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
    kanwa_dest = self.j(loc, 'kanwadict2.db')
    if self.newer(kanwa_dest, kanwa_src):
        self.info('\tGenerating Kanwadict')
        for line in open(kanwa_src, 'r'):
            self.parsekdict(line)
        self.kanwaout(kanwa_dest)
    else:
        self.info('\tKanwadict is up to date')

    # variant-character (itaiji) -> canonical-character pickle
    itaiji_src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
    itaiji_dest = self.j(loc, 'itaijidict2.pickle')
    if self.newer(itaiji_dest, itaiji_src):
        self.info('\tGenerating Itaijidict')
        self.mkitaiji(itaiji_src, itaiji_dest)
    else:
        self.info('\tItaijidict is up to date')

    # kana -> romaji pickle
    kana_src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
    kana_dest = self.j(loc, 'kanadict2.pickle')
    if self.newer(kana_dest, kana_src):
        self.info('\tGenerating kanadict')
        self.mkkanadict(kana_src, kana_dest)
    else:
        self.info('\tKanadict is up to date')


def mkitaiji(self, src, dst):
    """Compile the itaiji (variant kanji) table from *src* into a pickled
    dict at *dst* mapping variant character -> canonical character.

    Each data line of the source consists of two \\uXXXX escapes: the
    variant glyph followed by its canonical replacement.
    """
    dic = {}
    for line in open(src, 'r'):
        line = line.decode('utf-8').strip()
        # Skip ;; comments and blank lines
        if not line or line.startswith(';;'):
            continue
        # Decode the two \uXXXX escapes into a two-character string
        pair = re.sub(r'\\u([0-9a-fA-F]{4})',
                lambda x: unichr(int(x.group(1), 16)), line)
        dic[pair[0]] = pair[1]
    # Protocol -1 is a binary pickle: the file MUST be opened in 'wb'
    # (the previous 'w' text mode corrupts the stream on Windows), and
    # using a with-block guarantees the handle is closed/flushed.
    with open(dst, 'wb') as f:
        cPickle.dump(dic, f, protocol=-1)

def mkkanadict(self, src, dst):
    """Compile the kana table from *src* into a pickled dict at *dst*
    mapping kana character -> romaji transliteration.

    Each data line of the source is 'alpha kana' separated by a single
    space.
    """
    dic = {}
    for line in open(src, 'r'):
        line = line.decode('utf-8').strip()
        # Skip ;; comments and blank lines
        if not line or line.startswith(';;'):
            continue
        alpha, kana = line.split(' ')
        dic[kana] = alpha
    # Protocol -1 is a binary pickle: open in 'wb' (the previous 'w'
    # text mode corrupts the stream on Windows) and close the handle
    # deterministically via the with-block.
    with open(dst, 'wb') as f:
        cPickle.dump(dic, f, protocol=-1)

def parsekdict(self, line):
    """Parse one line of kakasidict.utf8 ('yomi kanji', with an optional
    trailing ASCII context marker on the yomi) and record it via
    updaterec. Comment lines starting with ';;' are ignored."""
    text = line.decode('utf-8').strip()
    if text.startswith(';;'):  # comment line
        return
    yomi, kanji = text.split(' ')
    # A final character in the ASCII range is a context marker ("tail"),
    # not part of the reading itself.
    last = yomi[-1:]
    tail = ''
    if ord(last) <= ord('z'):
        tail = last
        yomi = yomi[:-1]
    self.updaterec(kanji, yomi, tail)

def updaterec(self, kanji, yomi, tail):
    """Append the (yomi, tail) reading for *kanji* to self.records,
    bucketed by the 4-digit hex code point of the kanji's first
    character (the key later used in the kanwadict database)."""
    key = '%04x' % ord(kanji[0])
    # setdefault creates the bucket dict / reading list on first sight;
    # the original's explicit re-insertion of the aliased list was a no-op.
    bucket = self.records.setdefault(key, {})
    bucket.setdefault(kanji, []).append((yomi, tail))

def kanwaout(self, out):
    """Write self.records to the anydbm database at *out*: one entry per
    hex-code-point key, each value a zlib-compressed binary pickle of
    that key's {kanji: [(yomi, tail), ...]} mapping."""
    db = anydbm.open(out, 'c')  # 'c': create the database if missing
    for key in self.records:
        db[key] = compress(cPickle.dumps(self.records[key], -1))
    db.close()


def clean(self):
    """Remove the generated resource files: the top-level pickles and
    the compiled pykakasi dictionary directory."""
    pickles = [self.j(self.RESOURCES, name + '.pickle')
            for name in ('scripts', 'recipes', 'ebook-convert-complete')]
    for path in pickles:
        if os.path.exists(path):
            os.remove(path)
    kakasi_dir = self.j(self.RESOURCES, 'localization', 'pykakasi')
    if os.path.exists(kakasi_dir):
        shutil.rmtree(kakasi_dir)



Expand Down
4 changes: 2 additions & 2 deletions src/calibre/ebooks/conversion/plumber.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,8 @@ def __init__(self, input, output, log, report_progress=DummyReporter(),
'with "Mikhail Gorbachiov". Also, note that in '
'cases where there are multiple representations of a character '
'(characters shared by Chinese and Japanese for instance) the '
'representation used by the largest number of people will be '
'used (Chinese in the previous example).')%\
'representation based on the current calibre interface language will be '
'used.')%\
u'\u041c\u0438\u0445\u0430\u0438\u043b '
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
)
Expand Down
12 changes: 6 additions & 6 deletions src/calibre/ebooks/conversion/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,9 +543,9 @@ def dump(raw, where):
html = XMLDECL_RE.sub('', html)

if getattr(self.extra_opts, 'asciiize', False):
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
html = unidecoder.decode(html)
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
html = unihandecoder.decode(html)

if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
Expand All @@ -557,10 +557,10 @@ def dump(raw, where):

unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
if unsupported_unicode_chars:
from calibre.ebooks.unidecode.unidecoder import Unidecoder
unidecoder = Unidecoder()
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unidecoder.decode(char)
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)

return html
Expand Down
Empty file.
Loading

0 comments on commit c4f06e3

Please sign in to comment.