Skip to content

Commit

Permalink
PDF metadata: Do not crash when reading malformed PDF files
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed May 30, 2012
1 parent d805a93 commit e2148e8
Show file tree
Hide file tree
Showing 19 changed files with 86 additions and 2,552 deletions.
3 changes: 3 additions & 0 deletions session.vim
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
" Project wide builtins
let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"

" Include directories for C modules
let g:syntastic_c_include_dirs = [ '/usr/include/podofo']

fun! CalibreLog()
" Setup buffers to edit the calibre changelog and version info prior to
" making a release.
Expand Down
42 changes: 1 addition & 41 deletions setup/build_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
__copyright__ = '2009, Kovid Goyal <[email protected]>'
__docformat__ = 'restructuredtext en'

import os, socket, struct, subprocess, glob
import os, socket, struct, subprocess
from distutils.spawn import find_executable

from PyQt4 import pyqtconfig
Expand Down Expand Up @@ -84,7 +84,6 @@ def consolidate(envvar, default):
ft_libs = []
jpg_libs = []
jpg_lib_dirs = []
poppler_objs = []
fc_inc = '/usr/include/fontconfig'
fc_lib = '/usr/lib'
podofo_inc = '/usr/include/podofo'
Expand Down Expand Up @@ -114,12 +113,7 @@ def consolidate(envvar, default):
jpg_libs = ['jpeg']
ft_lib_dirs = [sw_lib_dir]
ft_libs = ['freetype']
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))

poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.7.6')]
magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')]
magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_']
Expand All @@ -128,13 +122,6 @@ def consolidate(envvar, default):
elif isosx:
fc_inc = '/sw/include/fontconfig'
fc_lib = '/sw/lib'
poppler = glob.glob('/sw/build/poppler-*')[-1]
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'{0}/poppler:{0}'.format(poppler))
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']
popplerqt4_lib_dirs = poppler_lib_dirs
podofo_inc = '/sw/podofo'
podofo_lib = '/sw/lib'
magick_inc_dirs = consolidate('MAGICK_INC',
Expand All @@ -147,22 +134,15 @@ def consolidate(envvar, default):
png_libs = ['png12']
else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')

# Library directories
poppler_lib_dirs = popplerqt4_lib_dirs = pkgconfig_lib_dirs('poppler', 'POPPLER_LIB_DIR',
'/usr/lib')
png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib')
magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib')

# Libraries
poppler_libs = pkgconfig_libs('poppler', '', '')
if not poppler_libs:
poppler_libs = ['poppler']
magick_libs = pkgconfig_libs('MagickWand', '', '')
if not magick_libs:
magick_libs = ['MagickWand', 'MagickCore']
Expand All @@ -176,26 +156,6 @@ def consolidate(envvar, default):
'Try setting the FC_INC_DIR and FC_LIB_DIR environment '
'variables.')


poppler_error = None
poppler_cflags = ['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
if not poppler_inc_dirs or not os.path.exists(
os.path.join(poppler_inc_dirs[0], 'OutputDev.h')):
poppler_error = \
('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and',
' POPPLER_LIB_DIR environment variables. calibre requires '
' the poppler XPDF headers. If your distro does not '
' include them you will have to re-compile poppler '
' by hand with --enable-xpdf-headers')
else:
lh = os.path.join(poppler_inc_dirs[0], 'Link.h')
if 'class AnnotLink' not in open(lh, 'rb').read():
poppler_cflags.append('-DPOPPLER_OLD_LINK_TYPE')
ph = os.path.join(poppler_inc_dirs[0], 'Page.h')
if 'getLinks(Catalog' in open(ph, 'rb').read():
poppler_cflags.append('-DPOPPLER_PRE_20')

magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')):
Expand Down
5 changes: 2 additions & 3 deletions setup/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

__all__ = [
'pot', 'translations', 'get_translations', 'iso639',
'build', 'build_pdf2xml', 'server',
'build', 'server',
'gui',
'develop', 'install',
'kakasi', 'coffee', 'resources',
Expand All @@ -31,9 +31,8 @@
get_translations = GetTranslations()
iso639 = ISO639()

from setup.extensions import Build, BuildPDF2XML
from setup.extensions import Build
build = Build()
build_pdf2xml = BuildPDF2XML()

from setup.server import Server
server = Server()
Expand Down
67 changes: 5 additions & 62 deletions setup/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,11 @@
from PyQt4.pyqtconfig import QtGuiModuleMakefile

from setup import Command, islinux, isbsd, isosx, SRC, iswindows
from setup.build_environment import (fc_inc, fc_lib, chmlib_inc_dirs,
fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc,
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE,
QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk,
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs,
magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs,
jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
icu_lib_dirs, poppler_cflags)
from setup.build_environment import (fc_inc, fc_lib, chmlib_inc_dirs, fc_error,
podofo_inc, podofo_lib, podofo_error, pyqt, OSX_SDK, NMAKE, QMAKE,
msvc, MT, win_inc, win_lib, win_ddk, magick_inc_dirs, magick_lib_dirs,
magick_libs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
icu_lib_dirs)
MT
isunix = islinux or isosx or isbsd

Expand Down Expand Up @@ -51,7 +48,6 @@ def __init__(self, name, sources, **kwargs):

reflow_sources = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.cpp'))
reflow_headers = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.h'))
reflow_error = poppler_error if poppler_error else magick_error

pdfreflow_libs = []
if iswindows:
Expand Down Expand Up @@ -107,16 +103,6 @@ def __init__(self, name, sources, **kwargs):
inc_dirs=magick_inc_dirs
),

Extension('pdfreflow',
reflow_sources,
headers=reflow_headers,
libraries=poppler_libs+magick_libs+png_libs+ft_libs+jpg_libs+pdfreflow_libs,
lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs+ft_lib_dirs+jpg_lib_dirs,
inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs,
error=reflow_error,
cflags=poppler_cflags
),

Extension('lzx',
['calibre/utils/lzx/lzxmodule.c',
'calibre/utils/lzx/compressor.c',
Expand Down Expand Up @@ -445,48 +431,5 @@ def clean(self):
shutil.rmtree(build_dir)


class BuildPDF2XML(Command):
    ''' Build command that compiles the PDF reflow C++ sources with the
    PDF2XML preprocessor define and links them into a stand-alone pdf2xml
    executable. '''

    description = 'Build command line pdf2xml utility'

    def run(self, opts):
        ''' Compile every file in reflow_sources (except python.cpp) into an
        object file and link the objects into the pdf2xml binary.

        opts is not used by this method. '''
        # Final location of the linked binary. Both paths are hard-coded,
        # developer-specific locations.
        dest = os.path.expanduser('~/bin/pdf2xml')
        if iswindows:
            dest = r'C:\cygwin\home\kovid\sw\bin\pdf2xml.exe'
        # Directory that receives the intermediate object files.
        # NOTE(review): self.j/self.d/self.b look like path join/dirname/
        # basename helpers from Command -- confirm.
        odest = self.j(self.d(self.SRC), 'build', 'objects', 'pdf2xml')
        if not os.path.exists(odest):
            os.makedirs(odest)

        objects = []
        for src in reflow_sources:
            # python.cpp is left out of the stand-alone tool (presumably it
            # holds the Python module bindings -- confirm).
            if src.endswith('python.cpp'):
                continue
            obj = self.j(odest, self.b(src+('.obj' if iswindows else '.o')))
            # NOTE(review): self.newer() presumably reports whether obj is
            # missing or older than the listed sources -- confirm.
            if self.newer(obj, [src]+reflow_headers):
                # Default (non-Windows) compile command. The include paths
                # are hard-coded here instead of coming from
                # poppler_inc_dirs/magick_inc_dirs. cxx is not defined in the
                # visible part of this file.
                cmd = [cxx, '-pthread', '-pedantic', '-ggdb', '-c', '-Wall', '-I/usr/include/poppler',
                        '-I/usr/include/ImageMagick',
                        '-DPDF2XML', '-o', obj, src]
                if iswindows:
                    # On Windows the command is rebuilt from scratch with
                    # MSVC style switches.
                    cmd = [cxx, '/c', '/MD', '/W3', '/EHsc', '/Zi', '/DPDF2XML']
                    cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs]
                    cmd += ['/Fo'+obj, src]
                self.info(*cmd)
                self.check_call(cmd)
            # Every object takes part in the link step, whether or not it was
            # recompiled in this run.
            objects.append(obj)

        if self.newer(dest, objects):
            # Default (non-Windows) link command; note that it invokes g++
            # directly rather than cxx.
            cmd = ['g++', '-ggdb', '-o', dest]+objects+['-lpoppler', '-lMagickWand',
                    '-lpng', '-lpthread']
            if iswindows:
                # On Windows the link command is rebuilt around msvc.linker.
                cmd = [msvc.linker] + '/INCREMENTAL:NO /DEBUG /NODEFAULTLIB:libcmt.lib'.split()
                cmd += ['/LIBPATH:'+x for x in magick_lib_dirs+poppler_lib_dirs]
                cmd += [x+'.lib' for x in
                        png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs]
                cmd += ['/OUT:'+dest] + objects
            self.info(*cmd)
            self.check_call(cmd)

        self.info('Binary installed as', dest)



2 changes: 2 additions & 0 deletions setup/installer/linux/freeze2.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
MAGICK_PREFIX = '/usr'
binary_includes = [
'/usr/bin/pdftohtml',
'/usr/bin/pdfinfo',
'/usr/bin/pdftoppm',
'/usr/lib/libwmflite-0.2.so.7',
'/usr/lib/liblcms.so.1',
'/usr/lib/liblzma.so.0',
Expand Down
3 changes: 2 additions & 1 deletion setup/installer/osx/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,8 @@ def add_poppler(self):
info('\nAdding poppler')
for x in ('libpoppler.25.dylib',):
self.install_dylib(os.path.join(SW, 'lib', x))
self.install_dylib(os.path.join(SW, 'bin', 'pdftohtml'), False)
for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
self.install_dylib(os.path.join(SW, 'bin', x), False)

@flush
def add_libjpeg(self):
Expand Down
3 changes: 2 additions & 1 deletion setup/installer/windows/freeze.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ def ignore_lib(root, items):

print '\tAdding misc binary deps'
bindir = os.path.join(SW, 'bin')
shutil.copy2(os.path.join(bindir, 'pdftohtml.exe'), self.base)
for x in ('pdftohtml', 'pdfinfo', 'pdftoppm'):
shutil.copy2(os.path.join(bindir, x+'.exe'), self.base)
for pat in ('*.dll',):
for f in glob.glob(os.path.join(bindir, pat)):
ok = True
Expand Down
4 changes: 1 addition & 3 deletions setup/installer/windows/notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,7 @@ In Cmake: disable GTK, Qt, OPenjpeg, cpp, lcms, gtk_tests, qt_tests. Enable qt4,

NOTE: poppler must be built as a static library, unless you build the qt4 bindings

Now do the same for the pdftohtml project

cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/pdftohtml.exe ../../bin/
cp build/utils/Release/*.exe ../../bin/

podofo
Expand Down
1 change: 0 additions & 1 deletion src/calibre/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def __init__(self):
'podofo',
'cPalmdoc',
'fontconfig',
'pdfreflow',
'progress_indicator',
'chmlib',
'chm_extra',
Expand Down
85 changes: 68 additions & 17 deletions src/calibre/ebooks/metadata/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,80 @@
'''Read meta information from PDF files'''

#import re
import os, subprocess, shutil
from functools import partial

from calibre import prints
from calibre.constants import plugins
from calibre.constants import iswindows
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation, string_to_authors

pdfreflow, pdfreflow_error = plugins['pdfreflow']
from calibre.utils.ipc.simple_worker import fork_job

#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')

def _parse_pdfinfo_output(raw):
    ''' Parse the unicode text printed by pdfinfo into a dict.

    Each line is split at its first colon into a field name and a value. The
    value is stripped of surrounding whitespace. Lines without a colon, or
    with an empty field name or an empty value, are ignored. '''
    ans = {}
    for line in raw.splitlines():
        field, sep, val = line.partition(u':')
        val = val.strip()
        if sep and field and val:
            ans[field] = val
    return ans


def read_info(outputdir, get_cover):
    ''' Read info dict and cover from a pdf file named src.pdf in outputdir.
    Note that this function changes the cwd to outputdir and is therefore not
    thread safe. Run it using fork_job. This is necessary as there is no safe
    way to pass unicode paths via command line arguments. This also ensures
    that if poppler crashes, no stale file handles are left for the original
    file, only for src.pdf.

    :param outputdir: Directory that contains src.pdf. If a cover is
        requested, pdftoppm is asked to write it here using the output root
        name ``cover``.
    :param get_cover: If true, also run pdftoppm to render a JPEG cover.
    :return: A dict mapping pdfinfo field names to their values, or None if
        pdfinfo could not be run, exited with an error or produced output
        that is not valid UTF-8. A failure to render the cover is only
        reported, it does not affect the returned dict. '''

    from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
    os.chdir(outputdir)
    # pdfinfo and pdftoppm are looked up next to the pdftohtml binary
    base = os.path.dirname(PDFTOHTML)
    suffix = '.exe' if iswindows else ''
    pdfinfo = os.path.join(base, 'pdfinfo') + suffix
    pdftoppm = os.path.join(base, 'pdftoppm') + suffix

    try:
        raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', 'src.pdf'])
    except subprocess.CalledProcessError as e:
        prints('pdfinfo errored out with return code: %d'%e.returncode)
        return None
    except OSError as e:
        # Raised when the executable is missing or cannot be started. Treat
        # it like any other pdfinfo failure instead of crashing the worker.
        prints('Failed to run pdfinfo:', repr(e))
        return None
    try:
        raw = raw.decode('utf-8')
    except UnicodeDecodeError:
        prints('pdfinfo returned no UTF-8 data')
        return None

    ans = _parse_pdfinfo_output(raw)

    if get_cover:
        try:
            subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
                'src.pdf', 'cover'])
        except subprocess.CalledProcessError as e:
            prints('pdftoppm errored out with return code: %d'%e.returncode)
        except OSError as e:
            # A missing pdftoppm must not prevent returning the metadata
            prints('Failed to run pdftoppm:', repr(e))

    return ans

def get_metadata(stream, cover=True):
if pdfreflow is None:
raise RuntimeError(pdfreflow_error)
stream.seek(0)
raw = stream.read()
#isbn = _isbn_pat.search(raw)
#if isbn is not None:
# isbn = isbn.group(1).replace('-', '').replace(' ', '')
info = pdfreflow.get_metadata(raw, cover)
with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
stream.seek(0)
with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
shutil.copyfileobj(stream, f)
res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath,
bool(cover)))
info = res['result']
with open(res['stdout_stderr'], 'rb') as f:
raw = f.read().strip()
if raw:
prints(raw)
if not info:
raise ValueError('Could not read info dict from PDF')
covpath = os.path.join(pdfpath, 'cover.jpg')
cdata = None
if cover and os.path.exists(covpath):
with open(covpath, 'rb') as f:
cdata = f.read()

title = info.get('Title', None)
au = info.get('Author', None)
if au is None:
Expand All @@ -46,12 +101,8 @@ def get_metadata(stream, cover=True):
if subject:
mi.tags.insert(0, subject)

if cover and 'cover' in info:
data = info['cover']
if data is None:
prints(title, 'has no pages, cover extraction impossible.')
else:
mi.cover_data = ('png', data)
if cdata:
mi.cover_data = ('jpeg', cdata)

return mi

Expand Down
Loading

0 comments on commit e2148e8

Please sign in to comment.