Skip to content

Commit

Permalink
Address #672 to ignore errors while reading the descriptor file in CidFontFactory
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Aug 5, 2023
1 parent 8a82500 commit 9aaf20c
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 30 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
74 changes: 74 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/OpenTypeFontTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using Xunit;

namespace UglyToad.PdfPig.Tests.Integration
{
/// <summary>
/// Integration tests for documents that embed OpenType fonts.
/// </summary>
public class OpenTypeFontTests
{
    [Fact]
    public void Issue672()
    {
        // NB: The issue is actually not fully fixed. The changes only allow the document
        // to be parsed and its text to be read without error; the embedded font data is
        // still not properly parsed.
        // It seems the font bytes are incorrectly parsed using the TrueTypeFontParser
        // and are actually parsable with CompactFontFormatParser, though with some errors.
        var lines = GetFirstPageTextLines("Why.does.this.not.work");

        Assert.Equal(3, lines.Length);

        Assert.Equal("THIS TEST SEEMS TO BREAK THE PARSER....", lines[0]);
        Assert.Equal("This is just some test text.", lines[1]);
        Assert.Equal("SO DOES THIS", lines[2]);
    }

    [Fact]
    public void Issue672ok()
    {
        var lines = GetFirstPageTextLines("Test.Doc");

        Assert.Equal(4, lines.Length);

        Assert.Equal("This is just a bunch of boring text...", lines[0]);
        Assert.Equal("THIS IS SOME SEMPLICITA PRO FONT", lines[1]);
        Assert.Equal("Hopefully font that are not embedded on the server.", lines[2]);
        Assert.Equal("And a bit of Verdana for good measure.", lines[3]);
    }

    [Fact]
    public void So74165171()
    {
        // https://stackoverflow.com/questions/74165171/embedded-opentype-cff-font-in-a-pdf-shows-strange-behaviour-in-some-viewers

        // Adding this test case as the OpenType font is correctly parsed using TrueTypeFontParser.
        // It seems there are further issues with the extracted text (also the case in Acrobat Reader).
        // Out of scope for the moment.

        using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("test-2_so_74165171")))
        {
            var page = document.GetPage(1);

            var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters).ToArray();

            Assert.Equal(2, words.Length);
        }
    }

    /// <summary>
    /// Opens the named integration document, groups the letters of page 1 into words with
    /// <see cref="NearestNeighbourWordExtractor"/>, segments them into blocks with
    /// <see cref="DocstrumBoundingBoxes"/> and returns the text of every line in block order.
    /// </summary>
    /// <param name="documentName">Name passed to <c>IntegrationHelpers.GetDocumentPath</c>.</param>
    /// <returns>The text of each detected text line on the first page.</returns>
    private static string[] GetFirstPageTextLines(string documentName)
    {
        using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath(documentName)))
        {
            var page = document.GetPage(1);

            var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);

            // Materialise inside the using block so nothing is enumerated lazily
            // after the document has been disposed.
            return DocstrumBoundingBoxes.Instance.GetBlocks(words)
                .SelectMany(block => block.TextLines)
                .Select(line => line.Text)
                .ToArray();
        }
    }
}
}
6 changes: 5 additions & 1 deletion src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,11 @@ private static PdfDocument OpenDocument(

pdfScanner.UpdateEncryptionHandler(encryptionHandler);

var cidFontFactory = new CidFontFactory(pdfScanner, filterProvider);
var cidFontFactory = new CidFontFactory(
parsingOptions.Logger,
pdfScanner,
filterProvider);

var encodingReader = new EncodingReader(pdfScanner);

var type0Handler = new Type0FontHandler(
Expand Down
46 changes: 17 additions & 29 deletions src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,24 @@
using PdfPig.Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using UglyToad.PdfPig.Logging;
using Util;

internal class CidFontFactory
{
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner pdfScanner;
private readonly ILog logger;

public CidFontFactory(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
{
this.logger = log;
this.pdfScanner = pdfScanner;
this.filterProvider = filterProvider;
}

public ICidFont Generate(DictionaryToken dictionary)
{
{
var type = dictionary.GetNameOrDefault(NameToken.Type);
if (!NameToken.Font.Equals(type))
{
Expand All @@ -50,7 +53,15 @@ public ICidFont Generate(DictionaryToken dictionary)
descriptor = FontDescriptorFactory.Generate(descriptorDictionary, pdfScanner);
}

var fontProgram = ReadDescriptorFile(descriptor);
ICidFontProgram fontProgram = null;
try
{
fontProgram = ReadDescriptorFile(descriptor);
}
catch (Exception ex)
{
logger.Error($"Invalid descriptor in CID font named '{descriptor?.FontName}': {ex.Message}.");
}

var baseFont = dictionary.GetNameOrDefault(NameToken.BaseFont);

Expand All @@ -74,25 +85,7 @@ public ICidFont Generate(DictionaryToken dictionary)

private bool TryGetFontDescriptor(DictionaryToken dictionary, out DictionaryToken descriptorDictionary)
{
descriptorDictionary = null;

if (!dictionary.TryGet(NameToken.FontDescriptor, out var baseValue))
{
return false;
}

try
{
var descriptor = DirectObjectFinder.Get<DictionaryToken>(baseValue, pdfScanner);

descriptorDictionary = descriptor;
}
catch
{
return false;
}

return true;
return dictionary.TryGet(NameToken.FontDescriptor, pdfScanner, out descriptorDictionary);
}

private ICidFontProgram ReadDescriptorFile(FontDescriptor descriptor)
Expand Down Expand Up @@ -267,14 +260,9 @@ private CharacterIdentifierSystemInfo GetSystemInfo(DictionaryToken dictionary)
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
}

if (cidEntry is DictionaryToken cidDictionary)
{

}
else
if (!(cidEntry is DictionaryToken cidDictionary))
{
cidDictionary =
DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
cidDictionary = DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
}

var registry = SafeKeyAccess(cidDictionary, NameToken.Registry);
Expand Down

0 comments on commit 9aaf20c

Please sign in to comment.