Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle invalid xml characters in text exporters and fix #655 #675

Merged
merged 1 commit into from
Aug 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
/// Alto 4.1 (XML) text exporter.
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
public class AltoXmlTextExporter : ITextExporter
public sealed class AltoXmlTextExporter : ITextExporter
{
private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;

private readonly Func<string, string> invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;

Expand All @@ -33,20 +33,60 @@ public class AltoXmlTextExporter : ITextExporter
private int stringCount;
private int glyphCount;

/// <inheritdoc/>
public InvalidCharStrategy InvalidCharStrategy { get; }

/// <summary>
/// Alto 4.1 (XML).
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indent">Character to use for indentation, defaults to tab.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1, string indent = "\t")
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterHandler">How to handle invalid characters.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
Func<string, string> invalidCharacterHandler)
: this(wordExtractor, pageSegmenter, scale, indentChar,
InvalidCharStrategy.Custom, invalidCharacterHandler)
{ }

/// <summary>
/// Alto 4.1 (XML).
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale = 1, string indentChar = "\t",
InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
: this(wordExtractor, pageSegmenter, scale, indentChar,
invalidCharacterStrategy, null)
{ }

private AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
InvalidCharStrategy invalidCharacterStrategy,
Func<string, string> invalidCharacterHandler)
{
this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor));
this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter));
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = scale;
indentChar = indent ?? string.Empty;
this.indentChar = indentChar ?? string.Empty;
InvalidCharStrategy = invalidCharacterStrategy;

if (invalidCharacterHandler is null)
{
this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
}
else
{
this.invalidCharacterHandler = invalidCharacterHandler;
}
}

/// <summary>
Expand All @@ -57,10 +97,7 @@ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegm
public string Get(PdfDocument document, bool includePaths = false)
{
var altoDocument = CreateAltoDocument("unknown");
var altoPages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();

altoDocument.Layout.Pages = altoPages;

altoDocument.Layout.Pages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
return Serialize(altoDocument);
}

Expand Down Expand Up @@ -128,8 +165,8 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{
Height = (float)Math.Round(page.Height * scale), // TBD
Width = (float)Math.Round(page.Width * scale), // TBD
VerticalPosition = 0f, // TBD
HorizontalPosition = 0f, // TBD
VerticalPosition = 0f, // TBD
HorizontalPosition = 0f, // TBD
ComposedBlocks = null, // TBD
GraphicalElements = null, // TBD
Illustrations = null, // TBD
Expand All @@ -141,9 +178,7 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
};

var words = page.GetWords(wordExtractor);
var blocks = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();

altoPage.PrintSpace.TextBlock = blocks;
altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();

altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();

Expand Down Expand Up @@ -222,7 +257,6 @@ private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, dou
{
textLineCount++;
var strings = textLine.Words
.Where(x => x.Text.All(XmlConvert.IsXmlChar))
.Select(w => ToAltoString(w, height)).ToArray();

return new AltoDocument.AltoTextBlockTextLine
Expand Down Expand Up @@ -252,7 +286,7 @@ private AltoDocument.AltoString ToAltoString(Word word, double height)
Width = (float)Math.Round(word.BoundingBox.Width * scale),
Glyph = glyphs,
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
Content = word.Text,
Content = invalidCharacterHandler(word.Text),
Language = null,
StyleRefs = null,
SubsContent = null,
Expand All @@ -272,7 +306,7 @@ private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, double height)
Height = (float)Math.Round(letter.GlyphRectangle.Height * scale),
Width = (float)Math.Round(letter.GlyphRectangle.Width * scale),
Gc = 1.0f,
Content = letter.Value,
Content = invalidCharacterHandler(letter.Value),
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") + "_G" + glyphCount.ToString("#00")
};
}
Expand Down Expand Up @@ -314,8 +348,8 @@ private AltoDocument.AltoDescription GetAltoDescription(string fileName)
Processings = new[] { processing },
SourceImageInformation = new AltoDocument.AltoSourceImageInformation
{
DocumentIdentifiers = new [] { documentIdentifier },
FileIdentifiers = new [] { fileIdentifier },
DocumentIdentifiers = new[] { documentIdentifier },
FileIdentifiers = new[] { fileIdentifier },
FileName = fileName
}
};
Expand All @@ -329,6 +363,7 @@ private string Serialize(AltoDocument altoDocument)
Encoding = System.Text.Encoding.UTF8,
Indent = true,
IndentChars = indentChar,
CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck,
};

using (var memoryStream = new System.IO.MemoryStream())
Expand All @@ -346,7 +381,12 @@ public static AltoDocument Deserialize(string xmlPath)
{
var serializer = new XmlSerializer(typeof(AltoDocument));

using (var reader = XmlReader.Create(xmlPath))
var settings = new XmlReaderSettings()
{
CheckCharacters = false
};

using (var reader = XmlReader.Create(xmlPath, settings))
{
return (AltoDocument)serializer.Deserialize(reader);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
/// hOCR v1.2 (HTML) text exporter.
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
public class HOcrTextExporter : ITextExporter
public sealed class HOcrTextExporter : ITextExporter
{
private const string XmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
private const string Hocrjs = "<script src='https://unpkg.com/hocrjs'></script>\n";

private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;

private readonly Func<string, string> invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;

Expand All @@ -32,16 +32,60 @@ public class HOcrTextExporter : ITextExporter
private int paraCount;
private int imageCount;

/// <inheritdoc/>
public InvalidCharStrategy InvalidCharStrategy { get; }

/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterHandler">How to handle invalid characters.</param>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
Func<string, string> invalidCharacterHandler)
: this(wordExtractor, pageSegmenter, scale, indentChar,
InvalidCharStrategy.Custom, invalidCharacterHandler)
{ }

/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale = 1, string indentChar = "\t",
InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
: this(wordExtractor, pageSegmenter, scale, indentChar,
invalidCharacterStrategy, null)
{ }

private HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
InvalidCharStrategy invalidCharacterStrategy,
Func<string, string> invalidCharacterHandler)
{
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = scale;
indentChar = indent;
this.indentChar = indentChar ?? string.Empty;
InvalidCharStrategy = invalidCharacterStrategy;

if (invalidCharacterHandler is null)
{
this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
}
else
{
this.invalidCharacterHandler = invalidCharacterHandler;
}
}

/// <summary>
Expand Down Expand Up @@ -325,7 +369,7 @@ private string GetCode(Word word, double pageHeight, int level)
}
hocr += "'";

hocr += ">" + word.Text + "</span> ";
hocr += ">" + invalidCharacterHandler(word.Text) + "</span> ";
return hocr;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
{
/// <summary>
/// Strategy a text exporter applies to characters that are invalid in its output.
/// <para>
/// The numeric values are assigned explicitly and the underlying type is <see cref="byte"/>;
/// keep existing values stable when adding new members.
/// </para>
/// </summary>
public enum InvalidCharStrategy : byte
{
/// <summary>
/// Custom strategy: the exporter uses a caller-supplied handler function
/// rather than one of the built-in strategies below.
/// </summary>
Custom = 0,

/// <summary>
/// Do not check for invalid characters.
/// </summary>
DoNotCheck = 1,

/// <summary>
/// Remove invalid characters.
/// </summary>
Remove = 2,

/// <summary>
/// Convert invalid characters to their hexadecimal representation.
/// </summary>
ConvertToHexadecimal = 3
}
}
Loading
Loading