You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I have registered a custom text extractor (PdfPig).
However, it doesn't hit any of my breakpoints, and it doesn't seem to return any results.
I have registered it as below:
using Examine;
using Examine.LuceneEngine.Providers;
using Umbraco.Core;
using Umbraco.Core.Composing;
using UmbracoExamine.PDF;
using UmbracoExaminePDF.Extractors;
namespace UmbracoExaminePDF.Composers
{
/// <summary>
/// Registers <see cref="PdfPigTextExtractor"/> as the <see cref="IPdfTextExtractor"/>
/// implementation and appends <see cref="ExaminePdfComponent"/> to the component collection.
/// </summary>
/// <remarks>
/// This class has the same simple name as the composer shipped in the UmbracoExamine.PDF
/// package. Inside this namespace the unqualified name binds to this class, so the
/// package composer must be referenced by its fully qualified name; otherwise the
/// attribute would point at this class itself and give no ordering guarantee.
/// </remarks>
[ComposeAfter(typeof(UmbracoExamine.PDF.ExaminePdfComposer))] //this must execute after the package's ExaminePdfComposer
public class ExaminePdfComposer : ComponentComposer<ExaminePdfComponent>, IUserComposer
{
    /// <summary>
    /// Appends the component and replaces the registered PDF text extractor.
    /// </summary>
    /// <param name="composition">The composition being built.</param>
    public override void Compose(Composition composition)
    {
        // The base implementation is what appends ExaminePdfComponent to the
        // component collection; skipping it means the component never runs.
        base.Compose(composition);

        // Runs after the package composer, so this registration replaces its extractor.
        composition.RegisterUnique<IPdfTextExtractor, PdfPigTextExtractor>();
    }
}
/// <summary>
/// Component that adds a single searcher spanning the external index and the PDF index.
/// </summary>
public class ExaminePdfComponent : IComponent
{
    private readonly IExamineManager _examineManager;

    /// <summary>
    /// Stores the Examine manager used to look up indexes and add the searcher.
    /// </summary>
    /// <param name="examineManager">The Examine manager to use.</param>
    public ExaminePdfComponent(IExamineManager examineManager) => _examineManager = examineManager;

    /// <summary>
    /// Adds a searcher named "MultiSearcher" over both indexes, but only when
    /// both of them can be retrieved from the manager.
    /// </summary>
    public void Initialize()
    {
        // Bail out as soon as either index is unavailable.
        if (!_examineManager.TryGetIndex(Constants.UmbracoIndexes.ExternalIndexName, out var contentIndex))
        {
            return;
        }

        if (!_examineManager.TryGetIndex(PdfIndexConstants.PdfIndexName, out var pdfDocumentIndex))
        {
            return;
        }

        // Both indexes exist: expose them through one combined searcher.
        var searchedIndexes = new IIndex[] { contentIndex, pdfDocumentIndex };
        _examineManager.AddSearcher(new MultiIndexSearcher("MultiSearcher", searchedIndexes));
    }

    /// <summary>
    /// No work is performed on termination.
    /// </summary>
    public void Terminate()
    {
    }
}
}
And the PdfPig extractor is pretty simple:
using System.IO;
using System.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UmbracoExamine.PDF;
namespace UmbracoExaminePDF.Extractors
{
/// <summary>
/// <see cref="IPdfTextExtractor"/> implementation backed by the PdfPig library.
/// https://github.com/UglyToad/PdfPig
/// </summary>
public class PdfPigTextExtractor : IPdfTextExtractor
{
    /// <summary>
    /// Reads the PDF from the given stream and returns the text of every page,
    /// with a line terminator appended after each page's text.
    /// </summary>
    /// <param name="pdfFileStream">Stream containing the PDF document.</param>
    /// <returns>The concatenated page text.</returns>
    public string GetTextFromPdf(Stream pdfFileStream)
    {
        var extractedText = new StringBuilder();

        using (var pdf = PdfDocument.Open(pdfFileStream))
        {
            foreach (var currentPage in pdf.GetPages())
            {
                // Page text followed by the default line terminator.
                extractedText.Append(currentPage.Text).AppendLine();
            }
        }

        return extractedText.ToString();
    }
}
}
Any help would be appreciated
The text was updated successfully, but these errors were encountered:
I'll take a look at this and get back to you. PDFPig looks really promising, I have been putting a ton of effort into adding text extraction to PDFSharp, and this seems to do a pretty decent job out of the box, and it's Apache 2.0 licensed.
This works for us. We RegisterUnique in a composer that ComposeAfters ExaminePdfComposer, and our extractor runs. The code is trivial, but let me know if it's needed.
I have registered a custom text extractor (PdfPig).
However, it doesn't hit any of my breakpoints, and it doesn't seem to return any results.
I have registered it as below:
And the PdfPig extractor is pretty simple:
Any help would be appreciated
The text was updated successfully, but these errors were encountered: