Skip to content

Commit

Permalink
Updated OCR service with language support
Browse files Browse the repository at this point in the history
Significant changes include:
- Added support for BCP-47 language codes in the OCR service
- Introduced a new OcrOptions record to encapsulate options for the OCR process, including language and accuracy preference
- Updated RecognizeTextAsync method to accept an instance of OcrOptions
- Implemented retrieval of supported languages in each platform-specific implementation
- Reordered the properties of the OcrResult.OcrElement class alphabetically for better readability
- Enhanced error handling by throwing exceptions when unsupported languages are used
  • Loading branch information
kfrancis committed Apr 9, 2024
1 parent 64d2e28 commit 00f4e19
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 74 deletions.
2 changes: 1 addition & 1 deletion samples/Plugin.Maui.Feature.Sample/MainPage.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,6 @@ private async Task<OcrResult> ProcessPhoto(FileResult photo)
await sourceStream.ReadAsync(imageData, cancellationTokenSource.Token);

// Process the image data using the OCR service
return await _ocr.RecognizeTextAsync(imageData, TryHardSwitch.IsToggled, cancellationTokenSource.Token);
return await _ocr.RecognizeTextAsync(imageData, new OcrOptions("pt-BR", TryHardSwitch.IsToggled), cancellationTokenSource.Token);
}
}
45 changes: 35 additions & 10 deletions src/Plugin.Maui.OCR/Abstractions/IOcrService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ namespace Plugin.Maui.OCR;
/// </summary>
public interface IOcrService
{
/// <summary>
/// BCP-47 language codes supported by the OCR service.
/// </summary>
IReadOnlyCollection<string> SupportedLanguages { get; }

/// <summary>
/// Initialize the OCR on the platform
/// </summary>
Expand All @@ -18,9 +23,29 @@ public interface IOcrService
/// <param name="tryHard">True to try and tell the API to be more accurate, otherwise just be fast.</param>
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
/// <exception cref="InvalidOperationException"></exception>
/// <exception cref="ArgumentException"></exception>
Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard = false, CancellationToken ct = default);

/// <summary>
/// Takes an image and returns the text found in the image.
/// </summary>
/// <param name="imageData">The image data</param>
/// <param name="options">The options for OCR</param>
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
/// <exception cref="InvalidOperationException"></exception>
/// <exception cref="ArgumentException"></exception>
Task<OcrResult> RecognizeTextAsync(byte[] imageData, OcrOptions options, CancellationToken ct = default);
}

/// <summary>
/// The options for OCR, passed to <c>RecognizeTextAsync</c> alongside the image data.
/// </summary>
/// <param name="Language">The BCP-47 language code to recognize; defaults to <c>null</c>, meaning no specific language is requested.</param>
/// <param name="TryHard">True to try and tell the API to be more accurate, otherwise just be fast. Defaults to <c>false</c>.</param>
public record OcrOptions(string? Language = null, bool TryHard = false);

/// <summary>
/// The result of an OCR operation.
/// </summary>
Expand Down Expand Up @@ -57,28 +82,28 @@ public class OcrElement
public float Confidence { get; set; }

/// <summary>
/// The text of the element.
/// The height of the element.
/// </summary>
public string Text { get; set; }
public int Height { get; set; }

/// <summary>
/// The X coordinates of the element.
/// The text of the element.
/// </summary>
public int X { get; set; }
public string Text { get; set; }

/// <summary>
/// The Y coordinates of the element.
/// The width of the element.
/// </summary>
public int Y { get; set; }
public int Width { get; set; }

/// <summary>
/// The height of the element.
/// The X coordinates of the element.
/// </summary>
public int Height { get; set; }
public int X { get; set; }

/// <summary>
/// The width of the element.
/// The Y coordinates of the element.
/// </summary>
public int Width { get; set; }
public int Y { get; set; }
}
}
27 changes: 17 additions & 10 deletions src/Plugin.Maui.OCR/OcrImplementation.android.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ namespace Plugin.Maui.OCR;

internal partial class OcrImplementation : IOcrService
{
// Not implemented for this platform yet: reading the property always throws.
public IReadOnlyCollection<string> SupportedLanguages
{
    get { throw new NotImplementedException(); }
}

public static OcrResult ProcessOcrResult(Java.Lang.Object result)
{
var ocrResult = new OcrResult();
Expand Down Expand Up @@ -64,6 +66,20 @@ public Task InitAsync(System.Threading.CancellationToken ct = default)
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard = false, System.Threading.CancellationToken ct = default)
{
    // Legacy overload: wrap the flag in an OcrOptions (no language specified)
    // and delegate to the options-based overload.
    var options = new OcrOptions(TryHard: tryHard);
    var result = await RecognizeTextAsync(imageData, options, ct);
    return result;
}

/// <summary>
/// Wraps a Google Play Services task in an awaitable .NET task.
/// </summary>
/// <param name="task">The Android task to observe.</param>
/// <returns>The task exposed by the completion source handed to the completion listener.</returns>
private static Task<Java.Lang.Object> ToAwaitableTask(global::Android.Gms.Tasks.Task task)
{
    var completion = new TaskCompletionSource<Java.Lang.Object>();

    // The listener receives the completion source; presumably it completes it
    // when the Android task finishes (TaskCompleteListener is defined elsewhere).
    task.AddOnCompleteListener(new TaskCompleteListener(completion));

    return completion.Task;
}

public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, OcrOptions options, System.Threading.CancellationToken ct = default)
{
var image = BitmapFactory.DecodeByteArray(imageData, 0, imageData.Length);
using var inputImage = InputImage.FromBitmap(image, 0);
Expand All @@ -77,7 +93,7 @@ public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard =

try
{
if (tryHard)
if (options.TryHard)
{
// For more accurate results, use the cloud-based recognizer (requires internet).
textScanner = TextRecognition.GetClient(new TextRecognizerOptions.Builder()
Expand Down Expand Up @@ -118,15 +134,6 @@ public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard =
}
}

/// <summary>
/// Wraps a Google Play Services task in an awaitable .NET task.
/// </summary>
/// <param name="task">The Android task to observe.</param>
/// <returns>The task exposed by the completion source handed to the completion listener.</returns>
private static Task<Java.Lang.Object> ToAwaitableTask(global::Android.Gms.Tasks.Task task)
{
    var completion = new TaskCompletionSource<Java.Lang.Object>();

    // The listener receives the completion source; presumably it completes it
    // when the Android task finishes (TaskCompleteListener is defined elsewhere).
    task.AddOnCompleteListener(new TaskCompleteListener(completion));

    return completion.Task;
}

public class OnFailureListener : Java.Lang.Object, IOnFailureListener
{
public void OnFailure(Java.Lang.Exception e)
Expand Down
165 changes: 114 additions & 51 deletions src/Plugin.Maui.OCR/OcrImplementation.macios.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
using CoreFoundation;
using CoreGraphics;
using Foundation;
using UIKit;
Expand All @@ -10,6 +9,9 @@ partial class OcrImplementation : IOcrService
{
private static readonly object s_initLock = new();
private bool _isInitialized;
// Backing store for SupportedLanguages. It starts as an empty collection so the
// property never returns null before InitAsync has run, or when InitAsync's OS
// version check leaves it unassigned.
private IReadOnlyCollection<string> _supportedLanguages = Array.Empty<string>();

/// <summary>
/// BCP-47 language codes supported by the OCR service.
/// Empty until <c>InitAsync</c> has populated it.
/// </summary>
public IReadOnlyCollection<string> SupportedLanguages => _supportedLanguages;

/// <summary>
/// Initialize the OCR on the platform
Expand All @@ -25,6 +27,30 @@ public Task InitAsync(CancellationToken ct = default)

// Perform any necessary initialization here.
// Example: Loading models, setting up resources, etc.
// Query the languages the text recognizer supports, where the OS exposes that API.
// NOTE(review): the macOS minimum here (14.0.0) differs from the one used when a
// language is applied in RecognizeTextAsync (10.15.2) - confirm which is intended.
if (OperatingSystem.IsIOSVersionAtLeast(14, 2) || OperatingSystem.IsMacOSVersionAtLeast(14, 0, 0))
{
    // The request is only used to ask for its supported languages and is never
    // performed, so the completion handler is a no-op. The previous version created a
    // TaskCompletionSource and a cancellation registration here that nothing awaited
    // or disposed.
    using var recognizeTextRequest = new VNRecognizeTextRequest((_, _) => { });

    var supportedLangs = recognizeTextRequest.GetSupportedRecognitionLanguages(out var langError);
    if (langError != null)
    {
        // A specific exception type instead of the bare Exception; callers catching
        // Exception are unaffected.
        throw new InvalidOperationException(langError.LocalizedDescription);
    }

    _supportedLanguages = supportedLangs.Select(ns => (string)ns).ToList().AsReadOnly();
}

return Task.CompletedTask;
}
Expand All @@ -36,7 +62,71 @@ public Task InitAsync(CancellationToken ct = default)
/// <param name="tryHard">True to try and tell the API to be more accurate, otherwise just be fast.</param>
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
/// <exception cref="InvalidOperationException"></exception>
/// <exception cref="ArgumentException"></exception>
public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard = false, CancellationToken ct = default)
{
    // Legacy overload: wrap the flag in an OcrOptions (no language specified)
    // and delegate to the options-based overload.
    var options = new OcrOptions(TryHard: tryHard);
    var result = await RecognizeTextAsync(imageData, options, ct);
    return result;
}

/// <summary>
/// Converts the text observations of a finished Vision request into an <c>OcrResult</c>.
/// </summary>
/// <param name="request">The request whose results are read.</param>
/// <param name="imageSize">Size of the source image, used to scale the normalized bounding boxes.</param>
/// <returns>
/// A result with <c>Success</c> set to false when there are no observations; otherwise the
/// collected text, lines and elements with <c>Success</c> set to true.
/// </returns>
private static OcrResult ProcessRecognitionResults(VNRequest request, CGSize imageSize)
{
    var result = new OcrResult();
    var textObservations = request.GetResults<VNRecognizedTextObservation>();

    // Nothing recognized: report failure with an otherwise untouched result.
    if (textObservations == null || textObservations.Length == 0)
    {
        result.Success = false;
        return result;
    }

    foreach (var textObservation in textObservations)
    {
        var best = textObservation.TopCandidates(1).FirstOrDefault();
        if (best == null)
        {
            continue;
        }

        result.AllText += " " + best.String;
        result.Lines.Add(best.String);

        // Convert the normalized CGRect to image coordinates
        var box = textObservation.BoundingBox;
        var left = (int)(box.X * imageSize.Width);
        var top = (int)((1 - box.Y - box.Height) * imageSize.Height); // flip the Y coordinate
        var boxWidth = (int)(box.Width * imageSize.Width);
        var boxHeight = (int)(box.Height * imageSize.Height);

        // Splitting by spaces to create elements might not be accurate for all languages/scripts.
        // Every word of a line shares that line's bounding box and confidence.
        foreach (var word in best.String.Split(" "))
        {
            result.Elements.Add(new OcrResult.OcrElement
            {
                Text = word,
                Confidence = best.Confidence,
                X = left,
                Y = top,
                Width = boxWidth,
                Height = boxHeight
            });
        }
    }

    result.Success = true;
    return result;
}

/// <summary>
/// Builds a <c>UIImage</c> from raw image bytes.
/// </summary>
/// <param name="data">The encoded image bytes, or null.</param>
/// <returns>Null when <paramref name="data"/> is null; otherwise a new image created from the bytes.</returns>
private static UIImage? ImageFromByteArray(byte[] data)
{
    if (data == null)
    {
        return null;
    }

    return new UIImage(NSData.FromArray(data));
}

/// <summary>
/// Takes an image and returns the text found in the image.
/// </summary>
/// <param name="imageData">The image data</param>
/// <param name="options">The options for OCR</param>
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
/// <exception cref="InvalidOperationException"></exception>
/// <exception cref="ArgumentException"></exception>
public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, OcrOptions options, CancellationToken ct = default)
{
if (!_isInitialized)
{
Expand Down Expand Up @@ -71,17 +161,38 @@ public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard =
tcs.TrySetResult(result);
});

switch (tryHard)
switch (options.TryHard)
{
case true:
recognizeTextRequest.RecognitionLevel = VNRequestTextRecognitionLevel.Accurate;
break;

case false:
recognizeTextRequest.RecognitionLevel = VNRequestTextRecognitionLevel.Fast;
break;
}

recognizeTextRequest.UsesLanguageCorrection = tryHard;
// Apply the requested recognition language, but only when the caller actually asked
// for one and the OS exposes the supported-language query (iOS 14.2+ / macOS 10.15.2+).
// The language guard must cover BOTH platform checks: previously "&&" bound tighter
// than "||", so on macOS this block ran with a null/empty language and threw
// "Unsupported language" even though no language had been requested.
if (!string.IsNullOrEmpty(options.Language)
    && (OperatingSystem.IsIOSVersionAtLeast(14, 2) || OperatingSystem.IsMacOSVersionAtLeast(10, 15, 2)))
{
    var supportedLangs = recognizeTextRequest.GetSupportedRecognitionLanguages(out var langError);
    if (langError != null)
    {
        // A specific exception type instead of the bare Exception; callers catching
        // Exception are unaffected.
        throw new InvalidOperationException(langError.LocalizedDescription);
    }

    var supportedLangList = supportedLangs.Select(ns => (string)ns).ToList();

    if (options.Language is string langString && supportedLangList.Contains(langString))
    {
        recognizeTextRequest.RecognitionLanguages = new[] { langString };
    }
    else
    {
        throw new ArgumentException($"Unsupported language \"{options.Language}\". Supported languages are: ({string.Join(",", supportedLangList)})", nameof(options.Language));
    }
}

recognizeTextRequest.UsesLanguageCorrection = options.TryHard;
recognizeTextRequest.UsesCpuOnly = false;
recognizeTextRequest.PreferBackgroundProcessing = true;
recognizeTextRequest.MinimumTextHeight = 0;
Expand All @@ -100,52 +211,4 @@ public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard =

return await tcs.Task;
}

/// <summary>
/// Converts the text observations of a finished Vision request into an <c>OcrResult</c>.
/// </summary>
/// <param name="request">The request whose results are read.</param>
/// <param name="imageSize">Size of the source image, used to scale the normalized bounding boxes.</param>
/// <returns>
/// A result with <c>Success</c> set to false when there are no observations; otherwise the
/// collected text, lines and elements with <c>Success</c> set to true.
/// </returns>
private static OcrResult ProcessRecognitionResults(VNRequest request, CGSize imageSize)
{
    var result = new OcrResult();
    var textObservations = request.GetResults<VNRecognizedTextObservation>();

    // Nothing recognized: report failure with an otherwise untouched result.
    if (textObservations == null || textObservations.Length == 0)
    {
        result.Success = false;
        return result;
    }

    foreach (var textObservation in textObservations)
    {
        var best = textObservation.TopCandidates(1).FirstOrDefault();
        if (best == null)
        {
            continue;
        }

        result.AllText += " " + best.String;
        result.Lines.Add(best.String);

        // Convert the normalized CGRect to image coordinates
        var box = textObservation.BoundingBox;
        var left = (int)(box.X * imageSize.Width);
        var top = (int)((1 - box.Y - box.Height) * imageSize.Height); // flip the Y coordinate
        var boxWidth = (int)(box.Width * imageSize.Width);
        var boxHeight = (int)(box.Height * imageSize.Height);

        // Splitting by spaces to create elements might not be accurate for all languages/scripts.
        // Every word of a line shares that line's bounding box and confidence.
        foreach (var word in best.String.Split(" "))
        {
            result.Elements.Add(new OcrResult.OcrElement
            {
                Text = word,
                Confidence = best.Confidence,
                X = left,
                Y = top,
                Width = boxWidth,
                Height = boxHeight
            });
        }
    }

    result.Success = true;
    return result;
}

/// <summary>
/// Builds a <c>UIImage</c> from raw image bytes.
/// </summary>
/// <param name="data">The encoded image bytes, or null.</param>
/// <returns>Null when <paramref name="data"/> is null; otherwise a new image created from the bytes.</returns>
private static UIImage? ImageFromByteArray(byte[] data)
{
    if (data == null)
    {
        return null;
    }

    return new UIImage(NSData.FromArray(data));
}
}
9 changes: 8 additions & 1 deletion src/Plugin.Maui.OCR/OcrImplementation.net.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ namespace Plugin.Maui.OCR;
// This usually is a placeholder as .NET MAUI apps typically don't run on .NET generic targets unless through unit tests and such
partial class OcrImplementation : IOcrService
{
// Placeholder target: there is no OCR backend here, so reading the property always throws.
public IReadOnlyCollection<string> SupportedLanguages
{
    get { throw new NotImplementedException(); }
}

/// <summary>
/// Initialize the OCR on the platform
/// </summary>
Expand All @@ -19,7 +21,12 @@ public Task InitAsync(CancellationToken ct = default)
/// <param name="tryHard">No effect.</param>
/// <param name="ct">An optional cancellation token</param>
/// <returns>The OCR result</returns>
public Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard = false, CancellationToken ct = default)
public async Task<OcrResult> RecognizeTextAsync(byte[] imageData, bool tryHard = false, CancellationToken ct = default)
{
    // Legacy overload: delegate to the options-based overload with no language.
    // Kept async so that an exception thrown synchronously by the callee is
    // delivered through the returned task, as before.
    var options = new OcrOptions(Language: null, TryHard: tryHard);
    var result = await RecognizeTextAsync(imageData, options, ct);
    return result;
}

/// <summary>
/// Not implemented on the generic .NET target; always throws.
/// </summary>
/// <param name="imageData">Unused.</param>
/// <param name="options">Unused.</param>
/// <param name="ct">Unused.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown, synchronously.</exception>
public Task<OcrResult> RecognizeTextAsync(byte[] imageData, OcrOptions options, CancellationToken ct = default)
    => throw new NotImplementedException();
Expand Down
Loading

0 comments on commit 00f4e19

Please sign in to comment.