diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 88698867..a95486be 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,6 +1,11 @@
 name: Test
-
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - testing
+  pull_request:
+    branches:
+      - testing
 
 jobs:
   test:
@@ -9,22 +14,36 @@ jobs:
     strategy:
       matrix:
         python-version: [3.8, 3.9, 3.10.x, 3.11, 3.12]
-        os: [ubuntu-latest]
+        os: [ubuntu-latest, windows-latest]
 
     steps:
       - uses: actions/checkout@v2
+
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v1
         with:
           python-version: ${{ matrix.python-version }}
+
       - name: Install system dependencies (Linux)
+        if: runner.os == 'Linux'
         run: |
           sudo apt update
           sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript
           pip install -U ocrmypdf
+
+      - name: Install system dependencies (Windows)
+        if: runner.os == 'Windows'
+        run: |
+          choco install tesseract poppler imagemagick ghostscript
+          pip install -U ocrmypdf
+
       - name: Install testing dependencies
         run: |
           pip install -U wheel pip
           pip install --editable ".[test]"
+
+      - name: Lint with flake8
+        run: flake8
+
       - name: Test with pytest
         run: pytest
diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
index 7bea12dd..b5fbef01 100644
--- a/src/invoice2data/input/tesseract.py
+++ b/src/invoice2data/input/tesseract.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+import platform
 import shutil
 import tempfile
 import mimetypes
@@ -32,17 +32,21 @@ def to_text(path: str, area_details: dict = None):
     """
 
     # Check for dependencies. Needs Tesseract and Imagemagick installed.
+    # On Windows "convert" is a system tool; ImageMagick is run via "magick".
+    if platform.system() == "Windows":
+        convert_command = ["magick", "convert"]
+    else:
+        convert_command = ["convert"]
     if not shutil.which("tesseract"):
         raise EnvironmentError("tesseract not installed.")
-    if not shutil.which("convert"):
+    if not shutil.which(convert_command[0]):
         raise EnvironmentError("imagemagick not installed.")
 
     language = get_languages()
     logger.debug("tesseract language arg is, %s", language)
     timeout = 180
 
     # convert the (multi-page) pdf file to a 300dpi png
-    convert = [
-        "convert",
+    convert = convert_command + [
         "-units",
         "PixelsPerInch",