-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding example for PDF to image conversion in a notebook
- Loading branch information
1 parent
8b89863
commit dacbe5f
Showing
1 changed file
with
174 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "cc9d9bf6-9da8-4676-b8f9-2813c59df9f4", | ||
"metadata": {}, | ||
"source": [ | ||
"# PDF to image conversion utility\n", | ||
"\n", | ||
"This notebook provides a utility function to convert PDF files into images using the `pdf2image` library.\n", | ||
"\n", | ||
"## Installation instructions\n", | ||
"\n", | ||
"Before running this notebook, you need to install some dependencies. Run the following commands in your terminal based on your operating system:\n", | ||
"\n", | ||
"### For Linux (Ubuntu/Debian):\n", | ||
"```bash\n", | ||
"sudo apt-get update\n", | ||
"sudo apt-get install poppler-utils\n", | ||
"```\n", | ||
"\n", | ||
"### For MacOS:\n", | ||
"```bash\n", | ||
"brew install poppler\n", | ||
"```" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "efce81109fdba0b3", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-09-16T02:45:34.764142Z", | ||
"start_time": "2024-09-16T02:45:33.616190Z" | ||
} | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: pdf2image in /Users/sim/Projects/litepali/venv/lib/python3.11/site-packages (1.17.0)\r\n", | ||
"Requirement already satisfied: pillow in /Users/sim/Projects/litepali/venv/lib/python3.11/site-packages (from pdf2image) (10.4.0)\r\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install pdf2image" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "16ae2110e9aa484e", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-09-16T02:45:38.554681Z", | ||
"start_time": "2024-09-16T02:45:38.360640Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import multiprocessing\n", | ||
"from pdf2image import convert_from_path" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "22bcd30911bc84c9", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-09-16T02:45:40.706663Z", | ||
"start_time": "2024-09-16T02:45:40.701061Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def convert_pdf_to_images(pdf_path, output_folder, dpi=300, thread_count=None):\n", | ||
" \"\"\"\n", | ||
" Convert a PDF file to a series of images.\n", | ||
" \n", | ||
" :param pdf_path: Path to the PDF file\n", | ||
" :param output_folder: Folder to save the output images\n", | ||
" :param dpi: DPI for the output images (default: 300)\n", | ||
" :param thread_count: Number of threads to use (default: auto-detect)\n", | ||
" \"\"\"\n", | ||
" # Create output folder if it doesn't exist\n", | ||
" os.makedirs(output_folder, exist_ok=True)\n", | ||
" \n", | ||
" # Auto-discover thread count if not provided\n", | ||
" if thread_count is None:\n", | ||
" thread_count = multiprocessing.cpu_count()\n", | ||
" \n", | ||
" # Convert PDF to images\n", | ||
" images = convert_from_path(\n", | ||
" pdf_path,\n", | ||
" dpi=dpi,\n", | ||
" thread_count=thread_count\n", | ||
" )\n", | ||
" \n", | ||
" # Save images\n", | ||
" for i, image in enumerate(images):\n", | ||
" image_path = os.path.join(output_folder, f'page_{i+1}.png')\n", | ||
" image.save(image_path, 'PNG')\n", | ||
" \n", | ||
" print(f\"Converted {len(images)} pages to images in {output_folder}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "379872a03498e63c", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-09-16T02:48:17.482166Z", | ||
"start_time": "2024-09-16T02:48:17.476042Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Adapt your paths\n", | ||
"pdf_path = 'input.pdf'\n", | ||
"output_folder = 'results'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "61b78ee6f9514596", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-09-16T02:49:36.632924Z", | ||
"start_time": "2024-09-16T02:48:21.096150Z" | ||
} | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Converted 218 pages to images in results\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"convert_pdf_to_images(pdf_path, output_folder)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |