Skip to content

Commit

Permalink
adding example for PDF to image conversion in a notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
s-emanuilov committed Sep 16, 2024
1 parent 8b89863 commit dacbe5f
Showing 1 changed file with 174 additions and 0 deletions.
174 changes: 174 additions & 0 deletions examples/convert_pdf_to_images.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cc9d9bf6-9da8-4676-b8f9-2813c59df9f4",
"metadata": {},
"source": [
"# PDF to image conversion utility\n",
"\n",
"This notebook provides a utility function to convert PDF files into images using the `pdf2image` library.\n",
"\n",
"## Installation instructions\n",
"\n",
"Before running this notebook, you need to install some dependencies. Run the following commands in your terminal based on your operating system:\n",
"\n",
"### For Linux (Ubuntu/Debian):\n",
"```bash\n",
"sudo apt-get update\n",
"sudo apt-get install poppler-utils\n",
"```\n",
"\n",
"### For MacOS:\n",
"```bash\n",
"brew install poppler\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "efce81109fdba0b3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-16T02:45:34.764142Z",
"start_time": "2024-09-16T02:45:33.616190Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pdf2image in /Users/sim/Projects/litepali/venv/lib/python3.11/site-packages (1.17.0)\r\n",
"Requirement already satisfied: pillow in /Users/sim/Projects/litepali/venv/lib/python3.11/site-packages (from pdf2image) (10.4.0)\r\n"
]
}
],
"source": [
"!pip install pdf2image"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "16ae2110e9aa484e",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-16T02:45:38.554681Z",
"start_time": "2024-09-16T02:45:38.360640Z"
}
},
"outputs": [],
"source": [
"import os\n",
"import multiprocessing\n",
"from pdf2image import convert_from_path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "22bcd30911bc84c9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-16T02:45:40.706663Z",
"start_time": "2024-09-16T02:45:40.701061Z"
}
},
"outputs": [],
"source": [
"def convert_pdf_to_images(pdf_path, output_folder, dpi=300, thread_count=None):\n",
" \"\"\"\n",
" Convert a PDF file to a series of images.\n",
" \n",
" :param pdf_path: Path to the PDF file\n",
" :param output_folder: Folder to save the output images\n",
" :param dpi: DPI for the output images (default: 300)\n",
" :param thread_count: Number of threads to use (default: auto-detect)\n",
" \"\"\"\n",
" # Create output folder if it doesn't exist\n",
" os.makedirs(output_folder, exist_ok=True)\n",
" \n",
" # Auto-discover thread count if not provided\n",
" if thread_count is None:\n",
" thread_count = multiprocessing.cpu_count()\n",
" \n",
" # Convert PDF to images\n",
" images = convert_from_path(\n",
" pdf_path,\n",
" dpi=dpi,\n",
" thread_count=thread_count\n",
" )\n",
" \n",
" # Save images\n",
" for i, image in enumerate(images):\n",
" image_path = os.path.join(output_folder, f'page_{i+1}.png')\n",
" image.save(image_path, 'PNG')\n",
" \n",
" print(f\"Converted {len(images)} pages to images in {output_folder}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "379872a03498e63c",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-16T02:48:17.482166Z",
"start_time": "2024-09-16T02:48:17.476042Z"
}
},
"outputs": [],
"source": [
"# Adapt your paths\n",
"pdf_path = 'input.pdf'\n",
"output_folder = 'results'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "61b78ee6f9514596",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-16T02:49:36.632924Z",
"start_time": "2024-09-16T02:48:21.096150Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converted 218 pages to images in results\n"
]
}
],
"source": [
"convert_pdf_to_images(pdf_path, output_folder)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit dacbe5f

Please sign in to comment.