# Builds the service image and publishes it to GitHub Container Registry.
# Pull requests only build (no login, no push); pushes to main build and publish.
name: Docker Build and Push

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        # Pull-request builds are never pushed, so they need no registry credentials.
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          # Publish only for pushes to main; unreviewed PR code must not replace the image.
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # API_KEY is read from the repository secret of the same name and must
          # never be written in this file.
          # NOTE(review): the Dockerfile copies these build args into ENV, so the
          # key is still readable from the published image; prefer supplying
          # API_KEY at runtime (`docker run -e API_KEY=...`).
          build-args: |
            API_KEY=${{ secrets.API_KEY }}
            BASE_URL=https://open.bigmodel.cn/api/paas/v4
            MODEL=glm-4v-flash
            PROMPT=提取图片中全部的文本,不需要任何推理和总结,只需要原文
            FILE_DELETE_DELAY=300
+ +RUN pip3 install -r requirements.txt + +# 暴露端口(修改为 8000) +EXPOSE 8000 + +# 使用 uvicorn 运行 FastAPI 应用,端口改为 8000 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/README b/README new file mode 100644 index 0000000..1c15ab9 --- /dev/null +++ b/README @@ -0,0 +1,71 @@ +# Image Text Extraction Service + +这是一个基于 FastAPI 的图片文本提取服务,支持智谱 AI 的 GLM-4V 模型和阿里云百炼平台的 Qwen-VL-Max 模型进行图片文本识别。 + +## 环境变量说明 + +服务支持以下环境变量配置: + +| 环境变量 | 说明 | 默认值 | +|---------|------|--------| +| API_KEY | AI 平台的 API 密钥 | XXXX | +| BASE_URL | AI 平台的 API 基础 URL | https://open.bigmodel.cn/api/paas/v4 | +| MODEL | 使用的模型名称 | glm-4v-flash | +| FILE_DELETE_DELAY | 临时文件删除延迟(秒) | 300 | +| PROMPT | 文本提取提示词 | 提取图片中全部的文本,不需要任何推理和总结,只需要原文 | + +### 支持的模型配置 + +#### 智谱 AI +- MODEL=glm-4v-flash +- BASE_URL=https://open.bigmodel.cn/api/paas/v4 + +#### 阿里云百炼 +- MODEL=qwen-vl-max +- BASE_URL=https://dashscope.aliyuncs.com/api/v1 + +## Docker 使用说明 + +### 1. 快速使用(默认使用智谱 AI) + +```bash +docker run -p 8000:8000 pig4cloud/markitdown +``` + +### 2. 使用阿里云百炼平台 + +```bash +docker run -d \ + -p 8000:8000 \ + -e API_KEY=your_aliyun_api_key \ + -e MODEL=qwen-vl-max \ + -e BASE_URL=https://dashscope.aliyuncs.com/api/v1 \ + pig4cloud/markitdown +``` + +## API 接口 + +### 上传图片或 PDF 并提取文本 + +**Endpoint:** POST /upload/ + +**请求格式:** multipart/form-data + +**参数:** +- file: 图片或 PDF 文件 + +**响应示例:** +```json +{ + "text": "提取的文本内容" +} +``` + +## 注意事项 + +1. 使用前请确保已获取相应平台的 API 密钥 +2. 智谱 AI 和阿里云百炼平台的接口略有不同,请确保使用正确的配置 +3. 上传的文件会在处理后自动删除(默认5分钟) +4. 
# Strong references to fire-and-forget cleanup tasks. The event loop only keeps
# weak references to tasks, so an unreferenced task may be garbage-collected
# before its delayed deletion runs.
_background_tasks: set = set()


async def process_pdf_page(
    page: fitz.Page,
    markitdown: MarkItDown,
    mlm_prompt: str
) -> Optional[str]:
    """Extract the text of one PDF page, falling back to OCR on a rendered image.

    Args:
        page: PDF page object.
        markitdown: MarkItDown instance used to convert the rendered page image.
        mlm_prompt: Prompt passed to the converter for text extraction.

    Returns:
        The extracted text, or None if the OCR fallback failed.
    """
    # Prefer the embedded text layer when the page has one.
    text = page.get_text().strip()
    if text:
        return text

    # No text layer: render the page to a PNG and run it through the converter.
    temp_path = None
    try:
        pix = page.get_pixmap()
        # Only reserve a unique name here; the handle is closed before writing
        # so the file can be reopened by name on every platform.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
            temp_path = temp_img.name
        pix.save(temp_path)
        # convert() is blocking (it calls the model API); keep it off the event loop.
        result = await asyncio.to_thread(
            markitdown.convert, temp_path, mlm_prompt=mlm_prompt
        )
        return result.text_content
    except Exception as e:
        print(f"页面处理错误: {str(e)}")
        return None
    finally:
        # Remove the temporary image on both the success and the failure path.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as e:
                print(f"页面处理错误: {str(e)}")


async def extract_pdf_text(
    pdf_path: str,
    markitdown: MarkItDown,
    mlm_prompt: str
) -> Tuple[bool, str]:
    """Extract text from every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file.
        markitdown: MarkItDown instance used for the per-page OCR fallback.
        mlm_prompt: Prompt passed to the converter for text extraction.

    Returns:
        (success, text): success is True when at least one page yielded text;
        text is the page texts joined with newlines, or "" on failure.
    """
    all_text = []
    try:
        # The context manager closes the document even if a page raises.
        with fitz.open(pdf_path) as pdf_document:
            for page in pdf_document:
                text = await process_pdf_page(page, markitdown, mlm_prompt)
                if text:
                    all_text.append(text)
    except Exception as e:
        print(f"PDF处理错误: {str(e)}")
        return False, ""

    if all_text:
        return True, '\n'.join(all_text)
    return False, ""


@router.post("/upload/",
    response_model=Dict[str, str],
    status_code=status.HTTP_200_OK,
    summary="上传文件",
    description="上传图片或PDF文件并提取其中的文本内容",
    responses={
        200: {
            "description": "成功提取文本",
            "content": {
                "application/json": {
                    "example": {
                        "text": "提取的文本内容"
                    }
                }
            }
        }
    }
)
async def upload_file(
    file: UploadFile = File(..., description="要上传的文件,支持常见图片格式和PDF文件")
):
    """Save the uploaded file, extract its text and schedule the file's deletion.

    Args:
        file: Uploaded image or PDF file.

    Returns:
        A dict with a single "text" key holding the extracted text ("" if none).
    """
    timestamp = int(time.time())
    # The client may omit the filename; treat that as "no extension".
    file_extension = os.path.splitext(file.filename or "")[1].lower()
    # The random suffix keeps uploads arriving within the same second from
    # overwriting each other's file.
    new_filename = f"{timestamp}_{os.urandom(4).hex()}{file_extension}"

    content = await file.read()
    file_path = await save_upload_file(content, new_filename)

    try:
        markitdown = MarkItDown(mlm_client=client, mlm_model=MODEL)
        # convert() is blocking; run it in a worker thread so other requests
        # are still served while the model call is in flight.
        result = await asyncio.to_thread(
            markitdown.convert, file_path, mlm_prompt=MLM_PROMPT
        )
        text_content = result.text_content

        # PDF with no text from the first pass: fall back to page-by-page extraction.
        if file_extension == '.pdf' and not text_content:
            success, text = await extract_pdf_text(file_path, markitdown, MLM_PROMPT)
            if success:
                text_content = text
    finally:
        # Schedule the delayed deletion even when conversion failed, so failed
        # uploads do not pile up on disk.
        cleanup_task = asyncio.create_task(
            delete_files(file_path, "", FILE_DELETE_DELAY)
        )
        _background_tasks.add(cleanup_task)
        cleanup_task.add_done_callback(_background_tasks.discard)

    return {
        "text": text_content or ""
    }
async def save_upload_file(file_content: bytes, filename: str, directory: str = "files") -> str:
    """Write uploaded bytes to ``directory/filename`` and return the resulting path.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Name to store the file under.
        directory: Target directory; created if it does not exist.

    Returns:
        The path of the written file.
    """
    # exist_ok avoids the check-then-create race when two uploads arrive at once.
    os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, filename)
    async with aiofiles.open(file_path, 'wb') as out_file:
        await out_file.write(file_content)
    return file_path


async def delete_files(file_path: str, output_path: Optional[str], delay: int):
    """Delete ``file_path`` and, if given, ``output_path`` after ``delay`` seconds.

    Args:
        file_path: Path of the file to delete.
        output_path: Optional second path to delete; None or "" is skipped.
        delay: Seconds to wait before deleting.
    """
    await asyncio.sleep(delay)
    for path in (file_path, output_path):
        if not path:
            continue
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already removed by someone else; nothing left to clean up.
            pass
allow_methods=["*"], + allow_headers=["*"], + ) + + # 注册路由 + app.include_router(router) + + @app.exception_handler(HTTPException) + async def http_exception_handler(request, exc): + return JSONResponse( + status_code=exc.status_code, + content={"message": exc.detail}, + ) + + return app + +app = create_app() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..bf0549c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +fastapi +aiofiles +markitdown +uvicorn +openai +socksio +PyMuPDF +python-multipart +python-dotenv==1.0.1