Skip to content

Commit

Permalink
init commit
Browse files Browse the repository at this point in the history
  • Loading branch information
lltx committed Dec 17, 2024
0 parents commit 2ffbb52
Show file tree
Hide file tree
Showing 11 changed files with 424 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# API配置
API_KEY=your_api_key_here
BASE_URL=https://open.bigmodel.cn/api/paas/v4
MODEL=glm-4v-flash
PROMPT=提取图片中全部的文本,不需要任何推理和总结,只需要原文

# 文件处理配置
# 临时文件删除延迟(秒),默认 5 分钟
FILE_DELETE_DELAY=300
49 changes: 49 additions & 0 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Docker Build and Push

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          # Pull requests only verify that the image builds; images are
          # published from pushes to main.
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # SECURITY: the API key must come from the repository secret store,
          # never from a literal in this file. The key that was previously
          # committed here is exposed in git history and must be revoked.
          build-args: |
            API_KEY=${{ secrets.API_KEY }}
            BASE_URL=https://open.bigmodel.cn/api/paas/v4
            MODEL=glm-4v-flash
            PROMPT=提取图片中全部的文本,不需要任何推理和总结,只需要原文
            FILE_DELETE_DELAY=300
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
__pycache__/
*.py[cod]
*$py.class

# 环境文件
.env

# 上传文件目录
files/

# 虚拟环境
venv/
36 changes: 36 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use the official Python image as the base image
FROM python:3.13-alpine

USER root

# Build arguments
ARG API_KEY
ARG BASE_URL
ARG MODEL
ARG PROMPT
ARG FILE_DELETE_DELAY

# Environment variables
# NOTE(review): values copied from ARG into ENV are stored in the image and can
# be read by anyone able to pull it; prefer passing API_KEY at run time
# (`docker run -e API_KEY=...`).
ENV API_KEY=${API_KEY}
ENV BASE_URL=${BASE_URL}
ENV MODEL=${MODEL}
ENV PROMPT=${PROMPT}
# Fall back to 300 so an unset build arg does not become an empty string,
# which app/config.py cannot parse as an integer.
ENV FILE_DELETE_DELAY=${FILE_DELETE_DELAY:-300}
ENV PYTHONUNBUFFERED=1

# Runtime dependency
RUN apk add --no-cache ffmpeg

# Working directory
WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the project files into the container
COPY . .

# Port the service listens on
EXPOSE 8000

# Run the FastAPI app with uvicorn. `--reload` is a development-only option and
# is omitted here: uploads are written below the working directory, so the file
# watcher could restart the server on an uploaded file.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
72 changes: 72 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Image Text Extraction Service

这是一个基于 FastAPI 的图片文本提取服务,支持智谱 AI 的 GLM-4V 模型和阿里云百炼平台的 Qwen-VL-Max 模型进行图片文本识别。

## 环境变量说明

服务支持以下环境变量配置:

| 环境变量 | 说明 | 默认值 |
|---------|------|--------|
| API_KEY | AI 平台的 API 密钥 | XXXX |
| BASE_URL | AI 平台的 API 基础 URL | https://open.bigmodel.cn/api/paas/v4 |
| MODEL | 使用的模型名称 | glm-4v-flash |
| FILE_DELETE_DELAY | 临时文件删除延迟(秒) | 300 |
| PROMPT | 文本提取提示词 | 提取图片中全部的文本,不需要任何推理和总结,只需要原文 |

### 支持的模型配置

#### 智谱 AI
- MODEL=glm-4v-flash
- BASE_URL=https://open.bigmodel.cn/api/paas/v4

#### 阿里云百炼
- MODEL=qwen-vl-max
- BASE_URL=https://dashscope.aliyuncs.com/api/v1

## Docker 使用说明

### 1. 快速使用(默认使用智谱 AI)

```bash
docker run -p 8000:8000 pig4cloud/markitdown
```

### 2. 使用阿里云百炼平台

```bash
docker run -d \
-p 8000:8000 \
-e API_KEY=your_aliyun_api_key \
-e MODEL=qwen-vl-max \
-e BASE_URL=https://dashscope.aliyuncs.com/api/v1 \
pig4cloud/markitdown
```

## API 接口

### 上传图片并提取文本

**Endpoint:** POST /upload/

**请求格式:** multipart/form-data

**参数:**
- file: 图片或 PDF 文件

**响应示例:**
```json
{
"text": "提取的文本内容"
}
```

## 注意事项

1. 使用前请确保已获取相应平台的 API 密钥
2. 智谱 AI 和阿里云百炼平台的接口略有不同,请确保使用正确的配置
3. 上传的文件会在处理后延迟自动删除(默认 5 分钟,可通过 FILE_DELETE_DELAY 调整)
4. 服务默认监听 8000 端口


140 changes: 140 additions & 0 deletions app/api/endpoints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import os
import time
import asyncio
import fitz # PyMuPDF
from fastapi import APIRouter, File, UploadFile, status
from typing import Dict, Optional, Tuple
from openai import OpenAI
from markitdown import MarkItDown
import tempfile

from app.config import (
API_KEY,
BASE_URL,
MODEL,
FILE_DELETE_DELAY,
MLM_PROMPT
)
from app.utils.file_utils import save_upload_file, delete_files

# Router on which this module's endpoints are registered.
router = APIRouter()

# OpenAI client built from the configured base URL and API key; upload_file
# hands it to MarkItDown as `mlm_client`.
client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY
)

async def process_pdf_page(
    page: fitz.Page,
    markitdown: MarkItDown,
    mlm_prompt: str
) -> Optional[str]:
    """Extract the text of a single PDF page, falling back to image conversion.

    The page's embedded text is tried first. When the page yields no text, it
    is rendered to a temporary PNG which is passed to ``markitdown.convert``
    together with ``mlm_prompt``.

    Args:
        page: PDF page to process.
        markitdown: MarkItDown instance used to convert the rendered image.
        mlm_prompt: Prompt forwarded to ``markitdown.convert``.

    Returns:
        The extracted text, or ``None`` when rendering or conversion fails.
    """
    # Try the embedded text first.
    text = page.get_text().strip()
    if text:
        return text

    # No embedded text: render the page to an image and convert that instead.
    temp_path = None
    try:
        pix = page.get_pixmap()
        # Only reserve a unique file name here; the handle is closed before
        # the pixmap is written so the file is never opened twice at once.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
            temp_path = temp_img.name
        pix.save(temp_path)
        result = markitdown.convert(temp_path, mlm_prompt=mlm_prompt)
        return result.text_content
    except Exception as e:
        print(f"页面处理错误: {str(e)}")
        return None
    finally:
        # Remove the temporary image on every path, including failures.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the
                # conversion result.
                pass

async def extract_pdf_text(
    pdf_path: str,
    markitdown: MarkItDown,
    mlm_prompt: str
) -> Tuple[bool, str]:
    """Extract text from every page of a PDF file.

    Each page is handed to ``process_pdf_page``; the non-empty page texts are
    joined with newlines.

    Args:
        pdf_path: Path of the PDF file.
        markitdown: MarkItDown instance forwarded to ``process_pdf_page``.
        mlm_prompt: Prompt forwarded to ``process_pdf_page``.

    Returns:
        ``(True, text)`` when at least one page produced text, otherwise
        ``(False, "")``. Errors are printed and also reported as
        ``(False, "")``.
    """
    all_text = []
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            for page in pdf_document:
                text = await process_pdf_page(page, markitdown, mlm_prompt)
                if text:
                    all_text.append(text)
        finally:
            # Close the document even when a page fails part-way through.
            pdf_document.close()
    except Exception as e:
        print(f"PDF处理错误: {str(e)}")
        return False, ""

    if all_text:
        return True, '\n'.join(all_text)
    return False, ""

# Strong references to pending cleanup tasks. The event loop only keeps weak
# references to tasks, so an otherwise unreferenced task may be garbage
# collected before it runs.
_cleanup_tasks = set()


@router.post("/upload/",
    response_model=Dict[str, str],
    status_code=status.HTTP_200_OK,
    summary="上传文件",
    description="上传图片或PDF文件并提取其中的文本内容",
    responses={
        200: {
            "description": "成功提取文本",
            "content": {
                "application/json": {
                    "example": {
                        "text": "提取的文本内容"
                    }
                }
            }
        }
    }
)
async def upload_file(
    file: UploadFile = File(..., description="要上传的文件,支持常见图片格式和PDF文件")
):
    """Store an uploaded file, extract its text and schedule its deletion.

    The upload is saved under a generated name, converted with MarkItDown and,
    for PDFs that yield no text, retried page by page through
    ``extract_pdf_text``. Deletion of the stored file after
    ``FILE_DELETE_DELAY`` seconds is scheduled whether or not conversion
    succeeds.

    Args:
        file: The uploaded image or PDF.

    Returns:
        ``{"text": <extracted text>}``; the text is ``""`` when nothing could
        be extracted.
    """
    timestamp = int(time.time())
    # `filename` can be missing from the multipart part; treat that as
    # "no extension" instead of failing in splitext.
    file_extension = os.path.splitext(file.filename or "")[1].lower()
    # The random suffix keeps uploads that arrive within the same second from
    # overwriting each other.
    new_filename = f"{timestamp}_{os.urandom(8).hex()}{file_extension}"

    content = await file.read()
    file_path = await save_upload_file(content, new_filename)

    try:
        markitdown = MarkItDown(mlm_client=client, mlm_model=MODEL)
        # convert() is synchronous; run it in a worker thread so other
        # requests are not stalled while it works.
        result = await asyncio.to_thread(
            markitdown.convert, file_path, mlm_prompt=MLM_PROMPT
        )
        text_content = result.text_content

        # PDF with no extractable text: fall back to per-page extraction.
        if file_extension == '.pdf' and not text_content:
            success, text = await extract_pdf_text(file_path, markitdown, MLM_PROMPT)
            if success:
                text_content = text
    finally:
        # Schedule the delayed deletion even when conversion failed, so failed
        # uploads do not accumulate on disk.
        cleanup = asyncio.create_task(delete_files(file_path, "", FILE_DELETE_DELAY))
        _cleanup_tasks.add(cleanup)
        cleanup.add_done_callback(_cleanup_tasks.discard)

    return {
        "text": text_content or ""
    }
20 changes: 20 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os
from dotenv import load_dotenv

# 加载.env文件
load_dotenv()

# API configuration
API_KEY = os.getenv('API_KEY')
BASE_URL = os.getenv('BASE_URL')
MODEL = os.getenv('MODEL')

# File-handling configuration
# `or` is used instead of a getenv default so that a variable that is set but
# empty also falls back to 300 seconds (5 minutes).
_raw_delete_delay = os.getenv('FILE_DELETE_DELAY') or '300'
try:
    FILE_DELETE_DELAY = int(_raw_delete_delay)
except ValueError:
    raise ValueError(
        "FILE_DELETE_DELAY must be an integer number of seconds, "
        f"got {_raw_delete_delay!r}"
    ) from None
MLM_PROMPT = os.getenv('PROMPT')

# Fail fast at import time when a required environment variable is missing
# or empty.
required_vars = ['API_KEY', 'BASE_URL', 'MODEL', 'PROMPT']
missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
    raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
22 changes: 22 additions & 0 deletions app/utils/file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os
import asyncio
import aiofiles
from typing import Optional

async def save_upload_file(file_content: bytes, filename: str, directory: str = "files") -> str:
    """Write uploaded bytes to ``directory/filename`` and return that path.

    Args:
        file_content: Raw bytes of the upload.
        filename: Name of the file to create inside ``directory``.
        directory: Target directory; created if it does not exist.

    Returns:
        The path of the written file.
    """
    # exist_ok avoids the check-then-create race when two uploads arrive
    # before the directory exists.
    os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, filename)
    async with aiofiles.open(file_path, 'wb') as out_file:
        await out_file.write(file_content)
    return file_path

async def delete_files(file_path: str, output_path: Optional[str], delay: int):
    """Delete ``file_path`` and, if given, ``output_path`` after a delay.

    Args:
        file_path: Path of the file to remove.
        output_path: Optional second path to remove; skipped when falsy.
        delay: Seconds to wait before deleting.

    Files that no longer exist are ignored.
    """
    await asyncio.sleep(delay)

    targets = [file_path]
    if output_path:
        targets.append(output_path)

    for target in targets:
        # Remove first and tolerate a missing file, rather than checking
        # existence beforehand: the file may vanish between check and removal.
        try:
            os.remove(target)
        except FileNotFoundError:
            pass
8 changes: 8 additions & 0 deletions banner.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
+----------------------------------------+
| |
| 🚀 MarkItDown API Server |
| |
| ✨ Server is running... |
| 🌐http://localhost:8000/docs |
| |
+----------------------------------------+
Loading

0 comments on commit 2ffbb52

Please sign in to comment.