Migration to FastAPI #1

Open · wants to merge 4 commits into base: main
4 changes: 0 additions & 4 deletions .envrc

This file was deleted.

17 changes: 7 additions & 10 deletions Dockerfile
@@ -1,27 +1,24 @@
FROM python:3.11-slim
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.11-slim

# lint it:
# $ docker run --rm -i hadolint/hadolint < Dockerfile

# hadolint ignore=DL3008
RUN apt-get update -y && apt-get install -y --no-install-recommends git gcc g++ curl && rm -rf /var/lib/apt/lists/*
# hadolint ignore=DL3013
RUN pip install --no-cache-dir --upgrade pip

WORKDIR /app

COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# download fasttext pretrained model
RUN mkdir -p /fasttext && curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin > /fasttext/lid.176.bin
# for fasttext pretrained model & nltk tokenizers
ENV MODEL_PATH=/fasttext/lid.176.bin

# download nltk punkt
RUN python -c "import nltk; nltk.download('punkt')"
RUN mkdir -p /fasttext && mkdir -p /root/nltk_data

VOLUME ["/fasttext"]
VOLUME ["/root/nltk_data"]

COPY . /app
RUN pip install --no-cache-dir -e .

CMD ["dnlp"]
COPY ./src/dnlp /app
COPY ./prestart.sh /app
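
Note: the old CMD ["dnlp"] entry point disappears because the new base image starts Gunicorn/Uvicorn itself: tiangolo/uvicorn-gunicorn-fastapi runs /app/prestart.sh (if present) and then imports an `app` object from /app/main.py. That main.py is not visible in this diff; a minimal sketch of what it presumably contains, wiring up the router from handlers.py:

# Hypothetical /app/main.py (not shown in this diff): the base image imports
# `main:app`, so it only needs to create the app and mount the router.
from fastapi import FastAPI

from handlers import router

app = FastAPI(title='dnlp')
app.include_router(router)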
10 changes: 6 additions & 4 deletions docker-compose.yml
@@ -2,17 +2,19 @@ version: '3.7'

services:
dnlp:
restart: always
restart: unless-stopped
build: .
ports:
- "9090:9090"
- "9090:80"
expose:
- 9090
logging:
driver: "none"
volumes:
- fasttext:/fasttext
- nltk_data:/root/nltk_data
environment:
LOG_LEVEL: "warning"
ACCESS_LOG: ""
GUNICORN_CMD_ARGS: "--max-requests=1000 --max-requests-jitter=100"

volumes:
fasttext:
16 changes: 16 additions & 0 deletions prestart.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

set -eu
set -x

# download fasttext pretrained model
if [ ! -f "/fasttext/lid.176.bin" ]
then
curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin > /fasttext/lid.176.bin
fi

# download nltk punkt
if [ ! -f "/root/nltk_data/tokenizers/punkt.zip" ]
then
python3 -c "import nltk; nltk.download('punkt')"
fi
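
handlers.py (below) imports get_fasttext_model, get_nltk and get_trafilatura_config from a helpers module that is not shown in this diff. A plausible sketch, assuming the loaders simply cache the artifacts that prestart.sh has placed in the mounted volumes, mirroring the loading code removed from handlers.py:

# Hypothetical excerpt from helpers.py (the module is not part of this diff).
# Assumes prestart.sh has already populated /fasttext and /root/nltk_data.
import os
from functools import lru_cache

from fasttext import load_model as ft_load_model
from nltk.data import load as nltk_load
from trafilatura.settings import use_config


@lru_cache(maxsize=1)
def get_fasttext_model():
    # MODEL_PATH is set in the Dockerfile: /fasttext/lid.176.bin
    return ft_load_model(os.environ['MODEL_PATH'])


@lru_cache(maxsize=None)
def get_nltk(language: str):
    # e.g. language='english' -> tokenizers/punkt/english.pickle
    return nltk_load(f'tokenizers/punkt/{language}.pickle')


@lru_cache(maxsize=1)
def get_trafilatura_config():
    config = use_config()
    config.set('DEFAULT', 'EXTRACTION_TIMEOUT', '0')
    return config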
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -0,0 +1,3 @@
[tool.ruff]
line-length = 120
target-version = 'py311'
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
aiohttp==3.8.4
-e git+https://github.com/facebookresearch/fastText.git@9ef22d#egg=fastText
ftfy==6.1.1
unicodedata2==15.0.0
nltk==3.8.1
trafilatura==1.4.1
Levenshtein==0.20.9
fastapi==0.92.0
18 changes: 0 additions & 18 deletions setup.py

This file was deleted.

30 changes: 0 additions & 30 deletions src/dnlp/app.py

This file was deleted.

6 changes: 6 additions & 0 deletions src/dnlp/exceptions.py
@@ -0,0 +1,6 @@
from fastapi import HTTPException as FastAPIHTTPException


class HTTPException(FastAPIHTTPException):
def __init__(self, detail: str) -> None:
super().__init__(status_code=400, detail=detail)
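
The subclass pins status_code to 400, so handlers only pass a message; with FastAPI's default exception handling the client then gets a JSON error body. A minimal illustration:

from exceptions import HTTPException

# Raised from any endpoint, this yields an HTTP 400 response with the body
# {"detail": "Tokenization error"}.
raise HTTPException(detail='Tokenization error')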
205 changes: 85 additions & 120 deletions src/dnlp/handlers.py
@@ -1,151 +1,116 @@
import asyncio
import os

from aiohttp.web import json_response
from aiohttp.web_response import Response
from fasttext import load_model as ft_load_model
from nltk.data import load as nltk_load
from fastapi import APIRouter, Body, Form
from fastapi.responses import PlainTextResponse
from trafilatura.core import extract as trafilatura_extract
from trafilatura.settings import use_config

from dnlp.helpers import abort, deduplicate_sentences
from dnlp.languages import PUNKT_LANGUAGES
from dnlp.postprocess import remap_prediction
from dnlp.preprocess import fix_bad_unicode, normalize_html, normalize_whitespace, preprocess_text


# fastText
MODEL_PATH = os.environ.get('MODEL_PATH', None)
if not MODEL_PATH:
raise RuntimeError('Environment variable "MODEL_PATH" empty')
FT_MODEL = ft_load_model(MODEL_PATH)

# nltk punkt
SENT_TOKENIZER = {}
from exceptions import HTTPException
from helpers import (
deduplicate_sentences,
get_fasttext_model,
get_nltk,
get_trafilatura_config,
remap_fasttext_prediction,
)
from languages import PUNKT_LANGUAGES, PunktLanguagesEnum
from preprocess import fix_bad_unicode, normalize_html, normalize_whitespace, preprocess_text
from response_examples import deduplicate_responses, detect_responses, extract_responses, tokenize_responses

# trafilatura config
TRAFILATURA_CONFIG = use_config()
TRAFILATURA_CONFIG.set('DEFAULT', 'EXTRACTION_TIMEOUT', '0')

router = APIRouter()

async def tokenize(request):
post_data = await request.post()
param_text = post_data.get('text', '')
FASTTEXT_MODEL = get_fasttext_model()
TRAFILATURA_CONFIG = get_trafilatura_config()

param_text = fix_bad_unicode(param_text, normalization='NFC')
param_text = normalize_whitespace(param_text)
param_text = param_text.strip()

if not param_text:
return abort('empty "text" parameter')
@router.post(
'/detect',
summary='Language detection',
responses=detect_responses,
)
def detect(
text: str = Form(description='Text for language detection'),
count: int = Form(description='Number of results', default=3),
threshold: float = Form(description='Threshold value', ge=0.0, le=1.0, default=0.01),
):

param_lang = post_data.get('lang', 'en')
# Text preprocessing (stripping links, line breaks, lowercasing, etc.).
# May affect speed because of the regexes involved.
# In the future some of the processors could be made optional (after benchmarking how much of a problem this actually is).
text = preprocess_text(text)

if param_lang in PUNKT_LANGUAGES.keys():
if param_lang not in SENT_TOKENIZER.keys():
# first tokenizer load (may be slow)
SENT_TOKENIZER[param_lang] = nltk_load(f'tokenizers/punkt/{PUNKT_LANGUAGES[param_lang]}.pickle')
else:
return abort('unknown language code')
if not text:
raise HTTPException(detail='Content of parameter `text` is empty after preprocessing')

loop = asyncio.get_event_loop()
sentences = await loop.run_in_executor(
executor=None,
func=lambda: SENT_TOKENIZER[param_lang].tokenize(
param_text
),
)

if not sentences:
return abort('tokenization error')
prediction = FASTTEXT_MODEL.predict(text, k=count, threshold=threshold)

return json_response(sentences)
if not prediction:
raise HTTPException(detail='Detection error')

return remap_fasttext_prediction(prediction)

async def detect(request):
post_data = await request.post()
param_text = post_data.get('text', '')

# preprocessing (normalization)
param_text = preprocess_text(param_text)
if not param_text:
return abort('empty "text" parameter')
@router.post(
'/tokenize',
summary='Splitting text into sentences',
responses=tokenize_responses,
)
def tokenize(
text: str = Form(description='Text to split into sentences'),
lang: PunktLanguagesEnum = Form(description='Text language', default=PunktLanguagesEnum.en),
):

param_count = post_data.get('count', 3)
param_count = int(param_count)
text = fix_bad_unicode(text, normalization='NFC')
text = normalize_whitespace(text)
text = text.strip()

loop = asyncio.get_event_loop()
prediction = await loop.run_in_executor(
executor=None,
func=lambda: FT_MODEL.predict(
param_text,
k=param_count,
threshold=0.01
),
)
if not text:
raise HTTPException(detail='Content of parameter `text` is empty after preprocessing')

if not prediction:
return abort('detection error')
nltk = get_nltk(PUNKT_LANGUAGES[lang.value])
sentences = nltk.tokenize(text)

return json_response(
remap_prediction(prediction)
)
if not sentences:
raise HTTPException(detail='Tokenization error')

return sentences

async def extract(request):
post_data = await request.post()
param_html = post_data.get('html', '')

if not param_html:
return abort('empty "html" parameter')
@router.post(
'/extract',
summary='Extraction of the main content from an HTML document',
response_class=PlainTextResponse,
responses=extract_responses,
)
def extract(
html: str = Form(description='HTML page content, encoded with the `urlencode` function'),
):

param_html = normalize_html(param_html)
html = normalize_html(html)

loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
executor=None,
func=lambda: trafilatura_extract(
param_html,
favor_precision=True,
include_comments=False,
config=TRAFILATURA_CONFIG,
),
text = trafilatura_extract(
html,
favor_precision=True,
include_comments=False,
config=TRAFILATURA_CONFIG,
)

if not result:
return abort('extract error')
if not text:
raise HTTPException(detail='Extraction error')

return Response(body=result, content_type='text/plain')
return text


async def deduplicate(request):
json_data = await request.json() # type: dict

if not json_data:
return abort('Not found `json` in request data')

sentences = json_data.get('sentences', None)
if not sentences:
return abort('Not found `sentences` in json')

if not isinstance(sentences, list):
return abort('Param `sentences` is not iterable: must be array of strings in json')

try:
threshold = float(json_data.get('threshold', 0.8))
except ValueError:
return abort('Param `threshold` is malformed: type is not float')

loop = asyncio.get_event_loop()
dedup_sentences = await loop.run_in_executor(
executor=None,
func=lambda: deduplicate_sentences(
sentences,
threshold=threshold,
),
)
@router.post(
'/deduplicate',
summary='Removal of near-duplicate sentences',
responses=deduplicate_responses,
)
def deduplicate(
sentences: list[str] = Body(description='Array of sentences'),
threshold: float = Body(description='Threshold value', ge=0.0, le=1.0, default=0.8),
):
dedup_sentences = deduplicate_sentences(sentences, threshold)

if not dedup_sentences:
return abort('deduplication error')
raise HTTPException(detail='Deduplication error')

return json_response(dedup_sentences)
return dedup_sentences
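
For reference, the first three endpoints take form-encoded fields, while /deduplicate reads a JSON body. A quick client sketch, assuming the 9090:80 port mapping from docker-compose.yml above and httpx as an arbitrary HTTP client:

import httpx

BASE = 'http://localhost:9090'

# /detect, /tokenize and /extract accept form-encoded fields.
print(httpx.post(f'{BASE}/detect', data={'text': 'Hello, world!', 'count': 3}).json())
print(httpx.post(f'{BASE}/tokenize', data={'text': 'First. Second.', 'lang': 'en'}).json())
print(httpx.post(f'{BASE}/extract', data={'html': '<html><body><p>Main content.</p></body></html>'}).text)

# /deduplicate expects a JSON object; FastAPI embeds the two Body() params
# as the keys "sentences" and "threshold".
print(httpx.post(
    f'{BASE}/deduplicate',
    json={'sentences': ['One sentence.', 'One sentence!'], 'threshold': 0.8},
).json())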