Skip to content

Commit

Permalink
Merge pull request #4 from linto-ai/next
Browse files Browse the repository at this point in the history
merge next -> master
  • Loading branch information
Jeronymous authored Sep 23, 2024
2 parents ba08000 + 5b8d5e1 commit 2910b7c
Show file tree
Hide file tree
Showing 17 changed files with 636 additions and 142 deletions.
15 changes: 15 additions & 0 deletions .env_default
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# SERVING PARAMETERS
SERVICE_MODE=task

# SERVICE PARAMETERS
SERVICES_BROKER=redis://172.17.0.1:6379
BROKER_PASS=

# SERVICE DISCOVERY
SERVICE_NAME=linto-punctuation
LANGUAGE=fr-FR
# QUEUE_NAME=(Optional)
# MODEL_INFO=This model does something

# CONCURRENCY
CONCURRENCY=2
8 changes: 0 additions & 8 deletions .env_default_http

This file was deleted.

15 changes: 0 additions & 15 deletions .env_default_task

This file was deleted.

29 changes: 11 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
FROM python:3.8
LABEL maintainer="[email protected]"
ENV PYTHONUNBUFFERED TRUE
ENV IMAGE_NAME linto-platform-diarization
FROM python:3.9
LABEL maintainer="[email protected]"

RUN apt-get update \
&& apt-get install --no-install-recommends -y \
ca-certificates \
g++ \
openjdk-11-jre-headless \
curl \
wget
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
g++ \
curl \
libtinfo5 \
wget

# Rust compiler for tokenizers
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
Expand All @@ -18,8 +16,8 @@ ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /usr/src/app

# Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt -f https://download.pytorch.org/whl/torch_stable.html

# Supervisor
COPY celery_app /usr/src/app/celery_app
Expand All @@ -28,13 +26,8 @@ COPY document /usr/src/app/document
COPY punctuation /usr/src/app/punctuation
RUN mkdir /usr/src/app/model-store
RUN mkdir -p /usr/src/app/tmp
COPY config.properties /usr/src/app/config.properties
COPY RELEASE.md ./
COPY docker-entrypoint.sh wait-for-it.sh healthcheck.sh ./

# Grep CURRENT VERSION
RUN export VERSION=$(awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //')

ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/punctuation"
HEALTHCHECK CMD ./healthcheck.sh

Expand Down
35 changes: 35 additions & 0 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# CPU image of the punctuation service: installs the Python dependencies listed
# in requirements.cpu.txt and starts the service through docker-entrypoint.sh.
FROM python:3.9
LABEL maintainer="[email protected]"

# System packages. The apt package index is deleted in the same layer that
# downloads it; removing it in a later instruction would not shrink the image.
# NOTE(review): nothing in this Dockerfile invokes Java — confirm that
# openjdk-11-jre-headless is still required.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        g++ \
        openjdk-11-jre-headless \
        wget && \
    rm -rf /var/lib/apt/lists/*

# Rust compiler for tokenizers
# With pipefail, a failed download fails the build instead of being masked by
# the exit status of the `bash` on the right-hand side of the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

WORKDIR /usr/src/app

# Python dependencies (copied before the application code so that this layer
# stays cached when only the sources change)
COPY requirements.cpu.txt .
RUN pip3 install --no-cache-dir -r requirements.cpu.txt -f https://download.pytorch.org/whl/torch_stable.html

# Application code
COPY celery_app /usr/src/app/celery_app
COPY http_server /usr/src/app/http_server
COPY document /usr/src/app/document
COPY punctuation /usr/src/app/punctuation

# Model mount point and scratch directory, created in a single layer
RUN mkdir -p /usr/src/app/model-store /usr/src/app/tmp
COPY docker-entrypoint.sh wait-for-it.sh healthcheck.sh ./

ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/punctuation"
HEALTHCHECK CMD ./healthcheck.sh

# TEMP points at the scratch directory created above
ENV TEMP=/usr/src/app/tmp
ENTRYPOINT ["./docker-entrypoint.sh"]
20 changes: 20 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,25 @@ pipeline {
}
}
}

// Builds the Docker image from ./Dockerfile and pushes it to Docker Hub under
// the 'recasepunc-latest' tag. Runs only when the built branch is 'recasepunc'.
stage('Docker build for recasepunc branch'){
when{
branch 'recasepunc'
}
steps {
echo 'Publishing recasepunc'
script {
// Image is named after env.DOCKER_HUB_REPO; build context is the workspace root.
image = docker.build(env.DOCKER_HUB_REPO, "-f Dockerfile .")
// Takes the first line of the first paragraph of RELEASE.md that contains '#',
// then strips the first '#' and the first space (e.g. '# 2.0.0' -> '2.0.0').
// NOTE(review): VERSION is not used in the visible part of this stage; only the
// fixed 'recasepunc-latest' tag is pushed — confirm whether a versioned tag is intended.
VERSION = sh(
returnStdout: true,
script: "awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //'"
).trim()
// Authenticates against Docker Hub with the credentials id held in env.DOCKER_HUB_CRED.
docker.withRegistry('https://registry.hub.docker.com', env.DOCKER_HUB_CRED) {
image.push('recasepunc-latest')
}
}
}
}

}// end stages
}
35 changes: 27 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,27 @@ LinTO-platform-punctuation can either be used as a standalone punctuation servic
## Pre-requisites

### Models
The punctuation service relies on a trained punctuation prediction model.
The punctuation service relies on a trained recasing and punctuation prediction model.

We provide homebrew models on [dl.linto.ai](https://dl.linto.ai/downloads/model-distribution/punctuation_models/).
Some models trained on [Common Crawl](http://data.statmt.org/cc-100/) are available on [recasepunc](https://github.com/benob/recasepunc) for the following languages:
* French
* [fr-txt.large.19000](https://github.com/benob/recasepunc/releases/download/0.3/fr-txt.large.19000)
* [fr.22000](https://github.com/benob/recasepunc/releases/download/0.3/fr.22000)
* English
* [en.23000](https://github.com/benob/recasepunc/releases/download/0.3/en.23000)
* Italian
* [it.22000](https://github.com/CoffeePerry/recasepunc/releases/download/v0.1.0/it.22000)
* Chinese
* [zh.24000](https://github.com/benob/recasepunc/releases/download/0.3/zh.24000)

<!-- We provide homebrew models on [dl.linto.ai](https://dl.linto.ai/downloads/model-distribution/punctuation_models/). -->

### Docker
The punctuation service requires docker up and running.

For GPU capabilities, you also need to install
[nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).

### (micro-service) Service broker
In task mode, the only entry point of the punctuation service is tasks posted on a Redis message broker using [Celery](https://github.com/celery/celery).

Expand All @@ -52,13 +66,13 @@ docker pull registry.linto.ai/lintoai/linto-platform-punctuation:latest

**2- Download the models**

Have the punctuation model (.mar) ready at MODEL_PATH.
Have the punctuation model ready at `<MODEL_PATH>`.

### HTTP

**1- Fill the .env**
```bash
cp .env_default_http .env
cp .env_default .env
```

Fill the .env with your values.
Expand All @@ -73,12 +87,14 @@ Fill the .env with your values.

```bash
docker run --rm \
-v MODEL_PATH:/usr/src/app/model-store/punctuation.mar \
-v <MODEL_PATH>:/usr/src/app/model-store/model \
-p HOST_SERVING_PORT:80 \
--env-file .env \
linto-platform-punctuation:latest
```

Also add ```--gpus all``` as an option to enable GPU capabilities.

This will run a container providing an HTTP API bound to the host's HOST_SERVING_PORT port.


Expand All @@ -90,7 +106,7 @@ You need a message broker up and running at MY_SERVICE_BROKER. Instance are typi

**1- Fill the .env**
```bash
cp .env_default_task .env
cp .env_default .env
```

Fill the .env with your values.
Expand Down Expand Up @@ -118,7 +134,7 @@ services:
punctuation-service:
image: linto-platform-punctuation:latest
volumes:
- /my/path/to/models/punctuation.mar:/usr/src/app/model-store/punctuation.mar
- /my/path/to/models/punctuation.mar:/usr/src/app/model-store/model
env_file: .env
deploy:
replicas: 1
Expand Down Expand Up @@ -156,7 +172,7 @@ The following information are registered:
"service_language": $LANGUAGE,
"queue_name": $QUEUE_NAME,
"version": "1.2.0", # This repository's version
"info": "Bert Based Punctuation model for french punctuation prediction",
"info": "Punctuation model for french punctuation prediction",
"last_alive": 65478213,
"concurrency": 1
}
Expand Down Expand Up @@ -223,3 +239,6 @@ curl -X POST "http://YOUR_SERVICE:YOUR_PORT/punctuation" -H "accept: applicatio

## License
This project is developed under the AGPLv3 License (see LICENSE).

## Acknowledgments
* [recasepunc](https://github.com/benob/recasepunc) Python library to train recasing and punctuation models, and to apply them (License BSD 3).
3 changes: 3 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# 2.0.0
- Integration of recasepunc

# 1.1.1
- Fix error on empty sentences

Expand Down
5 changes: 4 additions & 1 deletion celery_app/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def register(is_heartbeat: bool = False) -> bool:
"""
host, port = os.environ.get("SERVICES_BROKER").split("//")[1].split(":")
password = os.environ.get("BROKER_PASS", None)
if not password: password = None
r = redis.Redis(
host=host, port=int(port), db=SERVICE_DISCOVERY_DB, password=password
)
Expand Down Expand Up @@ -59,8 +60,10 @@ def unregister() -> None:
"""Un-register the service"""
try:
host, port = os.environ.get("SERVICES_BROKER").split("//")[1].split(":")
password = os.environ.get("BROKER_PASS", None)
if not password: password = None
r = redis.Redis(
host=host, port=int(port), db=SERVICE_DISCOVERY_DB, password="password"
host=host, port=int(port), db=SERVICE_DISCOVERY_DB, password=password
)
r.json().delete(f"service:{host_name}")
except Exception as error:
Expand Down
43 changes: 8 additions & 35 deletions celery_app/tasks.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,26 @@
import json
from typing import Union

import requests

from celery_app.celeryapp import celery

from punctuation.recasepunc import load_model, generate_predictions

MODEL = load_model()

@celery.task(name="punctuation_task", bind=True)
def punctuation_task(self, text: Union[str, list]):
"""punctuation_task do a synchronous call to the punctuation serving API"""
self.update_state(state="STARTED")
# Fetch model name
try:
result = requests.get(
"http://localhost:8081/models",
headers={
"accept": "application/json",
},
)
models = json.loads(result.text)
model_name = models["models"][0]["modelName"]
except Exception as error:
raise Exception("Failed to fetch model name") from error

unique = isinstance(text, str)

if isinstance(text, str):
if unique:
sentences = [text]
else:
sentences = text
punctuated_sentences = []
for i, sentence in enumerate(sentences):
self.update_state(state="STARTED", meta={"current": i, "total": len(sentences)})

result = requests.post(
f"http://localhost:8080/predictions/{model_name}",
headers={"content-type": "application/octet-stream"},
data=sentence.strip().encode("utf-8"),
)
if result.status_code == 200:
punctuated_sentence = result.text
else:
print("Failed to predict punctuation on sentence: >{sentence}<")
punctuated_sentence = sentence
# First letter in capital
if len(punctuated_sentence):
punctuated_sentence = punctuated_sentence[0].upper() + punctuated_sentence[1:]
punctuated_sentences.append(punctuated_sentence)
punctuated_sentences = generate_predictions(MODEL, sentences)

return (
punctuated_sentences[0]
if len(punctuated_sentences) == 1
if unique
else punctuated_sentences
)
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ services:
punctuation-service:
image: linto-platform-punctuation:latest
volumes:
- /path/to/your/model.mar/usr/src/app/model-store/punctuation.mar
- /path/to/your/model.mar:/usr/src/app/model-store/model
env_file: .env
deploy:
replicas: 1
Expand Down
13 changes: 4 additions & 9 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@

echo "RUNNING service"

export VERSION=$(awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //')

if [ -z "$SERVICE_MODE" ]
then
echo "ERROR: Must specify a serving mode: [ http | task ]"
exit -1
else
# Model serving
torchserve --start --ncs --ts-config /usr/src/app/config.properties
if [ "$SERVICE_MODE" = "http" ]
then
echo "Running http server"
Expand All @@ -19,25 +15,24 @@ else
elif [ "$SERVICE_MODE" == "task" ]
then
echo "Running celery worker"
/usr/src/app/wait-for-it.sh $(echo $SERVICES_BROKER | cut -d'/' -f 3) --timeout=20 --strict -- echo " $SERVICES_BROKER (Service Broker) is up"
/usr/src/app/wait-for-it.sh $(echo $SERVICES_BROKER | cut -d'/' -f 3) --timeout=20 --strict -- echo " $SERVICES_BROKER (Service Broker) is up" || exit $?
# MICRO SERVICE
## QUEUE NAME
QUEUE=$(python -c "from celery_app.register import queue; exit(queue())" 2>&1)
echo "Service set to $QUEUE"

## REGISTRATION
python -c "from celery_app.register import register; register()"
python -c "from celery_app.register import register; register()" # || exit $?
echo "Service registered"

## WORKER
celery --app=celery_app.celeryapp worker -n punctuation_$SERVICE_NAME@%h --queues=$QUEUE -c $CONCURRENCY
celery --app=celery_app.celeryapp worker --pool=solo -n punctuation_$SERVICE_NAME@%h --queues=$QUEUE -c $CONCURRENCY

## UNREGISTERING
python -c "from celery_app.register import unregister; unregister()"
python -c "from celery_app.register import unregister; unregister()" || exit $?
echo "Service unregistered"
else
echo "ERROR: Wrong serving command: $SERVICE_MODE"
exit -1
fi
torchserve --stop
fi
Loading

0 comments on commit 2910b7c

Please sign in to comment.