Merge pull request #244 from ahmetoner/upgrade-whisper-v20240930

Upgrade OpenAI Whisper to v20240930 (turbo)
ahmetoner · Oct 6, 2024 · b3daab8 · b3daab8 · easonwanger · Oct 9, 2024
2 parents 005c7e9 + a6169fb
commit b3daab8
Show file tree

Hide file tree

Showing 10 changed files with 72 additions and 386 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,15 @@ Changelog
 Unreleased
 ----------
 
+### Changed
+
+- Upgraded
+  - [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
+  - fastapi to v0.115.0
+  - uvicorn to v0.31.0
+  - tqdm to v4.66.5
+  - python-multipart to v0.0.12
+
 [1.5.0] (2024-07-04)
 --------------------
 

diff --git a/Dockerfile b/Dockerfile
@@ -63,4 +63,4 @@ RUN poetry install
 
 EXPOSE 9000
 
-ENTRYPOINT ["gunicorn", "--bind", "0.0.0.0:9000", "--workers", "1", "--timeout", "0", "app.webservice:app", "-k", "uvicorn.workers.UvicornWorker"]
+ENTRYPOINT ["whisper-asr-webservice"]
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
@@ -81,4 +81,4 @@ RUN $POETRY_VENV/bin/pip install torch==1.13.1+cu117 -f https://download.pytorch
 
 EXPOSE 9000
 
-CMD gunicorn --bind 0.0.0.0:9000 --workers 1 --timeout 0 app.webservice:app -k uvicorn.workers.UvicornWorker
+CMD whisper-asr-webservice
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ Whisper is a general-purpose speech recognition model. It is trained on a large
 
 Current release (v1.5.0) supports following whisper models:
 
-- [openai/whisper](https://github.com/openai/whisper)@[v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
+- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
 - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/1.0.3)
 
 ## Quick Usage

diff --git a/app/webservice.py b/app/webservice.py
@@ -1,11 +1,13 @@
 import importlib.metadata
 import os
 from os import path
-from typing import Annotated, BinaryIO, Union
+from typing import Annotated, BinaryIO, Optional, Union
 from urllib.parse import quote
 
+import click
 import ffmpeg
 import numpy as np
+import uvicorn
 from fastapi import FastAPI, File, Query, UploadFile, applications
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.responses import RedirectResponse, StreamingResponse
@@ -14,9 +16,9 @@
 
 ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper")
 if ASR_ENGINE == "faster_whisper":
-    from .faster_whisper.core import language_detection, transcribe
+    from app.faster_whisper.core import language_detection, transcribe
 else:
-    from .openai_whisper.core import language_detection, transcribe
+    from app.openai_whisper.core import language_detection, transcribe
 
 SAMPLE_RATE = 16000
 LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys())
@@ -122,3 +124,28 @@ def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
         out = file.read()
 
     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+@click.command()
+@click.option(
+    "-h",
+    "--host",
+    metavar="HOST",
+    default="0.0.0.0",
+    help="Host for the webservice (default: 0.0.0.0)",
+)
+@click.option(
+    "-p",
+    "--port",
+    metavar="PORT",
+    default=9000,
+    help="Port for the webservice (default: 9000)",
+)
+@click.version_option(version=projectMetadata["Version"])
+def start(
+    host: str,
+    port: Optional[int] = None
+):
+    uvicorn.run(app, host=host, port=port)
+
+if __name__ == "__main__":
+    start()
diff --git a/docs/build.md b/docs/build.md
@@ -24,7 +24,7 @@ poetry install
 Starting the Webservice:
 
 ```sh
-poetry run gunicorn --bind 0.0.0.0:9000 --workers 1 --timeout 0 app.webservice:app -k uvicorn.workers.UvicornWorker
+poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000
 ```
 
 ### Build

diff --git a/docs/environmental-variables.md b/docs/environmental-variables.md
@@ -15,7 +15,7 @@
 export ASR_MODEL=base
 ```
 
-Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large` (only OpenAI Whisper), `large-v1`, `large-v2` and `large-v3`.
+Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` (only OpenAI Whisper) and `large-v3-turbo` (only OpenAI Whisper).
 
 For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
 

diff --git a/docs/index.md b/docs/index.md
@@ -4,7 +4,7 @@ Whisper is a general-purpose speech recognition model. It is trained on a large
 
 Current release (v1.5.0) supports following whisper models:
 
-- [openai/whisper](https://github.com/openai/whisper)@[v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
+- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
 - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/1.0.3)
 
 ## Quick Usage