From 87532a52d629ea07763fd1fd17fbac9b7d55eabc Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Thu, 25 Jul 2024 13:35:14 +0000 Subject: [PATCH 1/9] Added endpoints, deployments, utils, storage --- aana_chat_with_video/alembic.ini | 116 ++++++++++ aana_chat_with_video/alembic/__init__.py | 0 aana_chat_with_video/alembic/env.py | 85 ++++++++ aana_chat_with_video/alembic/script.py.mako | 28 +++ .../alembic/versions/7da69bf375e7_init.py | 111 ++++++++++ aana_chat_with_video/app.py | 1 - aana_chat_with_video/configs/deployments.py | 97 ++++++--- aana_chat_with_video/configs/endpoints.py | 53 +++-- aana_chat_with_video/configs/settings.py | 9 +- .../endpoints/delete_video.py | 25 +++ .../endpoints/get_video_status.py | 26 +++ aana_chat_with_video/endpoints/index_video.py | 198 ++++++++++++++++++ .../endpoints/load_video_metadata.py | 26 +++ aana_chat_with_video/endpoints/video_chat.py | 82 ++++++++ aana_chat_with_video/storage/__init__.py | 0 .../storage/models/__init__.py | 18 ++ .../storage/models/extended_video.py | 53 +++++ .../storage/models/extended_video_caption.py | 52 +++++ .../models/extended_video_transcript.py | 55 +++++ .../storage/repository/__init__.py | 0 .../storage/repository/extended_video.py | 77 +++++++ .../repository/extended_video_caption.py | 103 +++++++++ .../repository/extended_video_transcript.py | 84 ++++++++ aana_chat_with_video/utils/core.py | 128 +++++++++++ 24 files changed, 1381 insertions(+), 46 deletions(-) create mode 100644 aana_chat_with_video/alembic.ini create mode 100644 aana_chat_with_video/alembic/__init__.py create mode 100644 aana_chat_with_video/alembic/env.py create mode 100644 aana_chat_with_video/alembic/script.py.mako create mode 100644 aana_chat_with_video/alembic/versions/7da69bf375e7_init.py create mode 100644 aana_chat_with_video/endpoints/delete_video.py create mode 100644 aana_chat_with_video/endpoints/get_video_status.py create mode 100644 aana_chat_with_video/endpoints/index_video.py create mode 100644 aana_chat_with_video/endpoints/load_video_metadata.py create mode 100644 aana_chat_with_video/endpoints/video_chat.py create mode 100644 aana_chat_with_video/storage/__init__.py create mode 100644 aana_chat_with_video/storage/models/__init__.py create mode 100644 aana_chat_with_video/storage/models/extended_video.py create mode 100644 aana_chat_with_video/storage/models/extended_video_caption.py create mode 100644 aana_chat_with_video/storage/models/extended_video_transcript.py create mode 100644 aana_chat_with_video/storage/repository/__init__.py create mode 100644 aana_chat_with_video/storage/repository/extended_video.py create mode 100644 aana_chat_with_video/storage/repository/extended_video_caption.py create mode 100644 aana_chat_with_video/storage/repository/extended_video_transcript.py diff --git a/aana_chat_with_video/alembic.ini b/aana_chat_with_video/alembic.ini new file mode 100644 index 0000000..7cba320 --- /dev/null +++ b/aana_chat_with_video/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. 
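+#
+# (A typical workflow with this configuration, assuming it is invoked from a
+# directory where `script_location = alembic` resolves:
+#   alembic revision --autogenerate -m "describe the change"
+#   alembic upgrade head
+# The SDK can also run migrations itself; see the migration_func wiring in
+# patch 2/9 below.)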
+ +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library that can be +# installed by adding `alembic[tz]` to the pip requirements +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further
+# detail and examples

+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+hooks = ruff
+ruff.type = exec
+ruff.executable = ruff
+ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/aana_chat_with_video/alembic/__init__.py b/aana_chat_with_video/alembic/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/aana_chat_with_video/alembic/env.py b/aana_chat_with_video/alembic/env.py
new file mode 100644
index 0000000..9d7ffc7
--- /dev/null
+++ b/aana_chat_with_video/alembic/env.py
@@ -0,0 +1,85 @@
+from logging.config import fileConfig
+
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+
+from aana.configs.settings import settings
+from aana.storage.models.base import BaseEntity
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+
+# Import all models to be included in the migration
+import aana.storage.models  # noqa: F401
+import aana_chat_with_video.storage.models  # noqa: F401
+
+target_metadata = BaseEntity.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    Modified to use our existing db config module.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    engine = settings.db_config.get_engine()
+    context.configure(
+        url=engine.url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+        render_as_batch=True,
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
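+
+    As in the offline path above, the engine URL comes from aana's settings
+    (settings.db_config.get_engine()) rather than from a sqlalchemy.url entry
+    in alembic.ini, so the app and its migrations share one database config.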
+ + """ + config_section = config.get_section(config.config_ini_section, {}) + engine = settings.db_config.get_engine() + config_section["sqlalchemy.url"] = engine.url + connectable = engine_from_config( + config_section, + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, render_as_batch=True + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/aana_chat_with_video/alembic/script.py.mako b/aana_chat_with_video/alembic/script.py.mako new file mode 100644 index 0000000..02849b8 --- /dev/null +++ b/aana_chat_with_video/alembic/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: str | None = ${repr(down_revision)} +branch_labels: str | Sequence[str] | None = ${repr(branch_labels)} +depends_on: str | Sequence[str] | None = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade database to this revision from previous.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade database from this revision to previous.""" + ${downgrades if downgrades else "pass"} diff --git a/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py b/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py new file mode 100644 index 0000000..277b042 --- /dev/null +++ b/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py @@ -0,0 +1,111 @@ +"""init + +Revision ID: 7da69bf375e7 +Revises: +Create Date: 2024-07-25 13:28:12.907560 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = '7da69bf375e7' +down_revision: str | None = None +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade database to this revision from previous.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('caption', + sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('model', sa.String(), nullable=False, comment='Name of model used to generate the caption'), + sa.Column('frame_id', sa.Integer(), nullable=False, comment='The 0-based frame id of video for caption'), + sa.Column('caption', sa.String(), nullable=False, comment='Frame caption'), + sa.Column('timestamp', sa.Float(), nullable=False, comment='Frame timestamp in seconds'), + sa.Column('caption_type', sa.String(), nullable=False, comment='The type of caption'), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), + sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_caption')) + ) + op.create_table('media', + sa.Column('id', sa.String(length=36), nullable=False, comment='Unique identifier for the media'), + sa.Column('media_type', sa.String(), nullable=False, comment='The type of media'), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), + sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_media')) + ) + op.create_table('tasks', + sa.Column('id', sa.UUID(), nullable=False, comment='Task ID'), + sa.Column('endpoint', sa.String(), nullable=False, comment='The endpoint to which the task is assigned'), + sa.Column('data', sa.PickleType(), nullable=False, comment='Data for the task'), + sa.Column('status', sa.Enum('CREATED', 'ASSIGNED', 'COMPLETED', 'RUNNING', 'FAILED', 'NOT_FINISHED', name='status'), nullable=False, comment='Status of the task'), + sa.Column('priority', sa.Integer(), nullable=False, comment='Priority of the task (0 is the lowest)'), + sa.Column('assigned_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True, comment='Timestamp when the task was assigned'), + sa.Column('completed_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True, comment='Timestamp when the task was completed'), + sa.Column('progress', sa.Float(), nullable=False, comment='Progress of the task in percentage'), + sa.Column('result', sa.JSON(), nullable=True, comment='Result of the task in JSON format'), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), + sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_tasks')) + ) + op.create_table('transcript', + sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('model', sa.String(), nullable=False, comment='Name of model used to generate transcript'), + sa.Column('transcript', sa.String(), nullable=False, comment='Full text transcript of media'), + sa.Column('segments', sa.JSON(), nullable=False, comment='Segments of the transcript'), + sa.Column('language', sa.String(), nullable=False, comment='Language of the transcript as predicted by model'), + sa.Column('language_confidence', 
sa.Float(), nullable=False, comment='Confidence score of language prediction'), + sa.Column('transcript_type', sa.String(), nullable=False, comment='The type of transcript'), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), + sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_transcript')) + ) + op.create_table('video', + sa.Column('id', sa.String(length=36), nullable=False), + sa.Column('path', sa.String(), nullable=True, comment='Path'), + sa.Column('url', sa.String(), nullable=True, comment='URL'), + sa.Column('title', sa.String(), nullable=True, comment='Title'), + sa.Column('description', sa.String(), nullable=True, comment='Description'), + sa.ForeignKeyConstraint(['id'], ['media.id'], name=op.f('fk_video_id_media')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_video')) + ) + op.create_table('extended_video', + sa.Column('id', sa.String(length=36), nullable=False), + sa.Column('duration', sa.Float(), nullable=True, comment='Video duration in seconds'), + sa.Column('status', sa.Enum('CREATED', 'RUNNING', 'COMPLETED', 'FAILED', name='videoprocessingstatus'), nullable=False, comment='Processing status'), + sa.ForeignKeyConstraint(['id'], ['video.id'], name=op.f('fk_extended_video_id_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video')) + ) + op.create_table('extended_video_caption', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), + sa.ForeignKeyConstraint(['id'], ['caption.id'], name=op.f('fk_extended_video_caption_id_caption')), + sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_caption_media_id_extended_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_caption')) + ) + op.create_table('extended_video_transcript', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), + sa.ForeignKeyConstraint(['id'], ['transcript.id'], name=op.f('fk_extended_video_transcript_id_transcript')), + sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_transcript_media_id_extended_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_transcript')) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade database from this revision to previous.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('extended_video_transcript') + op.drop_table('extended_video_caption') + op.drop_table('extended_video') + op.drop_table('video') + op.drop_table('transcript') + op.drop_table('tasks') + op.drop_table('media') + op.drop_table('caption') + # ### end Alembic commands ### diff --git a/aana_chat_with_video/app.py b/aana_chat_with_video/app.py index e6e2a83..3ae7ccc 100644 --- a/aana_chat_with_video/app.py +++ b/aana_chat_with_video/app.py @@ -1,5 +1,4 @@ from aana.sdk import AanaSDK - from aana_chat_with_video.configs.deployments import deployments from aana_chat_with_video.configs.endpoints import endpoints diff --git a/aana_chat_with_video/configs/deployments.py b/aana_chat_with_video/configs/deployments.py index 3020f53..444c9cd 100644 --- a/aana_chat_with_video/configs/deployments.py +++ b/aana_chat_with_video/configs/deployments.py @@ -1,25 +1,74 @@ -deployments: list[dict] = [] +from aana.core.models.sampling import SamplingParams +from aana.core.models.types import Dtype +from aana.deployments.hf_blip2_deployment import HFBlip2Config, HFBlip2Deployment +from aana.deployments.vad_deployment import VadConfig, VadDeployment +from aana.deployments.vllm_deployment import VLLMConfig, VLLMDeployment +from aana.deployments.whisper_deployment import ( + WhisperComputeType, + WhisperConfig, + WhisperDeployment, + WhisperModelSize, +) -# Add deployments for models that you want to deploy here. -# -# For example: -# from aana.deployments.whisper_deployment import ( -# WhisperComputeType, -# WhisperConfig, -# WhisperDeployment, -# WhisperModelSize, -# ) -# asr_deployment = WhisperDeployment.options( -# num_replicas=1, -# ray_actor_options={"num_gpus": 0.1}, -# user_config=WhisperConfig( -# model_size=WhisperModelSize.MEDIUM, -# compute_type=WhisperComputeType.FLOAT16, -# ).model_dump(mode="json"), -# ) -# deployments.append({"name": "asr_deployment", "instance": asr_deployment}) -# -# You can use predefined deployments from the Aana SDK or create your own. -# See https://github.com/mobiusml/aana_sdk/blob/main/docs/integrations.md for the list of predefined deployments. -# -# If you want to create your own deployment, put your deployment classes in a separate files in the `deployments` directory and import them here. 
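Note: the GPU fractions requested below (0.25 + 0.05 + 0.25 + 0.45) sum to 1.0, so the four deployments are sized to share a single GPU. Each deployment is registered under its name and is reached from endpoints by that name. A minimal sketch of the lookup pattern (the same calls appear in index_video.py later in this patch; `audio` and `whisper_params` are assumed to be prepared by the caller):

    handle = await AanaDeploymentHandle.create("asr_deployment")
    async for chunk in handle.transcribe_stream(audio=audio, params=whisper_params):
        ...  # each chunk carries "transcription", "segments", and "transcription_info"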
+deployments: list[dict] = [ + { + "name": "asr_deployment", + "instance": WhisperDeployment.options( + num_replicas=1, + max_ongoing_requests=1000, + ray_actor_options={"num_gpus": 0.25}, + user_config=WhisperConfig( + model_size=WhisperModelSize.MEDIUM, + compute_type=WhisperComputeType.FLOAT16, + ).model_dump(mode="json"), + ), + }, + { + "name": "vad_deployment", + "instance": VadDeployment.options( + num_replicas=1, + max_ongoing_requests=1000, + ray_actor_options={"num_gpus": 0.05}, + user_config=VadConfig( + model=( + "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/" + "0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin" + ), + onset=0.5, + sample_rate=16000, + ).model_dump(mode="json"), + ), + }, + { + "name": "captioning_deployment", + "instance": HFBlip2Deployment.options( + num_replicas=1, + max_ongoing_requests=1000, + ray_actor_options={"num_gpus": 0.25}, + user_config=HFBlip2Config( + model="Salesforce/blip2-opt-2.7b", + dtype=Dtype.FLOAT16, + batch_size=2, + num_processing_threads=2, + ).model_dump(mode="json"), + ), + }, + { + "name": "llm_deployment", + "instance": VLLMDeployment.options( + num_replicas=1, + ray_actor_options={"num_gpus": 0.45}, + user_config=VLLMConfig( + model="internlm/internlm2_5-7b-chat", + dtype=Dtype.AUTO, + gpu_memory_reserved=30000, + max_model_len=50000, + enforce_eager=True, + default_sampling_params=SamplingParams( + temperature=0.0, top_p=1.0, top_k=-1, max_tokens=1024 + ), + engine_args={"trust_remote_code": True}, + ).model_dump(mode="json"), + ), + }, +] diff --git a/aana_chat_with_video/configs/endpoints.py b/aana_chat_with_video/configs/endpoints.py index c0bf4e0..66afde9 100644 --- a/aana_chat_with_video/configs/endpoints.py +++ b/aana_chat_with_video/configs/endpoints.py @@ -1,17 +1,38 @@ -endpoints: list[dict] = [] +from aana_chat_with_video.endpoints.delete_video import DeleteVideoEndpoint +from aana_chat_with_video.endpoints.get_video_status import GetVideoStatusEndpoint +from aana_chat_with_video.endpoints.index_video import IndexVideoEndpoint +from aana_chat_with_video.endpoints.load_video_metadata import LoadVideoMetadataEndpoint +from aana_chat_with_video.endpoints.video_chat import VideoChatEndpoint -# Add your endpoints here. -# -# For example: -# endpoints.append( -# { -# "name": "predict", -# "path": "/predict", -# "summary": "Predict the class of an image.", -# "endpoint_cls": PredictEndpoint, -# } -# ) -# -# Endpoints can be created by inheriting from the `Endpoint` class. -# Put your endpoint classes in a separate files in the `endpoints` directory and import them here. -# See https://github.com/mobiusml/aana_sdk/tree/main?tab=readme-ov-file#endpoints in how to create endpoints. 
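Note: once the app is running, each entry below is served as an HTTP route. A hedged usage sketch, assuming the Aana SDK's default serving conventions (localhost, port 8000, and a form-encoded `body` field carrying the JSON payload; adjust to your deployment):

    import json

    import requests

    response = requests.post(
        "http://127.0.0.1:8000/video/metadata",
        data={"body": json.dumps({"media_id": "my_video_id"})},
    )
    print(response.json())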
+endpoints: list[dict] = [ + { + "name": "index_video_stream", + "path": "/video/index_stream", + "summary": "Index a video and return the captions and transcriptions (streaming)", + "endpoint_cls": IndexVideoEndpoint, + }, + { + "name": "video_metadata", + "path": "/video/metadata", + "summary": "Load video metadata", + "endpoint_cls": LoadVideoMetadataEndpoint, + }, + { + "name": "video_chat_stream", + "path": "/video/chat_stream", + "summary": "Chat with video (streaming)", + "endpoint_cls": VideoChatEndpoint, + }, + { + "name": "video_status", + "path": "/video/status", + "summary": "Get video status", + "endpoint_cls": GetVideoStatusEndpoint, + }, + { + "name": "delete_media", + "path": "/video/delete", + "summary": "Delete video", + "endpoint_cls": DeleteVideoEndpoint, + }, +] diff --git a/aana_chat_with_video/configs/settings.py b/aana_chat_with_video/configs/settings.py index 01d473c..cae097c 100644 --- a/aana_chat_with_video/configs/settings.py +++ b/aana_chat_with_video/configs/settings.py @@ -3,11 +3,10 @@ class Settings(AanaSettings): """A pydantic model for App settings.""" - # Add your custom settings here - # Then, you can access them in your app like this: - # from aana_chat_with_video.configs.settings import settings - # settings.custom_property - pass + + asr_model_name: str = "whisper_medium" + captioning_model_name: str = "hf_blip2_opt_2_7b" + max_video_len: int = 60 * 20 # 20 minutes settings = Settings() diff --git a/aana_chat_with_video/endpoints/delete_video.py b/aana_chat_with_video/endpoints/delete_video.py new file mode 100644 index 0000000..09f602d --- /dev/null +++ b/aana_chat_with_video/endpoints/delete_video.py @@ -0,0 +1,25 @@ +from typing import TypedDict + +from aana.api.api_generation import Endpoint +from aana.core.models.media import MediaId +from aana.storage.repository.extended_video import ExtendedVideoRepository + + +class DeleteVideoOutput(TypedDict): + """The output of the delete media endpoint.""" + + media_id: MediaId + + +class DeleteVideoEndpoint(Endpoint): + """Delete video endpoint.""" + + async def initialize(self): + """Initialize the endpoint.""" + await super().initialize() + self.video_repo = ExtendedVideoRepository(self.session) + + async def run(self, media_id: MediaId) -> DeleteVideoOutput: + """Delete media.""" + self.video_repo.delete(media_id) + return DeleteVideoOutput(media_id=media_id) diff --git a/aana_chat_with_video/endpoints/get_video_status.py b/aana_chat_with_video/endpoints/get_video_status.py new file mode 100644 index 0000000..4bb3566 --- /dev/null +++ b/aana_chat_with_video/endpoints/get_video_status.py @@ -0,0 +1,26 @@ +from typing import TypedDict + +from aana.api.api_generation import Endpoint +from aana.core.models.media import MediaId +from aana.core.models.video import VideoStatus +from aana.storage.repository.extended_video import ExtendedVideoRepository + + +class VideoStatusOutput(TypedDict): + """The output of the video status endpoint.""" + + status: VideoStatus + + +class GetVideoStatusEndpoint(Endpoint): + """Get video status endpoint.""" + + async def initialize(self): + """Initialize the endpoint.""" + await super().initialize() + self.video_repo = ExtendedVideoRepository(self.session) + + async def run(self, media_id: MediaId) -> VideoStatusOutput: + """Load video metadata.""" + video_status = self.video_repo.get_status(media_id) + return VideoStatusOutput(status=video_status) diff --git a/aana_chat_with_video/endpoints/index_video.py b/aana_chat_with_video/endpoints/index_video.py new file mode 100644 
index 0000000..314fe00
--- /dev/null
+++ b/aana_chat_with_video/endpoints/index_video.py
@@ -0,0 +1,198 @@
+from collections.abc import AsyncGenerator
+from typing import TYPE_CHECKING, Annotated, TypedDict
+
+from pydantic import Field
+
+from aana.api.api_generation import Endpoint
+from aana.core.models.asr import (
+    AsrSegments,
+    AsrTranscription,
+    AsrTranscriptionInfo,
+)
+from aana.core.models.media import MediaId
+from aana.core.models.vad import VadParams
+from aana.core.models.video import VideoInput, VideoMetadata, VideoParams
+from aana.core.models.whisper import BatchedWhisperParams
+from aana.deployments.aana_deployment_handle import AanaDeploymentHandle
+from aana.exceptions.db import MediaIdAlreadyExistsException
+from aana.exceptions.io import VideoTooLongException
+from aana.integrations.external.decord import generate_frames, get_video_duration
+from aana.integrations.external.yt_dlp import download_video, get_video_metadata
+from aana.processors.remote import run_remote
+from aana.processors.video import extract_audio
+from aana.storage.models.extended_video import VideoProcessingStatus
+from aana.storage.repository.extended_video import ExtendedVideoRepository
+from aana.storage.repository.extended_video_caption import (
+    ExtendedVideoCaptionRepository,
+)
+from aana.storage.repository.extended_video_transcript import (
+    ExtendedVideoTranscriptRepository,
+)
+from aana_chat_with_video.configs.settings import settings
+
+if TYPE_CHECKING:
+    from aana.core.models.audio import Audio
+    from aana.core.models.video import Video
+
+
+class IndexVideoOutput(TypedDict):
+    """The output of the index video endpoint."""
+
+    media_id: MediaId
+    metadata: VideoMetadata
+    transcription: AsrTranscription
+    transcription_info: AsrTranscriptionInfo
+    segments: AsrSegments
+
+    captions: Annotated[list[str], Field(..., description="Captions")]
+    timestamps: Annotated[
+        list[float], Field(..., description="Timestamps for each caption in seconds")
+    ]
+
+    transcription_id: Annotated[int, Field(..., description="Transcription Id")]
+    caption_ids: Annotated[list[int], Field(..., description="Caption Ids")]
+
+
+class IndexVideoEndpoint(Endpoint):
+    """Index video endpoint: transcribes the audio and captions extracted frames."""
+
+    async def initialize(self):
+        """Initialize the endpoint."""
+        await super().initialize()
+        self.asr_handle = await AanaDeploymentHandle.create("asr_deployment")
+        self.vad_handle = await AanaDeploymentHandle.create("vad_deployment")
+        self.captioning_handle = await AanaDeploymentHandle.create(
+            "captioning_deployment"
+        )
+        self.extended_video_repo = ExtendedVideoRepository(self.session)
+        self.transcript_repo = ExtendedVideoTranscriptRepository(self.session)
+        self.caption_repo = ExtendedVideoCaptionRepository(self.session)
+
+    async def run(  # noqa: C901
+        self,
+        video: VideoInput,
+        video_params: VideoParams,
+        whisper_params: BatchedWhisperParams,
+        vad_params: VadParams,
+    ) -> AsyncGenerator[IndexVideoOutput, None]:
+        """Index the video: transcribe the audio and caption extracted frames."""
+        media_id = video.media_id
+        if self.extended_video_repo.check_media_exists(media_id):
+            raise MediaIdAlreadyExistsException(table_name="media", media_id=media_id)
+
+        video_duration = None
+        if video.url is not None:
+            video_metadata = get_video_metadata(video.url)
+            video_duration = video_metadata.duration
+
+        # precheck the max video length before actually downloading the video, if possible
+        if video_duration and video_duration > settings.max_video_len:
+            raise VideoTooLongException(
+                video=video,
+                video_len=video_duration,
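+                # NOTE: this precheck uses URL metadata only; the duration is
+                # re-checked after download once the exact value is known.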
+                max_len=settings.max_video_len,
+            )
+
+        video_obj: Video = await run_remote(download_video)(video_input=video)
+        if video_duration is None:
+            video_duration = await run_remote(get_video_duration)(video=video_obj)
+
+        if video_duration > settings.max_video_len:
+            raise VideoTooLongException(
+                video=video_obj,
+                video_len=video_duration,
+                max_len=settings.max_video_len,
+            )
+
+        self.extended_video_repo.save(video=video_obj, duration=video_duration)
+        yield {
+            "media_id": media_id,
+            "metadata": VideoMetadata(
+                title=video_obj.title,
+                description=video_obj.description,
+                duration=video_duration,
+            ),
+        }
+
+        try:
+            self.extended_video_repo.update_status(
+                media_id, VideoProcessingStatus.RUNNING
+            )
+            audio: Audio = extract_audio(video=video_obj)
+
+            # TODO: Update once batched whisper PR is merged
+            # vad_output = await self.vad_handle.asr_preprocess_vad(
+            #     audio=audio, params=vad_params
+            # )
+            # vad_segments = vad_output["segments"]
+
+            transcription_list = []
+            segments_list = []
+            transcription_info_list = []
+            async for whisper_output in self.asr_handle.transcribe_stream(
+                audio=audio, params=whisper_params
+            ):
+                transcription_list.append(whisper_output["transcription"])
+                segments_list.append(whisper_output["segments"])
+                transcription_info_list.append(whisper_output["transcription_info"])
+                yield {
+                    "transcription": whisper_output["transcription"],
+                    "segments": whisper_output["segments"],
+                    "transcription_info": whisper_output["transcription_info"],
+                }
+            transcription = sum(transcription_list, AsrTranscription())
+            segments = sum(segments_list, AsrSegments())
+            transcription_info = sum(transcription_info_list, AsrTranscriptionInfo())
+
+            captions = []
+            timestamps = []
+            frame_ids = []
+
+            async for frames_dict in run_remote(generate_frames)(
+                video=video_obj, params=video_params
+            ):
+                if len(frames_dict["frames"]) == 0:
+                    break
+
+                timestamps.extend(frames_dict["timestamps"])
+                frame_ids.extend(frames_dict["frame_ids"])
+
+                captioning_output = await self.captioning_handle.generate_batch(
+                    images=frames_dict["frames"]
+                )
+                captions.extend(captioning_output["captions"])
+
+                yield {
+                    "captions": captioning_output["captions"],
+                    "timestamps": frames_dict["timestamps"],
+                }
+
+            transcription_entity = self.transcript_repo.save(
+                model_name=settings.asr_model_name,
+                media_id=video_obj.media_id,
+                transcription=transcription,
+                segments=segments,
+                transcription_info=transcription_info,
+            )
+
+            caption_entities = self.caption_repo.save_all(
+                model_name=settings.captioning_model_name,
+                media_id=video_obj.media_id,
+                captions=captions,
+                timestamps=timestamps,
+                frame_ids=frame_ids,
+            )
+
+            yield {
+                "transcription_id": transcription_entity.id,
+                "caption_ids": [c.id for c in caption_entities],
+            }
+        except BaseException:
+            self.extended_video_repo.update_status(
+                media_id, VideoProcessingStatus.FAILED
+            )
+            raise
+        else:
+            self.extended_video_repo.update_status(
+                media_id, VideoProcessingStatus.COMPLETED
+            )
diff --git a/aana_chat_with_video/endpoints/load_video_metadata.py b/aana_chat_with_video/endpoints/load_video_metadata.py
new file mode 100644
index 0000000..b038d92
--- /dev/null
+++ b/aana_chat_with_video/endpoints/load_video_metadata.py
@@ -0,0 +1,26 @@
+from typing import TypedDict
+
+from aana.api.api_generation import Endpoint
+from aana.core.models.media import MediaId
+from aana.core.models.video import VideoMetadata
+from aana.storage.repository.extended_video import ExtendedVideoRepository
+
+
+class LoadVideoMetadataOutput(TypedDict):
+    """The output of the load video
metadata endpoint.""" + + metadata: VideoMetadata + + +class LoadVideoMetadataEndpoint(Endpoint): + """Load video metadata endpoint.""" + + async def initialize(self): + """Initialize the endpoint.""" + await super().initialize() + self.video_repo = ExtendedVideoRepository(self.session) + + async def run(self, media_id: MediaId) -> LoadVideoMetadataOutput: + """Load video metadata.""" + video_metadata = self.video_repo.get_metadata(media_id) + return LoadVideoMetadataOutput(metadata=video_metadata) diff --git a/aana_chat_with_video/endpoints/video_chat.py b/aana_chat_with_video/endpoints/video_chat.py new file mode 100644 index 0000000..71c3413 --- /dev/null +++ b/aana_chat_with_video/endpoints/video_chat.py @@ -0,0 +1,82 @@ +import json +from collections.abc import AsyncGenerator +from typing import Annotated, TypedDict + +from pydantic import Field + +from aana.api.api_generation import Endpoint +from aana.core.models.chat import Question +from aana.core.models.media import MediaId +from aana.core.models.sampling import SamplingParams +from aana.deployments.aana_deployment_handle import AanaDeploymentHandle +from aana.exceptions.db import UnfinishedVideoException +from aana.storage.models.extended_video import VideoProcessingStatus +from aana.storage.repository.extended_video import ExtendedVideoRepository +from aana.storage.repository.extended_video_caption import ( + ExtendedVideoCaptionRepository, +) +from aana.storage.repository.extended_video_transcript import ( + ExtendedVideoTranscriptRepository, +) +from aana_chat_with_video.configs.settings import settings +from aana_chat_with_video.utils.core import generate_combined_timeline, generate_dialog + + +class VideoChatEndpointOutput(TypedDict): + """Video chat endpoint output.""" + + completion: Annotated[str, Field(description="Generated text.")] + + +class VideoChatEndpoint(Endpoint): + """Video chat endpoint.""" + + async def initialize(self): + """Initialize the endpoint.""" + await super().initialize() + self.llm_handle = await AanaDeploymentHandle.create("llm_deployment") + self.transcript_repo = ExtendedVideoTranscriptRepository(self.session) + self.caption_repo = ExtendedVideoCaptionRepository(self.session) + self.video_repo = ExtendedVideoRepository(self.session) + + async def run( + self, media_id: MediaId, question: Question, sampling_params: SamplingParams + ) -> AsyncGenerator[VideoChatEndpointOutput, None]: + """Run the video chat endpoint.""" + # check to see if video already processed + video_status = self.video_repo.get_status(media_id) + if video_status != VideoProcessingStatus.COMPLETED: + raise UnfinishedVideoException( + media_id=media_id, + status=video_status, + message=f"The video data is not available, status: {video_status}", + ) + + transcription_output = self.transcript_repo.get_transcript( + model_name=settings.asr_model_name, media_id=media_id + ) + + captions_output = self.caption_repo.get_captions( + model_name=settings.captioning_model_name, media_id=media_id + ) + + video_metadata = self.video_repo.get_metadata(media_id) + + timeline_output = generate_combined_timeline( + transcription_segments=transcription_output["segments"], + captions=captions_output["captions"], + caption_timestamps=captions_output["timestamps"], + ) + timeline_json = json.dumps( + timeline_output["timeline"], indent=4, separators=(",", ": ") + ) + + dialog = generate_dialog( + metadata=video_metadata, + timeline=timeline_json, + question=question, + ) + async for item in self.llm_handle.chat_stream( + dialog=dialog, 
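+            # chat_stream yields partial completions; each chunk is forwarded
+            # to the client below as soon as it arrives.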
sampling_params=sampling_params + ): + yield {"completion": item["text"]} diff --git a/aana_chat_with_video/storage/__init__.py b/aana_chat_with_video/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/aana_chat_with_video/storage/models/__init__.py b/aana_chat_with_video/storage/models/__init__.py new file mode 100644 index 0000000..f9a15af --- /dev/null +++ b/aana_chat_with_video/storage/models/__init__.py @@ -0,0 +1,18 @@ +# ruff: noqa: F401 +# We need to import all db models here and, other than in the class definitions +# themselves, only import them from aana.models.db directly. The reason for +# this is the way SQLAlchemy's declarative base works. You can use forward +# references like `parent = reference("Parent", backreferences="child")`, but the +# forward reference needs to have been resolved before the first constructor +# is called so that SqlAlchemy "knows" about it. +# See: +# https://docs.pylonsproject.org/projects/pyramid_cookbook/en/latest/database/sqlalchemy.html#importing-all-sqlalchemy-models +# (even if not using Pyramid) + +from aana_chat_with_video.storage.models.extended_video import ExtendedVideoEntity +from aana_chat_with_video.storage.models.extended_video_caption import ( + ExtendedVideoCaptionEntity, +) +from aana_chat_with_video.storage.models.extended_video_transcript import ( + ExtendedVideoTranscriptEntity, +) diff --git a/aana_chat_with_video/storage/models/extended_video.py b/aana_chat_with_video/storage/models/extended_video.py new file mode 100644 index 0000000..dcebd48 --- /dev/null +++ b/aana_chat_with_video/storage/models/extended_video.py @@ -0,0 +1,53 @@ +from enum import Enum + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from aana.core.models.media import MediaId +from aana.storage.models.video import VideoEntity +from aana_chat_with_video.storage.models.extended_video_caption import ( + ExtendedVideoCaptionEntity, +) +from aana_chat_with_video.storage.models.extended_video_transcript import ( + ExtendedVideoTranscriptEntity, +) + + +class VideoProcessingStatus(str, Enum): + """Enum for video status.""" + + CREATED = "created" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + + +class ExtendedVideoEntity(VideoEntity): + """ORM class for videos with additional metadata.""" + + __tablename__ = "extended_video" + + id: Mapped[MediaId] = mapped_column(ForeignKey("video.id"), primary_key=True) + duration: Mapped[float | None] = mapped_column(comment="Video duration in seconds") + status: Mapped[VideoProcessingStatus] = mapped_column( + nullable=False, + default=VideoProcessingStatus.CREATED, + comment="Processing status", + ) + + captions: Mapped[list[ExtendedVideoCaptionEntity]] = relationship( + "ExtendedVideoCaptionEntity", + back_populates="video", + cascade="all, delete", + uselist=True, + ) + transcript: Mapped[list[ExtendedVideoTranscriptEntity]] = relationship( + "ExtendedVideoTranscriptEntity", + back_populates="video", + cascade="all, delete", + uselist=True, + ) + + __mapper_args__ = { # noqa: RUF012 + "polymorphic_identity": "extended_video", + } diff --git a/aana_chat_with_video/storage/models/extended_video_caption.py b/aana_chat_with_video/storage/models/extended_video_caption.py new file mode 100644 index 0000000..56dd873 --- /dev/null +++ b/aana_chat_with_video/storage/models/extended_video_caption.py @@ -0,0 +1,52 @@ +from __future__ import annotations # Let classes use themselves in type annotations + +import typing + +from 
sqlalchemy import ForeignKey +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from aana.core.models.media import MediaId # noqa: TCH001 +from aana.storage.models.caption import CaptionEntity + +if typing.TYPE_CHECKING: + from aana.core.models.captions import Caption + + +class ExtendedVideoCaptionEntity(CaptionEntity): + """ORM model for video captions in extended video.""" + + __tablename__ = "extended_video_caption" + + id: Mapped[int] = mapped_column(ForeignKey("caption.id"), primary_key=True) + + media_id: Mapped[MediaId] = mapped_column( + ForeignKey("extended_video.id"), + nullable=False, + comment="Foreign key to video table", + ) + + video = relationship( + "ExtendedVideoEntity", back_populates="captions", uselist=False + ) + + __mapper_args__ = { # noqa: RUF012 + "polymorphic_identity": "extended_video_caption", + } + + @classmethod + def from_caption_output( + cls, + model_name: str, + caption: Caption, + media_id: MediaId, + frame_id: int, + timestamp: float, + ) -> ExtendedVideoCaptionEntity: + """Converts a Caption pydantic model to a ExtendedVideoCaptionEntity.""" + caption_entity = CaptionEntity.from_caption_output( + model_name=model_name, + frame_id=frame_id, + timestamp=timestamp, + caption=caption, + ) + return cls.from_parent(caption_entity, media_id=media_id) diff --git a/aana_chat_with_video/storage/models/extended_video_transcript.py b/aana_chat_with_video/storage/models/extended_video_transcript.py new file mode 100644 index 0000000..cbcf15b --- /dev/null +++ b/aana_chat_with_video/storage/models/extended_video_transcript.py @@ -0,0 +1,55 @@ +from __future__ import annotations # Let classes use themselves in type annotations + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from aana.core.models.media import MediaId # noqa: TCH001 +from aana.storage.models.transcript import TranscriptEntity + +if TYPE_CHECKING: + from aana.core.models.asr import ( + AsrSegments, + AsrTranscription, + AsrTranscriptionInfo, + ) + + +class ExtendedVideoTranscriptEntity(TranscriptEntity): + """ORM class for extended video transcripts.""" + + __tablename__ = "extended_video_transcript" + + id: Mapped[int] = mapped_column(ForeignKey("transcript.id"), primary_key=True) + media_id: Mapped[MediaId] = mapped_column( + ForeignKey("extended_video.id"), + nullable=False, + comment="Foreign key to video table", + ) + + video = relationship( + "ExtendedVideoEntity", back_populates="transcript", uselist=False + ) + + __mapper_args__ = { # noqa: RUF012 + "polymorphic_identity": "extended_video_transcript", + } + + @classmethod + def from_asr_output( + cls, + model_name: str, + media_id: MediaId, + info: AsrTranscriptionInfo, + transcription: AsrTranscription, + segments: AsrSegments, + ) -> ExtendedVideoTranscriptEntity: + """Converts an AsrTranscriptionInfo and AsrTranscription to a single Transcript entity.""" + transcript_entity = super().from_asr_output( + model_name=model_name, + info=info, + transcription=transcription, + segments=segments, + ) + return cls.from_parent(transcript_entity, media_id=media_id) diff --git a/aana_chat_with_video/storage/repository/__init__.py b/aana_chat_with_video/storage/repository/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/aana_chat_with_video/storage/repository/extended_video.py b/aana_chat_with_video/storage/repository/extended_video.py new file mode 100644 index 0000000..55a49a7 --- /dev/null +++ 
b/aana_chat_with_video/storage/repository/extended_video.py @@ -0,0 +1,77 @@ +from sqlalchemy.orm import Session + +from aana.core.models.media import MediaId +from aana.core.models.video import Video, VideoMetadata +from aana.storage.repository.video import VideoRepository +from aana_chat_with_video.storage.models.extended_video import ( + ExtendedVideoEntity, + VideoProcessingStatus, +) + + +class ExtendedVideoRepository(VideoRepository[ExtendedVideoEntity]): + """Repository for videos with additional metadata.""" + + def __init__(self, session: Session): + """Constructor.""" + super().__init__(session, ExtendedVideoEntity) + + def save(self, video: Video, duration: float | None = None) -> dict: + """Saves a video to datastore. + + Args: + video (Video): The video object. + duration (float): the duration of the video object + + Returns: + dict: The dictionary with video and media IDs. + """ + video_entity = ExtendedVideoEntity( + id=video.media_id, + path=str(video.path), + url=video.url, + title=video.title, + description=video.description, + duration=duration, + ) + self.create(video_entity) + return video_entity + + def get_status(self, media_id: MediaId) -> VideoProcessingStatus: + """Get the status of a video. + + Args: + media_id (str): The media ID. + + Returns: + VideoProcessingStatus: The status of the video. + """ + entity: ExtendedVideoEntity = self.read(media_id) + return entity.status + + def update_status(self, media_id: MediaId, status: VideoProcessingStatus): + """Update the status of a video. + + Args: + media_id (str): The media ID. + status (VideoProcessingStatus): The status of the video. + """ + entity: ExtendedVideoEntity = self.read(media_id) + entity.status = status + self.session.commit() + + def get_metadata(self, media_id: MediaId) -> VideoMetadata: + """Get the metadata of a video. + + Args: + media_id (MediaId): The media ID. + + Returns: + VideoMetadata: The video metadata. + """ + entity: ExtendedVideoEntity = self.read(media_id) + return VideoMetadata( + title=entity.title, + description=entity.description, + duration=entity.duration, + ) diff --git a/aana_chat_with_video/storage/repository/extended_video_caption.py b/aana_chat_with_video/storage/repository/extended_video_caption.py new file mode 100644 index 0000000..04a1377 --- /dev/null +++ b/aana_chat_with_video/storage/repository/extended_video_caption.py @@ -0,0 +1,103 @@ +from sqlalchemy.orm import Session + +from aana.core.models.captions import Caption, CaptionsList +from aana.core.models.media import MediaId +from aana.storage.repository.base import BaseRepository +from aana_chat_with_video.storage.models.extended_video_caption import ( + ExtendedVideoCaptionEntity, +) + + +class ExtendedVideoCaptionRepository(BaseRepository[ExtendedVideoCaptionEntity]): + """Repository for Captions.""" + + def __init__(self, session: Session): + """Constructor.""" + super().__init__(session, ExtendedVideoCaptionEntity) + + def save( + self, + model_name: str, + media_id: MediaId, + caption: Caption, + timestamp: float, + frame_id: int, + ): + """Save a caption. + + Args: + model_name (str): The name of the model used to generate the caption. + media_id (MediaId): The media ID. + caption (Caption): The caption. + timestamp (float): The timestamp. + frame_id (int): The frame ID. 
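+
+        Returns:
+            ExtendedVideoCaptionEntity: The saved caption entity.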
+ """ + entity = ExtendedVideoCaptionEntity.from_caption_output( + model_name=model_name, + media_id=media_id, + frame_id=frame_id, + timestamp=timestamp, + caption=caption, + ) + self.create(entity) + return entity + + def save_all( + self, + model_name: str, + media_id: MediaId, + captions: CaptionsList, + timestamps: list[float], + frame_ids: list[int], + ) -> list[ExtendedVideoCaptionEntity]: + """Save captions. + + Args: + model_name (str): The name of the model used to generate the captions. + media_id (MediaId): the media ID of the video. + captions (CaptionsList): The captions. + timestamps (list[float]): The timestamps. + frame_ids (list[int]): The frame IDs. + + Returns: + list[ExtendedVideoCaptionEntity]: The list of caption entities. + """ + entities = [ + ExtendedVideoCaptionEntity.from_caption_output( + model_name=model_name, + media_id=media_id, + frame_id=frame_id, + timestamp=timestamp, + caption=caption, + ) + for caption, timestamp, frame_id in zip( + captions, timestamps, frame_ids, strict=True + ) + ] + results = self.create_multiple(entities) + return results + + def get_captions(self, model_name: str, media_id: MediaId) -> dict: + """Get the captions for a video. + + Args: + model_name (str): The model name. + media_id (MediaId): The media ID. + + Returns: + dict: The dictionary with the captions, timestamps, and frame IDs. + """ + entities: list[ExtendedVideoCaptionEntity] = ( + self.session.query(self.model_class) + .filter_by(media_id=media_id, model=model_name) + .order_by(self.model_class.frame_id) + .all() + ) + captions = [c.caption for c in entities] + timestamps = [c.timestamp for c in entities] + frame_ids = [c.frame_id for c in entities] + return { + "captions": captions, + "timestamps": timestamps, + "frame_ids": frame_ids, + } diff --git a/aana_chat_with_video/storage/repository/extended_video_transcript.py b/aana_chat_with_video/storage/repository/extended_video_transcript.py new file mode 100644 index 0000000..bfed0e6 --- /dev/null +++ b/aana_chat_with_video/storage/repository/extended_video_transcript.py @@ -0,0 +1,84 @@ +from sqlalchemy.orm import Session + +from aana.core.models.asr import ( + AsrSegment, + AsrSegments, + AsrTranscription, + AsrTranscriptionInfo, +) +from aana.core.models.media import MediaId +from aana.exceptions.db import NotFoundException +from aana.storage.repository.transcript import TranscriptRepository +from aana_chat_with_video.storage.models.extended_video_transcript import ( + ExtendedVideoTranscriptEntity, +) + + +class ExtendedVideoTranscriptRepository( + TranscriptRepository[ExtendedVideoTranscriptEntity] +): + """Repository for Transcripts.""" + + def __init__(self, session: Session): + """Constructor.""" + super().__init__(session, ExtendedVideoTranscriptEntity) + + def save( + self, + model_name: str, + media_id: MediaId, + transcription_info: AsrTranscriptionInfo, + transcription: AsrTranscription, + segments: AsrSegments, + ) -> ExtendedVideoTranscriptEntity: + """Save transcripts. + + Args: + model_name (str): The name of the model used to generate the transcript. + media_id (MediaId): The media id of the video + transcription_info (AsrTranscriptionInfo): The ASR transcription info. + transcription (AsrTranscription): The ASR transcription. + segments (AsrSegments): The ASR segments. + + Returns: + ExtendedVideoTranscriptEntity: The transcript entity. 
+        """
+        transcript_entity = ExtendedVideoTranscriptEntity.from_asr_output(
+            model_name=model_name,
+            media_id=media_id,
+            transcription=transcription,
+            segments=segments,
+            info=transcription_info,
+        )
+        self.session.add(transcript_entity)
+        self.session.commit()
+        return transcript_entity
+
+    def get_transcript(self, model_name: str, media_id: MediaId) -> dict:
+        """Get the transcript for a video.
+
+        Args:
+            model_name (str): The name of the model used to generate the transcript.
+            media_id (MediaId): The media ID.
+
+        Returns:
+            dict: The dictionary with the transcript, segments, and info.
+        """
+        entity = (
+            self.session.query(self.model_class)
+            .filter_by(model=model_name, media_id=media_id)
+            .first()
+        )
+        if not entity:
+            raise NotFoundException(self.table_name, media_id)
+        transcription = AsrTranscription(text=entity.transcript)
+        segments = [AsrSegment(**s) for s in entity.segments]
+        info = AsrTranscriptionInfo(
+            language=entity.language,
+            language_confidence=entity.language_confidence,
+        )
+        return {
+            "transcription": transcription,
+            "segments": segments,
+            "transcription_info": info,
+        }
diff --git a/aana_chat_with_video/utils/core.py b/aana_chat_with_video/utils/core.py
index e69de29..21ec64c 100644
--- a/aana_chat_with_video/utils/core.py
+++ b/aana_chat_with_video/utils/core.py
@@ -0,0 +1,128 @@
+from collections import defaultdict
+from math import floor
+
+from aana.core.models.asr import AsrSegments
+from aana.core.models.chat import ChatDialog, ChatMessage, Question
+from aana.core.models.video import VideoMetadata
+
+
+def generate_dialog(
+    metadata: VideoMetadata,
+    timeline: str,
+    question: Question,
+) -> ChatDialog:
+    """Generates a dialog from the metadata and timeline of a video.
+
+    Args:
+        metadata (VideoMetadata): the metadata of the video
+        timeline (str): the timeline of the video
+        question (Question): the question to ask
+
+    Returns:
+        ChatDialog: the generated dialog
+    """
+    system_prompt_preamble = """You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while ensuring safety. You will be provided with a script in json format for a video containing information from visual captions and audio transcripts. Each entry in the script follows the format:
+
+    {{
+        "start_time":"start_time_in_seconds",
+        "end_time": "end_time_in_seconds",
+        "audio_transcript": "the_transcript_from_automatic_speech_recognition_system",
+        "visual_caption": "the_caption_of_the_visuals_using_computer_vision_system"
+    }}
+    Note that the audio_transcript can sometimes be empty.
+
+    Ensure you do not introduce any new named entities in your output and maintain the utmost factual accuracy in your responses.
+
+    In addition, you will be provided with the title of the video.
+    """
+    instruction = (
+        "Provide a short and concise answer to the following user's question. "
+        "Avoid mentioning any details about the script in JSON format. "
+        "For example, a good response would be: 'Based on the analysis, "
+        "here are the most relevant/useful/aesthetic moments.' "
+        "A less effective response would be: "
+        "'Based on the provided visual caption/audio transcript, "
+        "here are the most relevant/useful/aesthetic moments.'
The user question is " + ) + + user_prompt_template = ( + "{instruction}" + "Given the timeline of audio and visual activities in the video below " + "I want to find out the following: {question}" + "The timeline is: " + "{timeline}" + "\n" + "The title of the video is {video_title}" + ) + + messages = [] + messages.append(ChatMessage(content=system_prompt_preamble, role="system")) + messages.append( + ChatMessage( + content=user_prompt_template.format( + instruction=instruction, + question=question, + timeline=timeline, + video_title=metadata.title, + ), + role="user", + ) + ) + + dialog = ChatDialog(messages=messages) + return dialog + + +def generate_combined_timeline( + transcription_segments: AsrSegments, + captions: list[str], + caption_timestamps: list[float], + chunk_size: float = 10.0, +): + """Generates a combined timeline from the ASR segments and the captions. + + Args: + transcription_segments (AsrSegments): the ASR segments + captions (list[str]): the captions + caption_timestamps (list[float]): the timestamps for the captions + chunk_size (float, optional): the chunk size for the combined timeline in seconds. Defaults to 10.0. + + Returns: + dict: dictionary containing one key, "timeline", which is a list of dictionaries with the following keys: + "start_time": the start time of the chunk in seconds + "end_time": the end time of the chunk in seconds + "audio_transcript": the audio transcript for the chunk + "visual_caption": the visual caption for the chunk + """ + timeline_dict: defaultdict[int, dict[str, list[str]]] = defaultdict( + lambda: {"transcription": [], "captions": []} + ) + for segment in transcription_segments: + segment_start = segment.time_interval.start + chunk_index = floor(segment_start / chunk_size) + timeline_dict[chunk_index]["transcription"].append(segment.text) + + if len(captions) != len(caption_timestamps): + raise ValueError( # noqa: TRY003 + f"Length of captions ({len(captions)}) and timestamps ({len(caption_timestamps)}) do not match" + ) + + for timestamp, caption in zip(caption_timestamps, captions, strict=True): + chunk_index = floor(timestamp / chunk_size) + timeline_dict[chunk_index]["captions"].append(caption) + + num_chunks = max(timeline_dict.keys()) + 1 + + timeline = [ + { + "start_time": chunk_index * chunk_size, + "end_time": (chunk_index + 1) * chunk_size, + "audio_transcript": "\n".join(timeline_dict[chunk_index]["transcription"]), + "visual_caption": "\n".join(timeline_dict[chunk_index]["captions"]), + } + for chunk_index in range(num_chunks) + ] + + return { + "timeline": timeline, + } From ce65fe31e645c53b1e4ce35b909eda784c892f41 Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Fri, 26 Jul 2024 09:06:15 +0000 Subject: [PATCH 2/9] Passing migration function to AanaSDK --- aana_chat_with_video/app.py | 3 +- .../endpoints/delete_video.py | 4 ++- .../endpoints/get_video_status.py | 4 ++- aana_chat_with_video/endpoints/index_video.py | 12 ++++--- .../endpoints/load_video_metadata.py | 4 ++- aana_chat_with_video/endpoints/video_chat.py | 14 ++++---- aana_chat_with_video/exceptions/core.py | 32 ++++++++++++++++++- 7 files changed, 57 insertions(+), 16 deletions(-) diff --git a/aana_chat_with_video/app.py b/aana_chat_with_video/app.py index 3ae7ccc..c6fb447 100644 --- a/aana_chat_with_video/app.py +++ b/aana_chat_with_video/app.py @@ -1,8 +1,9 @@ from aana.sdk import AanaSDK from aana_chat_with_video.configs.deployments import deployments from aana_chat_with_video.configs.endpoints import endpoints +from 
aana_chat_with_video.storage.op import run_alembic_migrations -aana_app = AanaSDK(name="aana_chat_with_video") +aana_app = AanaSDK(name="aana_chat_with_video", migration_func=run_alembic_migrations) for deployment in deployments: aana_app.register_deployment(**deployment) diff --git a/aana_chat_with_video/endpoints/delete_video.py b/aana_chat_with_video/endpoints/delete_video.py index 09f602d..50c4060 100644 --- a/aana_chat_with_video/endpoints/delete_video.py +++ b/aana_chat_with_video/endpoints/delete_video.py @@ -2,7 +2,9 @@ from aana.api.api_generation import Endpoint from aana.core.models.media import MediaId -from aana.storage.repository.extended_video import ExtendedVideoRepository +from aana_chat_with_video.storage.repository.extended_video import ( + ExtendedVideoRepository, +) class DeleteVideoOutput(TypedDict): diff --git a/aana_chat_with_video/endpoints/get_video_status.py b/aana_chat_with_video/endpoints/get_video_status.py index 4bb3566..12c351b 100644 --- a/aana_chat_with_video/endpoints/get_video_status.py +++ b/aana_chat_with_video/endpoints/get_video_status.py @@ -3,7 +3,9 @@ from aana.api.api_generation import Endpoint from aana.core.models.media import MediaId from aana.core.models.video import VideoStatus -from aana.storage.repository.extended_video import ExtendedVideoRepository +from aana_chat_with_video.storage.repository.extended_video import ( + ExtendedVideoRepository, +) class VideoStatusOutput(TypedDict): diff --git a/aana_chat_with_video/endpoints/index_video.py b/aana_chat_with_video/endpoints/index_video.py index 314fe00..6168c9c 100644 --- a/aana_chat_with_video/endpoints/index_video.py +++ b/aana_chat_with_video/endpoints/index_video.py @@ -20,15 +20,17 @@ from aana.integrations.external.yt_dlp import download_video, get_video_metadata from aana.processors.remote import run_remote from aana.processors.video import extract_audio -from aana.storage.models.extended_video import VideoProcessingStatus -from aana.storage.repository.extended_video import ExtendedVideoRepository -from aana.storage.repository.extended_video_caption import ( +from aana_chat_with_video.configs.settings import settings +from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus +from aana_chat_with_video.storage.repository.extended_video import ( + ExtendedVideoRepository, +) +from aana_chat_with_video.storage.repository.extended_video_caption import ( ExtendedVideoCaptionRepository, ) -from aana.storage.repository.extended_video_transcript import ( +from aana_chat_with_video.storage.repository.extended_video_transcript import ( ExtendedVideoTranscriptRepository, ) -from aana_chat_with_video.configs.settings import settings if TYPE_CHECKING: from aana.core.models.audio import Audio diff --git a/aana_chat_with_video/endpoints/load_video_metadata.py b/aana_chat_with_video/endpoints/load_video_metadata.py index b038d92..3c9da6d 100644 --- a/aana_chat_with_video/endpoints/load_video_metadata.py +++ b/aana_chat_with_video/endpoints/load_video_metadata.py @@ -3,7 +3,9 @@ from aana.api.api_generation import Endpoint from aana.core.models.media import MediaId from aana.core.models.video import VideoMetadata -from aana.storage.repository.extended_video import ExtendedVideoRepository +from aana_chat_with_video.storage.repository.extended_video import ( + ExtendedVideoRepository, +) class LoadVideoMetadataOutput(TypedDict): diff --git a/aana_chat_with_video/endpoints/video_chat.py b/aana_chat_with_video/endpoints/video_chat.py index 71c3413..e13ac00 100644 --- 
a/aana_chat_with_video/endpoints/video_chat.py
+++ b/aana_chat_with_video/endpoints/video_chat.py
@@ -9,16 +9,18 @@
 from aana.core.models.media import MediaId
 from aana.core.models.sampling import SamplingParams
 from aana.deployments.aana_deployment_handle import AanaDeploymentHandle
-from aana.exceptions.db import UnfinishedVideoException
-from aana.storage.models.extended_video import VideoProcessingStatus
-from aana.storage.repository.extended_video import ExtendedVideoRepository
-from aana.storage.repository.extended_video_caption import (
+from aana_chat_with_video.configs.settings import settings
+from aana_chat_with_video.exceptions.core import UnfinishedVideoException
+from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus
+from aana_chat_with_video.storage.repository.extended_video import (
+    ExtendedVideoRepository,
+)
+from aana_chat_with_video.storage.repository.extended_video_caption import (
     ExtendedVideoCaptionRepository,
 )
-from aana.storage.repository.extended_video_transcript import (
+from aana_chat_with_video.storage.repository.extended_video_transcript import (
     ExtendedVideoTranscriptRepository,
 )
-from aana_chat_with_video.configs.settings import settings
 from aana_chat_with_video.utils.core import generate_combined_timeline, generate_dialog
diff --git a/aana_chat_with_video/exceptions/core.py b/aana_chat_with_video/exceptions/core.py
index 9693de9..4acc491 100644
--- a/aana_chat_with_video/exceptions/core.py
+++ b/aana_chat_with_video/exceptions/core.py
@@ -1 +1,39 @@
-from aana.exceptions.core import BaseException
+from aana.core.models.media import MediaId
+from aana.exceptions.core import BaseException
+from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus
+
+
+class UnfinishedVideoException(BaseException):
+    """Exception raised when trying to fetch an unfinished video.
+
+    Attributes:
+        media_id (int | MediaId): The ID of the video.
+        status (VideoProcessingStatus): The current video status.
+        message (str): The error message.
+    """
+
+    def __init__(
+        self, media_id: int | MediaId, status: VideoProcessingStatus, message: str
+    ):
+        """Constructor.
+
+        Args:
+            media_id (int | MediaId): The ID of the video.
+            status (VideoProcessingStatus): The current video status.
+            message (str): The error message.
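+
+        Example (illustrative; hypothetical values, not taken from the app):
+            raise UnfinishedVideoException(
+                media_id="squirrel.mp4",
+                status=VideoProcessingStatus.RUNNING,
+                message="Video is still being processed.",
+            )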
+ """ + super().__init__(media_id=media_id, status=status, message=message) + self.media_id = media_id + self.status = status + self.message = message + + def __reduce__(self): + """Used for pickling.""" + return (self.__class__, (self.media_id, self.status, self.message)) From 1f88685e3cbc11a6557b0646d9064eec8f3e25d7 Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Fri, 26 Jul 2024 09:22:06 +0000 Subject: [PATCH 3/9] Added tests --- aana_chat_with_video/storage/op.py | 55 +++++++++ aana_chat_with_video/tests/conftest.py | 149 ++++++++++++++++++++++++ aana_chat_with_video/tests/test_app.py | 152 +++++++++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 aana_chat_with_video/storage/op.py create mode 100644 aana_chat_with_video/tests/conftest.py create mode 100644 aana_chat_with_video/tests/test_app.py diff --git a/aana_chat_with_video/storage/op.py b/aana_chat_with_video/storage/op.py new file mode 100644 index 0000000..e2cea44 --- /dev/null +++ b/aana_chat_with_video/storage/op.py @@ -0,0 +1,55 @@ +from pathlib import Path + +from alembic import command +from alembic.config import Config + +from aana.exceptions.runtime import EmptyMigrationsException + + +def get_alembic_config( + app_config, ini_file_path: Path, alembic_data_path: Path +) -> Config: + """Produces an alembic config to run migrations programmatically.""" + engine = app_config.db_config.get_engine() + alembic_config = Config(ini_file_path) + alembic_config.set_main_option("script_location", str(alembic_data_path)) + config_section = alembic_config.get_section(alembic_config.config_ini_section, {}) + config_section["sqlalchemy.url"] = engine.url + + return alembic_config + + +def run_alembic_migrations(settings): + """Runs alembic migrations before starting up.""" + # We need the path to aana/alembic and aana/alembic.ini + # This is a hack until we need something better. 
+ current_path = Path(__file__) + aana_app_root = current_path.parent.parent # go up two directories + ini_file_path = aana_app_root / "alembic.ini" + alembic_data_path = aana_app_root / "alembic" + if not alembic_data_path.exists(): + raise RuntimeError("Alembic directory does not exist.") # noqa: TRY003 + versions_path = alembic_data_path / "versions" + # Check if the versions directory is empty (no .py files) + if not versions_path.exists() or not any(Path(versions_path).glob("*.py")): + raise EmptyMigrationsException() + + alembic_config = get_alembic_config(settings, ini_file_path, alembic_data_path) + engine = settings.db_config.get_engine() + with engine.begin() as connection: + alembic_config.attributes["connection"] = connection + command.upgrade(alembic_config, "head") + + +def drop_all_tables(settings): + """Drops all tables in the database.""" + # TODO: only allow this in testing mode + current_path = Path(__file__) + aana_app_root = current_path.parent.parent # go up two directories + ini_file_path = aana_app_root / "alembic.ini" + alembic_data_path = aana_app_root / "alembic" + if not alembic_data_path.exists(): + raise RuntimeError("Alembic directory does not exist.") # noqa: TRY003 + + alembic_config = get_alembic_config(settings, ini_file_path, alembic_data_path) + command.downgrade(alembic_config, "base") diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py new file mode 100644 index 0000000..dbca1cb --- /dev/null +++ b/aana_chat_with_video/tests/conftest.py @@ -0,0 +1,149 @@ +import importlib +import json +import os +import tempfile +from pathlib import Path + +import pytest +import requests + +from aana.configs.db import DbSettings, SQLiteConfig +from aana.storage.op import DbType +from aana.utils.json import jsonify + + +def call_streaming_endpoint( + port: int, route_prefix: str, endpoint: str, data: dict +) -> list: + """Call a streaming endpoint. + + Args: + port (int): Port of the server. + route_prefix (str): Route prefix of the server. + endpoint (str): Endpoint to call. + data (dict): Data to send to the endpoint. + + Returns: + list: List of output chunks. If an error occurs, the list will contain + only one element, which is the error response. + """ + output = [] + r = requests.post( + f"http://localhost:{port}{route_prefix}{endpoint}", + data={"body": json.dumps(data)}, + timeout=30, + stream=True, + ) + for chunk in r.iter_content(chunk_size=None): + chunk_output = json.loads(chunk.decode("utf-8")) + output.append(chunk_output) + if "error" in chunk_output: + return [chunk_output] + return output + + +def send_request_to_endpoint( + port: int, + route_prefix: str, + endpoint_path: str, + is_streaming: bool, + data: dict, +) -> dict | list: + """Call an endpoint. + + Args: + target (str): the name of the target. + port (int): Port of the server. + route_prefix (str): Route prefix of the server. + endpoint_path (str): Endpoint to call. + is_streaming (bool): If True, the endpoint is a streaming endpoint. + data (dict): Data to send to the endpoint. + + Returns: + dict | list: Output of the endpoint. If the endpoint is a streaming endpoint, the output will be a list of output chunks. + If the endpoint is not a streaming endpoint, the output will be a dict. + If an error occurs, the output will be a dict with the error message. 
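+
+        Example (illustrative values; assumes the app is serving locally):
+            output = send_request_to_endpoint(
+                8000, "", "/video/status", False, {"media_id": "squirrel.mp4"}
+            )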
+ """ + if is_streaming: + return call_streaming_endpoint(port, route_prefix, endpoint_path, data) + else: + r = requests.post( + f"http://localhost:{port}{route_prefix}{endpoint_path}", + data={"body": json.dumps(data)}, + timeout=30, + ) + return r.json() + + +@pytest.fixture(scope="module") +def app_setup(): + """Setup Ray Serve app for given deployments and endpoints.""" + # create temporary database + tmp_database_path = Path(tempfile.mkstemp(suffix=".db")[1]) + db_config = DbSettings( + datastore_type=DbType.SQLITE, + datastore_config=SQLiteConfig(path=tmp_database_path), + ) + # set environment variable for the database config so Ray can find it + os.environ["DB_CONFIG"] = jsonify(db_config) + print(os.environ["DB_CONFIG"]) + # reload settings to update the database config + import aana.configs.settings + + importlib.reload(aana.configs.settings) + + from aana_chat_with_video.app import aana_app + + aana_app.connect( + port=8000, show_logs=True, num_cpus=10 + ) # pretend we have 10 cpus for testing + + def start_app(): + aana_app.migrate() + aana_app.deploy() + + return aana_app + + yield start_app + + # delete temporary database + tmp_database_path.unlink() + + aana_app.shutdown() + + +@pytest.fixture(scope="module") +def call_endpoint(app_setup): # noqa: D417 + """Call endpoint. + + Args: + endpoint_path: The endpoint path. + data: The data to send. + ignore_expected_output: Whether to ignore the expected output. Defaults to False. + expected_error: The expected error. Defaults to None. + """ + aana_app = app_setup() + + port = aana_app.port + route_prefix = "" + + def _call_endpoint( + endpoint_path: str, + data: dict, + ignore_expected_output: bool = False, + expected_error: str | None = None, + ) -> dict | list: + endpoint = None + for e in aana_app.endpoints.values(): + if e.path == endpoint_path: + endpoint = e + break + if endpoint is None: + raise ValueError(f"Endpoint with path {endpoint_path} not found") # noqa: TRY003 + is_streaming = endpoint.is_streaming_response() + + return send_request_to_endpoint( + port, route_prefix, endpoint_path, is_streaming, data + ) + + return _call_endpoint diff --git a/aana_chat_with_video/tests/test_app.py b/aana_chat_with_video/tests/test_app.py new file mode 100644 index 0000000..c1ec549 --- /dev/null +++ b/aana_chat_with_video/tests/test_app.py @@ -0,0 +1,152 @@ +# ruff: noqa: S101 +# Test chat with video endpoints. 
+
+from importlib import resources
+
+import pytest
+
+from aana.tests.utils import is_gpu_available
+
+VIDEO_INDEX_ENDPOINT = "/video/index_stream"
+VIDEO_METADATA_ENDPOINT = "/video/metadata"
+VIDEO_CHAT_ENDPOINT = "/video/chat_stream"
+VIDEO_STATUS_ENDPOINT = "/video/status"
+VIDEO_DELETE_ENDPOINT = "/video/delete"
+
+
+@pytest.mark.skipif(
+    not is_gpu_available(),
+    reason="GPU is not available",
+)
+@pytest.mark.parametrize(
+    "video, whisper_params",
+    [
+        (
+            {
+                "url": "https://mobius-public.s3.eu-west-1.amazonaws.com/squirrel.mp4",
+                "media_id": "squirrel.mp4",
+            },
+            {"temperature": 0.0},
+        ),
+        (
+            {
+                "path": str(
+                    resources.path("aana.tests.files.videos", "physicsworks.webm")
+                ),
+                "media_id": "physicsworks.webm",
+            },
+            {"temperature": 0.0},
+        ),
+        (
+            {
+                "path": str(
+                    resources.path("aana.tests.files.videos", "physicsworks_audio.webm")
+                ),
+                "media_id": "physicsworks_audio.webm",
+            },
+            {"temperature": 0.0},
+        ),
+    ],
+)
+def test_chat_with_video(call_endpoint, video, whisper_params):
+    """Test chat with video endpoint."""
+    media_id = video["media_id"]
+
+    call_endpoint(
+        VIDEO_INDEX_ENDPOINT,
+        {"video": video, "whisper_params": whisper_params},
+    )
+
+    # if we try to index the same video again, we should get a MediaIdAlreadyExistsException error
+    call_endpoint(
+        VIDEO_INDEX_ENDPOINT,
+        {"video": video, "whisper_params": whisper_params},
+        expected_error="MediaIdAlreadyExistsException",
+    )
+
+    # load video metadata
+    call_endpoint(
+        VIDEO_METADATA_ENDPOINT,
+        {"media_id": media_id},
+    )
+
+    # get video status
+    call_endpoint(
+        VIDEO_STATUS_ENDPOINT,
+        {"media_id": media_id},
+    )
+
+    # delete video
+    call_endpoint(
+        VIDEO_DELETE_ENDPOINT,
+        {"media_id": media_id},
+        ignore_expected_output=True,
+    )
+
+    # get video status
+    call_endpoint(
+        VIDEO_STATUS_ENDPOINT,
+        {"media_id": media_id},
+        expected_error="NotFoundException",
+    )
+
+    # after deleting the video, its metadata should not be available
+    call_endpoint(
+        VIDEO_METADATA_ENDPOINT,
+        {"media_id": media_id},
+        expected_error="NotFoundException",
+    )
+
+    # after deleting the video, we should be able to index it again
+    call_endpoint(
+        VIDEO_INDEX_ENDPOINT,
+        {"video": video, "whisper_params": whisper_params},
+    )
+
+    # load video metadata
+    call_endpoint(
+        VIDEO_METADATA_ENDPOINT,
+        {"media_id": media_id},
+    )
+
+    # chat with video
+    question = "Summarize the video"
+
+    call_endpoint(
+        VIDEO_CHAT_ENDPOINT,
+        {"media_id": media_id, "question": question},
+    )
+
+    # delete video
+    call_endpoint(
+        VIDEO_DELETE_ENDPOINT,
+        {"media_id": media_id},
+        ignore_expected_output=True,
+    )
+
+    # after deleting the video, we should not be able to chat with it
+    call_endpoint(
+        VIDEO_CHAT_ENDPOINT,
+        {"media_id": media_id, "question": question},
+        expected_error="NotFoundException",
+    )
+
+
+@pytest.mark.parametrize(
+    "endpoint, data",
+    [
+        (VIDEO_METADATA_ENDPOINT, {}),
+        (VIDEO_CHAT_ENDPOINT, {}),
+        (VIDEO_CHAT_ENDPOINT, {"media_id": "squirrel.mp4"}),
+        (VIDEO_CHAT_ENDPOINT, {"question": "Summarize the video"}),
+        (VIDEO_INDEX_ENDPOINT, {}),
+        (VIDEO_DELETE_ENDPOINT, {}),
+    ],
+)
+def test_missing_params(call_endpoint, endpoint, data):
+    """Test missing params."""
+    call_endpoint(
+        endpoint,
+        data,
+        expected_error="ValidationError",
+    )
From b29e6d2aa65c317a76a952cac06202aca0464828 Mon Sep 17 00:00:00 2001
From: Aleksandr Movchan
Date: Fri, 26 Jul 2024 14:44:05 +0000
Subject: [PATCH 4/9] Fixed tests

---
 .../core/models/video_status.py               | 24 +++
 .../endpoints/get_video_status.py             |  2 +-
aana_chat_with_video/endpoints/index_video.py |   2 +-
 aana_chat_with_video/tests/conftest.py        | 175 ++++++++----------
 aana_chat_with_video/tests/test_app.py        |   6 +-
 pyproject.toml                                |  12 +-
 6 files changed, 115 insertions(+), 106 deletions(-)
 create mode 100644 aana_chat_with_video/core/models/video_status.py

diff --git a/aana_chat_with_video/core/models/video_status.py b/aana_chat_with_video/core/models/video_status.py
new file mode 100644
index 0000000..a1c5433
--- /dev/null
+++ b/aana_chat_with_video/core/models/video_status.py
@@ -0,0 +1,24 @@
+from typing import Annotated, Any
+
+from pydantic import Field, ValidationInfo, ValidatorFunctionWrapHandler, WrapValidator
+
+from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus
+
+
+def process_video_status(
+    v: Any, handler: ValidatorFunctionWrapHandler, info: ValidationInfo
+) -> VideoProcessingStatus:
+    """Validates the video processing status."""
+    if isinstance(v, str):
+        return VideoProcessingStatus(v)
+    return v
+
+
+VideoStatus = Annotated[
+    VideoProcessingStatus,
+    Field(description="Video processing status."),
+    WrapValidator(process_video_status),
+]
+"""
+Video processing status.
+"""
diff --git a/aana_chat_with_video/endpoints/get_video_status.py b/aana_chat_with_video/endpoints/get_video_status.py
index 12c351b..01ff10d 100644
--- a/aana_chat_with_video/endpoints/get_video_status.py
+++ b/aana_chat_with_video/endpoints/get_video_status.py
@@ -2,7 +2,7 @@
 from aana.api.api_generation import Endpoint
 from aana.core.models.media import MediaId
-from aana.core.models.video import VideoStatus
+from aana_chat_with_video.core.models.video_status import VideoStatus
 from aana_chat_with_video.storage.repository.extended_video import (
     ExtendedVideoRepository,
 )
diff --git a/aana_chat_with_video/endpoints/index_video.py b/aana_chat_with_video/endpoints/index_video.py
index 6168c9c..1abce82 100644
--- a/aana_chat_with_video/endpoints/index_video.py
+++ b/aana_chat_with_video/endpoints/index_video.py
@@ -140,7 +140,7 @@ async def run(  # noqa: C901
             yield {
                 "transcription": whisper_output["transcription"],
                 "segments": whisper_output["segments"],
-                "info": whisper_output["transcription_info"],
+                "transcription_info": whisper_output["transcription_info"],
             }
         transcription = sum(transcription_list, AsrTranscription())
         segments = sum(segments_list, AsrSegments())
diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py
index dbca1cb..2437f28 100644
--- a/aana_chat_with_video/tests/conftest.py
+++ b/aana_chat_with_video/tests/conftest.py
@@ -1,149 +1,122 @@
+# ruff: noqa: S101
 import importlib
 import json
 import os
 import tempfile
 from pathlib import Path
+from typing import Any
 
 import pytest
 import requests
+from pydantic import ValidationError
 
+from aana.api.api_generation import Endpoint
 from aana.configs.db import DbSettings, SQLiteConfig
+from aana.sdk import AanaSDK
 from aana.storage.op import DbType
 from aana.utils.json import jsonify
 
 
-def call_streaming_endpoint(
-    port: int, route_prefix: str, endpoint: str, data: dict
-) -> list:
-    """Call a streaming endpoint.
-
-    Args:
-        port (int): Port of the server.
-        route_prefix (str): Route prefix of the server.
-        endpoint (str): Endpoint to call.
-        data (dict): Data to send to the endpoint.
-
-    Returns:
-        list: List of output chunks. If an error occurs, the list will contain
-            only one element, which is the error response.
- """ - output = [] - r = requests.post( - f"http://localhost:{port}{route_prefix}{endpoint}", - data={"body": json.dumps(data)}, - timeout=30, - stream=True, - ) - for chunk in r.iter_content(chunk_size=None): - chunk_output = json.loads(chunk.decode("utf-8")) - output.append(chunk_output) - if "error" in chunk_output: - return [chunk_output] - return output - - -def send_request_to_endpoint( - port: int, - route_prefix: str, - endpoint_path: str, - is_streaming: bool, - data: dict, -) -> dict | list: - """Call an endpoint. - - Args: - target (str): the name of the target. - port (int): Port of the server. - route_prefix (str): Route prefix of the server. - endpoint_path (str): Endpoint to call. - is_streaming (bool): If True, the endpoint is a streaming endpoint. - data (dict): Data to send to the endpoint. - - Returns: - dict | list: Output of the endpoint. If the endpoint is a streaming endpoint, the output will be a list of output chunks. - If the endpoint is not a streaming endpoint, the output will be a dict. - If an error occurs, the output will be a dict with the error message. - """ - if is_streaming: - return call_streaming_endpoint(port, route_prefix, endpoint_path, data) +def send_api_request( + endpoint: Endpoint, + app: AanaSDK, + data: dict[str, Any], + timeout: int = 30, +) -> dict[str, Any] | list[dict[str, Any]]: + """Call an endpoint, handling both streaming and non-streaming responses.""" + url = f"http://localhost:{app.port}{endpoint.path}" + payload = {"body": json.dumps(data)} + + if endpoint.is_streaming_response(): + output = [] + with requests.post(url, data=payload, timeout=timeout, stream=True) as r: + for chunk in r.iter_content(chunk_size=None): + chunk_output = json.loads(chunk.decode("utf-8")) + output.append(chunk_output) + if "error" in chunk_output: + return [chunk_output] + return output else: - r = requests.post( - f"http://localhost:{port}{route_prefix}{endpoint_path}", - data={"body": json.dumps(data)}, - timeout=30, - ) - return r.json() + response = requests.post(url, data=payload, timeout=timeout) + return response.json() + + +def verify_output( + endpoint: Endpoint, + response: dict[str, Any] | list[dict[str, Any]], + expected_error: str | None = None, +) -> None: + """Verify the output of an endpoint call.""" + is_streaming = endpoint.is_streaming_response() + ResponseModel = endpoint.get_response_model() + if expected_error: + error = response[0]["error"] if is_streaming else response["error"] + assert error == expected_error, response + else: + try: + if is_streaming: + for item in response: + ResponseModel.model_validate(item, strict=True) + else: + ResponseModel.model_validate(response, strict=True) + except ValidationError as e: + raise AssertionError( # noqa: TRY003 + f"Validation failed. 
Errors:\n{e}\n\nResponse: {response}" + ) from e @pytest.fixture(scope="module") def app_setup(): - """Setup Ray Serve app for given deployments and endpoints.""" - # create temporary database + """Setup Ray Serve app for testing.""" + # Create a temporary database for testing tmp_database_path = Path(tempfile.mkstemp(suffix=".db")[1]) db_config = DbSettings( datastore_type=DbType.SQLITE, datastore_config=SQLiteConfig(path=tmp_database_path), ) - # set environment variable for the database config so Ray can find it os.environ["DB_CONFIG"] = jsonify(db_config) - print(os.environ["DB_CONFIG"]) - # reload settings to update the database config + + # Reload the settings to update the database path import aana.configs.settings importlib.reload(aana.configs.settings) + # Start the app from aana_chat_with_video.app import aana_app - aana_app.connect( - port=8000, show_logs=True, num_cpus=10 - ) # pretend we have 10 cpus for testing - - def start_app(): - aana_app.migrate() - aana_app.deploy() - - return aana_app + aana_app.connect(port=8000, show_logs=True, num_cpus=10) + aana_app.migrate() + aana_app.deploy() - yield start_app + yield aana_app - # delete temporary database tmp_database_path.unlink() - aana_app.shutdown() @pytest.fixture(scope="module") -def call_endpoint(app_setup): # noqa: D417 - """Call endpoint. - - Args: - endpoint_path: The endpoint path. - data: The data to send. - ignore_expected_output: Whether to ignore the expected output. Defaults to False. - expected_error: The expected error. Defaults to None. - """ - aana_app = app_setup() - - port = aana_app.port - route_prefix = "" +def call_endpoint(app_setup): + """Call an endpoint and verify the output.""" + aana_app: AanaSDK = app_setup def _call_endpoint( endpoint_path: str, - data: dict, - ignore_expected_output: bool = False, + data: dict[str, Any], expected_error: str | None = None, - ) -> dict | list: - endpoint = None - for e in aana_app.endpoints.values(): - if e.path == endpoint_path: - endpoint = e - break + ) -> dict[str, Any] | list[dict[str, Any]]: + endpoint = next( + (e for e in aana_app.endpoints.values() if e.path == endpoint_path), None + ) if endpoint is None: raise ValueError(f"Endpoint with path {endpoint_path} not found") # noqa: TRY003 - is_streaming = endpoint.is_streaming_response() - return send_request_to_endpoint( - port, route_prefix, endpoint_path, is_streaming, data + response = send_api_request(endpoint=endpoint, app=aana_app, data=data) + verify_output( + endpoint=endpoint, + response=response, + expected_error=expected_error, ) + return response + return _call_endpoint diff --git a/aana_chat_with_video/tests/test_app.py b/aana_chat_with_video/tests/test_app.py index c1ec549..1a35498 100644 --- a/aana_chat_with_video/tests/test_app.py +++ b/aana_chat_with_video/tests/test_app.py @@ -80,7 +80,6 @@ def test_chat_with_video(call_endpoint, video, whisper_params): call_endpoint( VIDEO_DELETE_ENDPOINT, {"media_id": media_id}, - ignore_expected_output=True, ) # get video status @@ -121,7 +120,6 @@ def test_chat_with_video(call_endpoint, video, whisper_params): call_endpoint( VIDEO_DELETE_ENDPOINT, {"media_id": media_id}, - ignore_expected_output=True, ) # after deleting the video, we should not be able to chat with it @@ -132,6 +130,10 @@ def test_chat_with_video(call_endpoint, video, whisper_params): ) +@pytest.mark.skipif( + not is_gpu_available(), + reason="GPU is not available", +) @pytest.mark.parametrize( "endpoint, data", [ diff --git a/pyproject.toml b/pyproject.toml index 31f0824..ce6d657 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,13 +7,23 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.10" -aana = ">=0.2.0" [tool.poetry.group.dev.dependencies] ipykernel = "^6.29.4" ruff = "^0.1.5" +pytest-asyncio = "^0.23.6" +pytest-dotenv = "^0.5.2" +pytest-env = "^1.1.3" +pytest-mock = "^3.12.0" +pytest-timeout = "^2.2.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +timeout = 600 +env = [ + "TEST_MODE=True" +] From 2db21aa58a99c48677d71875462926d1751ce6ca Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Mon, 29 Jul 2024 10:00:32 +0000 Subject: [PATCH 5/9] Added tests and updated readme. --- .github/workflows/tests.yml | 33 ++ README.md | 66 +--- .../alembic/versions/7da69bf375e7_init.py | 343 ++++++++++++++---- aana_chat_with_video/core/__init__.py | 0 aana_chat_with_video/core/prompts/loader.py | 5 +- .../endpoints/delete_video.py | 2 +- aana_chat_with_video/tests/conftest.py | 35 ++ .../tests/test_extended_video_caption_repo.py | 123 +++++++ .../tests/test_extended_video_repo.py | 76 ++++ .../test_extended_video_transcript_repo.py | 87 +++++ notebooks/chat_with_video_demo.ipynb | 178 +++++++++ pyproject.toml | 6 +- 12 files changed, 814 insertions(+), 140 deletions(-) create mode 100644 .github/workflows/tests.yml create mode 100644 aana_chat_with_video/core/__init__.py create mode 100644 aana_chat_with_video/tests/test_extended_video_caption_repo.py create mode 100644 aana_chat_with_video/tests/test_extended_video_repo.py create mode 100644 aana_chat_with_video/tests/test_extended_video_transcript_repo.py create mode 100644 notebooks/chat_with_video_demo.ipynb diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..0834cb1 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,33 @@ +name: Tests + +on: + push: + branches: + - '**' # Runs on push to any branch + pull_request: + branches: + - '**' # Runs on pull requests to any branch + workflow_dispatch: # Allows for manual triggering + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Bootstrap poetry + run: | + curl -sSL https://install.python-poetry.org | python - -y + - name: Update PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Install dependencies + run: | + poetry install + - name: Test with pytest + run: poetry run pytest -vv diff --git a/README.md b/README.md index fa560d9..d85524c 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,6 @@ -# Aana Application Template - -[Aana SDK](https://github.com/mobiusml/aana_sdk) is a powerful framework for building multimodal applications. It facilitates the large-scale deployment of machine learning models, including those for vision, audio, and language, and supports Retrieval-Augmented Generation (RAG) systems. This enables the development of advanced applications such as search engines, recommendation systems, and data insights platforms. - -This repository contains a template that you can use to start building your own Aana application. It will help you get started with the Aana SDK and provide you with a basic structure for your application and its dependencies. - -## How to use this template - -1. Click on [Use this template](https://github.com/mobiusml/aana_app_template/generate). -2. Give your repository a name and click on "Create repository". 
The name of the repository will also be the name of your application and the Python package. -3. Wait for the first workflow to finish. This will rename the package to match the repository name. -4. Clone the repository to your local machine and start building your application. -5. Change the [LICENSE](/LICENSE) file to match your project's license. The default license is the Apache License 2.0. - -## Getting started - -The project template uses [Poetry](https://python-poetry.org/) for dependency management. To install the project, run the following commands: - -```bash -poetry install -``` - -See [Tutorial](https://github.com/mobiusml/aana_sdk/blob/main/docs/tutorial.md) for more information on how to build your application. - -## Project structure - -``` -aana_chat_with_video/ -├── config/ | various configs, including settings, deployments and endpoints -│ ├── endpoints.py | list of endpoints to deploy -│ ├── deployments.py | list of deployments (models) to deploy -│ └── settings.py | app settings -├── core/ | core models and functionality -│ ├── models/ | data models -│ └── prompts/ | prompt templates for LLMs -├── deployments/ | custom deployments -├── endpoints/ | endpoint classes for the app -├── exceptions/ | custom exception classes -├── utils/ | various utility functionality -└── app.py | main application file -``` +# Chat with Video App +**Chat with Video App** is a multimodal chat application that allows users to upload a video and ask questions about the video content based on the visual and audio information. See [Chat with Video Demo notebook](notebooks/chat_with_video_demo.ipynb) for more information. ## Installation @@ -49,16 +10,9 @@ To install the project, follow these steps: 2. Install additional libraries. -```bash -apt update && apt install -y libgl1 -``` -> **🗒️ Note** -> -> For optimal performance, you should also install [PyTorch](https://pytorch.org/get-started/locally/) version >=2.1 appropriate for your system. You can continue directly to the next step, but it will install a default version that may not make optimal use of your system's resources, for example, a GPU or even some SIMD operations. Therefore we recommend choosing your PyTorch package carefully and installing it manually. +For optimal performance, you should also install [PyTorch](https://pytorch.org/get-started/locally/) version >=2.1 appropriate for your system. You can continue directly to the next step, but it will install a default version that may not make optimal use of your system's resources, for example, a GPU or even some SIMD operations. Therefore we recommend choosing your PyTorch package carefully and installing it manually. -> **🗒️ Note** -> -> Some models use Flash Attention. Install Flash Attention library for better performance. See [flash attention installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) for more details and supported GPUs. +Some models use Flash Attention. Install Flash Attention library for better performance. See [flash attention installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) for more details and supported GPUs. 3. Install the package with poetry. @@ -73,7 +27,7 @@ poetry install 4. Run the app. ```bash -aana deploy aana_chat_with_video.app:aana_app +CUDA_VISIBLE_DEVICES="0" aana deploy aana_chat_with_video.app:aana_app ``` ## Usage @@ -83,14 +37,14 @@ To use the project, follow these steps: 1. 
Run the app as described in the installation section.
 
 ```bash
-aana deploy aana_chat_with_video.app:aana_app
+CUDA_VISIBLE_DEVICES="0" aana deploy aana_chat_with_video.app:aana_app
 ```
 
 Once the application is running, you will see the message `Deployed successfully.` in the logs. It will also show the URL for the API documentation.
 
 > **⚠️ Warning**
 >
-> If the application is using GPU, make sure that the GPU is available and the application can access it.
+> The application requires one large GPU to run. The GPU should have at least 48GB of memory.
 >
 > The applications will detect the available GPU automatically but you need to make sure that `CUDA_VISIBLE_DEVICES` is set correctly.
 >
@@ -100,8 +54,4 @@ Once the application is running, you will see the message `Deployed successfully
 
 2. Send a POST request to the app.
 
-For example, if your application has `/summary` endpoint that accepts videos, you can send a POST request like this:
-
-```bash
-curl -X POST http://127.0.0.1:8000/summary -Fbody='{"video":{"url":"https://www.youtube.com/watch?v=VhJFyyukAzA"}}'
-```
+See [Chat with Video Demo notebook](notebooks/chat_with_video_demo.ipynb) for more information.
\ No newline at end of file
diff --git a/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py b/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py
index 277b042..df110da 100644
--- a/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py
+++ b/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py
@@ -1,4 +1,4 @@
-"""init
+"""init.
 
 Revision ID: 7da69bf375e7
 Revises:
 
@@ -11,7 +11,7 @@
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision: str = '7da69bf375e7'
+revision: str = "7da69bf375e7"
 down_revision: str | None = None
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 
@@ -20,79 +20,270 @@
 def upgrade() -> None:
     """Upgrade database to this revision from previous."""
     # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('caption', - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('model', sa.String(), nullable=False, comment='Name of model used to generate the caption'), - sa.Column('frame_id', sa.Integer(), nullable=False, comment='The 0-based frame id of video for caption'), - sa.Column('caption', sa.String(), nullable=False, comment='Frame caption'), - sa.Column('timestamp', sa.Float(), nullable=False, comment='Frame timestamp in seconds'), - sa.Column('caption_type', sa.String(), nullable=False, comment='The type of caption'), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_caption')) + op.create_table( + "caption", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "model", + sa.String(), + nullable=False, + comment="Name of model used to generate the caption", + ), + sa.Column( + "frame_id", + sa.Integer(), + nullable=False, + comment="The 0-based frame id of video for caption", + ), + sa.Column("caption", sa.String(), nullable=False, comment="Frame caption"), + sa.Column( + "timestamp", + sa.Float(), + nullable=False, + comment="Frame timestamp in seconds", + ), + sa.Column( + "caption_type", sa.String(), nullable=False, comment="The type of caption" + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is inserted", + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is updated", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_caption")), ) - op.create_table('media', - sa.Column('id', sa.String(length=36), nullable=False, comment='Unique identifier for the media'), - sa.Column('media_type', sa.String(), nullable=False, comment='The type of media'), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_media')) + op.create_table( + "media", + sa.Column( + "id", + sa.String(length=36), + nullable=False, + comment="Unique identifier for the media", + ), + sa.Column( + "media_type", sa.String(), nullable=False, comment="The type of media" + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is inserted", + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is updated", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_media")), ) - op.create_table('tasks', - sa.Column('id', sa.UUID(), nullable=False, comment='Task ID'), - sa.Column('endpoint', sa.String(), nullable=False, comment='The endpoint to which the task is assigned'), - sa.Column('data', sa.PickleType(), nullable=False, comment='Data for the task'), - sa.Column('status', sa.Enum('CREATED', 'ASSIGNED', 'COMPLETED', 
'RUNNING', 'FAILED', 'NOT_FINISHED', name='status'), nullable=False, comment='Status of the task'), - sa.Column('priority', sa.Integer(), nullable=False, comment='Priority of the task (0 is the lowest)'), - sa.Column('assigned_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True, comment='Timestamp when the task was assigned'), - sa.Column('completed_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True, comment='Timestamp when the task was completed'), - sa.Column('progress', sa.Float(), nullable=False, comment='Progress of the task in percentage'), - sa.Column('result', sa.JSON(), nullable=True, comment='Result of the task in JSON format'), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is inserted'), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_tasks')) + op.create_table( + "tasks", + sa.Column("id", sa.UUID(), nullable=False, comment="Task ID"), + sa.Column( + "endpoint", + sa.String(), + nullable=False, + comment="The endpoint to which the task is assigned", + ), + sa.Column("data", sa.PickleType(), nullable=False, comment="Data for the task"), + sa.Column( + "status", + sa.Enum( + "CREATED", + "ASSIGNED", + "COMPLETED", + "RUNNING", + "FAILED", + "NOT_FINISHED", + name="status", + ), + nullable=False, + comment="Status of the task", + ), + sa.Column( + "priority", + sa.Integer(), + nullable=False, + comment="Priority of the task (0 is the lowest)", + ), + sa.Column( + "assigned_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=True, + comment="Timestamp when the task was assigned", + ), + sa.Column( + "completed_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=True, + comment="Timestamp when the task was completed", + ), + sa.Column( + "progress", + sa.Float(), + nullable=False, + comment="Progress of the task in percentage", + ), + sa.Column( + "result", + sa.JSON(), + nullable=True, + comment="Result of the task in JSON format", + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is inserted", + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is updated", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_tasks")), ) - op.create_table('transcript', - sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('model', sa.String(), nullable=False, comment='Name of model used to generate transcript'), - sa.Column('transcript', sa.String(), nullable=False, comment='Full text transcript of media'), - sa.Column('segments', sa.JSON(), nullable=False, comment='Segments of the transcript'), - sa.Column('language', sa.String(), nullable=False, comment='Language of the transcript as predicted by model'), - sa.Column('language_confidence', sa.Float(), nullable=False, comment='Confidence score of language prediction'), - sa.Column('transcript_type', sa.String(), nullable=False, comment='The type of transcript'), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, 
comment='Timestamp when row is inserted'), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False, comment='Timestamp when row is updated'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_transcript')) + op.create_table( + "transcript", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column( + "model", + sa.String(), + nullable=False, + comment="Name of model used to generate transcript", + ), + sa.Column( + "transcript", + sa.String(), + nullable=False, + comment="Full text transcript of media", + ), + sa.Column( + "segments", sa.JSON(), nullable=False, comment="Segments of the transcript" + ), + sa.Column( + "language", + sa.String(), + nullable=False, + comment="Language of the transcript as predicted by model", + ), + sa.Column( + "language_confidence", + sa.Float(), + nullable=False, + comment="Confidence score of language prediction", + ), + sa.Column( + "transcript_type", + sa.String(), + nullable=False, + comment="The type of transcript", + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is inserted", + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("(CURRENT_TIMESTAMP)"), + nullable=False, + comment="Timestamp when row is updated", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_transcript")), ) - op.create_table('video', - sa.Column('id', sa.String(length=36), nullable=False), - sa.Column('path', sa.String(), nullable=True, comment='Path'), - sa.Column('url', sa.String(), nullable=True, comment='URL'), - sa.Column('title', sa.String(), nullable=True, comment='Title'), - sa.Column('description', sa.String(), nullable=True, comment='Description'), - sa.ForeignKeyConstraint(['id'], ['media.id'], name=op.f('fk_video_id_media')), - sa.PrimaryKeyConstraint('id', name=op.f('pk_video')) + op.create_table( + "video", + sa.Column("id", sa.String(length=36), nullable=False), + sa.Column("path", sa.String(), nullable=True, comment="Path"), + sa.Column("url", sa.String(), nullable=True, comment="URL"), + sa.Column("title", sa.String(), nullable=True, comment="Title"), + sa.Column("description", sa.String(), nullable=True, comment="Description"), + sa.ForeignKeyConstraint(["id"], ["media.id"], name=op.f("fk_video_id_media")), + sa.PrimaryKeyConstraint("id", name=op.f("pk_video")), ) - op.create_table('extended_video', - sa.Column('id', sa.String(length=36), nullable=False), - sa.Column('duration', sa.Float(), nullable=True, comment='Video duration in seconds'), - sa.Column('status', sa.Enum('CREATED', 'RUNNING', 'COMPLETED', 'FAILED', name='videoprocessingstatus'), nullable=False, comment='Processing status'), - sa.ForeignKeyConstraint(['id'], ['video.id'], name=op.f('fk_extended_video_id_video')), - sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video')) + op.create_table( + "extended_video", + sa.Column("id", sa.String(length=36), nullable=False), + sa.Column( + "duration", sa.Float(), nullable=True, comment="Video duration in seconds" + ), + sa.Column( + "status", + sa.Enum( + "CREATED", + "RUNNING", + "COMPLETED", + "FAILED", + name="videoprocessingstatus", + ), + nullable=False, + comment="Processing status", + ), + sa.ForeignKeyConstraint( + ["id"], ["video.id"], name=op.f("fk_extended_video_id_video") + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video")), ) - op.create_table('extended_video_caption', - sa.Column('id', 
sa.Integer(), nullable=False), - sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), - sa.ForeignKeyConstraint(['id'], ['caption.id'], name=op.f('fk_extended_video_caption_id_caption')), - sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_caption_media_id_extended_video')), - sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_caption')) + op.create_table( + "extended_video_caption", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column( + "media_id", + sa.String(length=36), + nullable=False, + comment="Foreign key to video table", + ), + sa.ForeignKeyConstraint( + ["id"], ["caption.id"], name=op.f("fk_extended_video_caption_id_caption") + ), + sa.ForeignKeyConstraint( + ["media_id"], + ["extended_video.id"], + name=op.f("fk_extended_video_caption_media_id_extended_video"), + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video_caption")), ) - op.create_table('extended_video_transcript', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), - sa.ForeignKeyConstraint(['id'], ['transcript.id'], name=op.f('fk_extended_video_transcript_id_transcript')), - sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_transcript_media_id_extended_video')), - sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_transcript')) + op.create_table( + "extended_video_transcript", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column( + "media_id", + sa.String(length=36), + nullable=False, + comment="Foreign key to video table", + ), + sa.ForeignKeyConstraint( + ["id"], + ["transcript.id"], + name=op.f("fk_extended_video_transcript_id_transcript"), + ), + sa.ForeignKeyConstraint( + ["media_id"], + ["extended_video.id"], + name=op.f("fk_extended_video_transcript_media_id_extended_video"), + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video_transcript")), ) # ### end Alembic commands ### @@ -100,12 +291,12 @@ def upgrade() -> None: def downgrade() -> None: """Downgrade database from this revision to previous.""" # ### commands auto generated by Alembic - please adjust! ### - op.drop_table('extended_video_transcript') - op.drop_table('extended_video_caption') - op.drop_table('extended_video') - op.drop_table('video') - op.drop_table('transcript') - op.drop_table('tasks') - op.drop_table('media') - op.drop_table('caption') + op.drop_table("extended_video_transcript") + op.drop_table("extended_video_caption") + op.drop_table("extended_video") + op.drop_table("video") + op.drop_table("transcript") + op.drop_table("tasks") + op.drop_table("media") + op.drop_table("caption") # ### end Alembic commands ### diff --git a/aana_chat_with_video/core/__init__.py b/aana_chat_with_video/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/aana_chat_with_video/core/prompts/loader.py b/aana_chat_with_video/core/prompts/loader.py index 4bb0d35..1654fc9 100644 --- a/aana_chat_with_video/core/prompts/loader.py +++ b/aana_chat_with_video/core/prompts/loader.py @@ -19,7 +19,8 @@ def get_prompt_template(name: str) -> Template: Returns: Template: The prompt template. 
""" - env = Environment(loader=PackageLoader( - "aana_chat_with_video", "core", "prompts")) + env = Environment( + loader=PackageLoader("aana_chat_with_video.core", "prompts"), autoescape=True + ) template = env.get_template(f"{name}.j2") return template diff --git a/aana_chat_with_video/endpoints/delete_video.py b/aana_chat_with_video/endpoints/delete_video.py index 50c4060..5481cba 100644 --- a/aana_chat_with_video/endpoints/delete_video.py +++ b/aana_chat_with_video/endpoints/delete_video.py @@ -22,6 +22,6 @@ async def initialize(self): self.video_repo = ExtendedVideoRepository(self.session) async def run(self, media_id: MediaId) -> DeleteVideoOutput: - """Delete media.""" + """Delete video.""" self.video_repo.delete(media_id) return DeleteVideoOutput(media_id=media_id) diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py index 2437f28..b6ef17a 100644 --- a/aana_chat_with_video/tests/conftest.py +++ b/aana_chat_with_video/tests/conftest.py @@ -12,9 +12,44 @@ from aana.api.api_generation import Endpoint from aana.configs.db import DbSettings, SQLiteConfig +from aana.exceptions.runtime import EmptyMigrationsException from aana.sdk import AanaSDK from aana.storage.op import DbType +from aana.storage.session import get_session from aana.utils.json import jsonify +from aana_chat_with_video.configs.settings import settings +from aana_chat_with_video.storage.op import ( + run_alembic_migrations as run_app_alembic_migrations, +) + + +@pytest.fixture(scope="function") +def db_session(): + """Creates a new database file and session for each test.""" + tmp_database_path = Path(tempfile.mkstemp(suffix=".db")[1]) + db_config = DbSettings( + datastore_type=DbType.SQLITE, + datastore_config=SQLiteConfig(path=tmp_database_path), + ) + os.environ["DB_CONFIG"] = jsonify(db_config) + + # Reload the settings to update the database path + import aana.configs.settings + import aana_chat_with_video.configs.settings + + importlib.reload(aana.configs.settings) + importlib.reload(aana_chat_with_video.configs.settings) + + # Run migrations to set up the schema + try: + run_app_alembic_migrations(settings) + except EmptyMigrationsException: + print("No versions found in the custom migrations. 
Using default migrations.") + run_app_alembic_migrations(settings) + + # Create a new session + with get_session() as session: + yield session def send_api_request( diff --git a/aana_chat_with_video/tests/test_extended_video_caption_repo.py b/aana_chat_with_video/tests/test_extended_video_caption_repo.py new file mode 100644 index 0000000..7750927 --- /dev/null +++ b/aana_chat_with_video/tests/test_extended_video_caption_repo.py @@ -0,0 +1,123 @@ +# ruff: noqa: S101 + +import random +import uuid + +import pytest + +from aana.core.models.captions import Caption +from aana.exceptions.db import NotFoundException +from aana_chat_with_video.storage.repository.extended_video_caption import ( + ExtendedVideoCaptionRepository, +) + + +@pytest.fixture(scope="function") +def dummy_caption(): + """Creates a dummy caption for testing.""" + caption = Caption(f"This is a caption {uuid.uuid4()}") + frame_id = random.randint(0, 100) # noqa: S311 + timestamp = random.random() # noqa: S311 + return caption, frame_id, timestamp + + +def test_save_caption(db_session, dummy_caption): + """Tests saving a caption.""" + caption, frame_id, timestamp = dummy_caption + model_name = "blip2" + media_id = "test_media_id" + + caption_repo = ExtendedVideoCaptionRepository(db_session) + caption_entity = caption_repo.save( + model_name=model_name, + media_id=media_id, + caption=caption, + frame_id=frame_id, + timestamp=timestamp, + ) + caption_id = caption_entity.id + + caption_entity = caption_repo.read(caption_id) + assert caption_entity.model == model_name + assert caption_entity.media_id == media_id + assert caption_entity.frame_id == frame_id + assert caption_entity.timestamp == timestamp + assert caption_entity.caption == caption + + caption_repo.delete(caption_id) + with pytest.raises(NotFoundException): + caption_repo.read(caption_id) + + +def test_save_all_captions(db_session, dummy_caption): + """Tests saving all captions.""" + captions, frame_ids, timestamps = [], [], [] + for _ in range(3): + caption, frame_id, timestamp = dummy_caption + captions.append(caption) + frame_ids.append(frame_id) + timestamps.append(timestamp) + model_name = "blip2" + media_id = "test_media_id_all" + + caption_repo = ExtendedVideoCaptionRepository(db_session) + caption_entities = caption_repo.save_all( + model_name=model_name, + media_id=media_id, + captions=captions, + timestamps=timestamps, + frame_ids=frame_ids, + ) + assert len(caption_entities) == len(captions) + + caption_ids = [caption_entity.id for caption_entity in caption_entities] + for caption_id, caption, frame_id, timestamp in zip( + caption_ids, captions, frame_ids, timestamps, strict=True + ): + caption_entity = caption_repo.read(caption_id) + + assert caption_entity.model == model_name + assert caption_entity.media_id == media_id + assert caption_entity.frame_id == frame_id + assert caption_entity.timestamp == timestamp + assert caption_entity.caption == caption + + # delete all captions + for caption_id in caption_ids: + caption_repo.delete(caption_id) + with pytest.raises(NotFoundException): + caption_repo.read(caption_id) + + +def test_get_captions(db_session, dummy_caption): + """Tests getting all captions.""" + captions, frame_ids, timestamps = [], [], [] + for _ in range(3): + caption, frame_id, timestamp = dummy_caption + captions.append(caption) + frame_ids.append(frame_id) + timestamps.append(timestamp) + model_name = "blip2" + media_id = "test_media_id_get_captions" + + caption_repo = ExtendedVideoCaptionRepository(db_session) + caption_entities = 
caption_repo.save_all( + model_name=model_name, + media_id=media_id, + captions=captions, + timestamps=timestamps, + frame_ids=frame_ids, + ) + assert len(caption_entities) == len(captions) + + saved_captions = caption_repo.get_captions(model_name, media_id) + + assert saved_captions["captions"] == captions + assert saved_captions["frame_ids"] == frame_ids + assert saved_captions["timestamps"] == timestamps + + # delete all captions + for caption_entity in caption_entities: + caption_repo.delete(caption_entity.id) + with pytest.raises(NotFoundException): + caption_repo.read(caption_entity.id) diff --git a/aana_chat_with_video/tests/test_extended_video_repo.py b/aana_chat_with_video/tests/test_extended_video_repo.py new file mode 100644 index 0000000..4240334 --- /dev/null +++ b/aana_chat_with_video/tests/test_extended_video_repo.py @@ -0,0 +1,76 @@ +# ruff: noqa: S101 + +import uuid +from importlib import resources + +import pytest + +from aana.core.models.video import Video, VideoMetadata +from aana.exceptions.db import MediaIdAlreadyExistsException, NotFoundException +from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus +from aana_chat_with_video.storage.repository.extended_video import ( + ExtendedVideoRepository, +) + + +@pytest.fixture(scope="function") +def dummy_video(): + """Creates a dummy video for testing.""" + media_id = str(uuid.uuid4()) + path = resources.path("aana.tests.files.videos", "squirrel.mp4") + video = Video( + path=path, media_id=media_id, title="Squirrel", description="A squirrel video" + ) + return video + + +def test_save_video(db_session, dummy_video): + """Tests saving a video.""" + video_repo = ExtendedVideoRepository(db_session) + video_repo.save(dummy_video, duration=10) + + video_entity = video_repo.read(dummy_video.media_id) + assert video_entity + assert video_entity.id == dummy_video.media_id + + # Try to save the same video again + with pytest.raises(MediaIdAlreadyExistsException): + video_repo.save(dummy_video) + + video_repo.delete(dummy_video.media_id) + with pytest.raises(NotFoundException): + video_repo.read(dummy_video.media_id) + + +def test_get_metadata(db_session, dummy_video): + """Tests getting video metadata.""" + video_repo = ExtendedVideoRepository(db_session) + video_repo.save(dummy_video, duration=10) + + metadata = video_repo.get_metadata(dummy_video.media_id) + assert isinstance(metadata, VideoMetadata) + assert metadata.title == dummy_video.title + assert metadata.description == dummy_video.description + assert metadata.duration == 10 + + video_repo.delete(dummy_video.media_id) + with pytest.raises(NotFoundException): + video_repo.get_metadata(dummy_video.media_id) + + +def test_status(db_session, dummy_video): + """Tests getting and updating video status.""" + video_repo = ExtendedVideoRepository(db_session) + video_repo.save(dummy_video, duration=10) + + assert video_repo.get_status(dummy_video.media_id) == VideoProcessingStatus.CREATED + + video_repo.update_status(dummy_video.media_id, VideoProcessingStatus.RUNNING) + + assert video_repo.get_status(dummy_video.media_id) == VideoProcessingStatus.RUNNING + + video_repo.delete(dummy_video.media_id) + + with pytest.raises(NotFoundException): + video_repo.get_status(dummy_video.media_id) + video_repo.update_status(dummy_video.media_id, VideoProcessingStatus.COMPLETED) diff --git a/aana_chat_with_video/tests/test_extended_video_transcript_repo.py b/aana_chat_with_video/tests/test_extended_video_transcript_repo.py new file mode 100644 index 
0000000..41fe3cd --- /dev/null +++ b/aana_chat_with_video/tests/test_extended_video_transcript_repo.py @@ -0,0 +1,79 @@ +# ruff: noqa: S101 + +import pytest + +from aana.core.models.asr import AsrSegment, AsrTranscription, AsrTranscriptionInfo +from aana.core.models.time import TimeInterval +from aana.exceptions.db import NotFoundException +from aana_chat_with_video.storage.repository.extended_video_transcript import ( + ExtendedVideoTranscriptRepository, +) + + +@pytest.fixture(scope="function") +def dummy_transcript(): + """Creates a dummy transcript for testing.""" + transcript = AsrTranscription(text="This is a transcript") + segments = [ + AsrSegment(text="This is a segment", time_interval=TimeInterval(start=0, end=1)) + ] + info = AsrTranscriptionInfo(language="en", language_confidence=0.9) + return transcript, segments, info + + +def test_save_transcript(db_session, dummy_transcript): + """Tests saving a transcript.""" + transcript, segments, info = dummy_transcript + model_name = "whisper" + media_id = "test_media_id" + + transcript_repo = ExtendedVideoTranscriptRepository(db_session) + transcript_entity = transcript_repo.save( + model_name=model_name, + media_id=media_id, + transcription_info=info, + transcription=transcript, + segments=segments, + ) + + transcript_id = transcript_entity.id + + transcript_entity = transcript_repo.read(transcript_id) + assert transcript_entity + assert transcript_entity.id == transcript_id + assert transcript_entity.media_id == media_id + assert transcript_entity.model == model_name + assert transcript_entity.transcript == transcript.text + assert len(transcript_entity.segments) == len(segments) + assert transcript_entity.language == info.language + assert transcript_entity.language_confidence == info.language_confidence + + transcript_repo.delete(transcript_id) + with pytest.raises(NotFoundException): + transcript_repo.read(transcript_id) + + +def test_get_transcript(db_session, dummy_transcript): + """Tests getting a transcript.""" + transcript, segments, info = dummy_transcript + model_name = "whisper" + media_id = "test_media_id" + + transcript_repo = ExtendedVideoTranscriptRepository(db_session) + _ = transcript_repo.save( + model_name=model_name, + media_id=media_id, + transcription_info=info, + transcription=transcript, + segments=segments, + ) + + transcript = transcript_repo.get_transcript(model_name, media_id) + assert "transcription" in transcript + assert "segments" in transcript + assert "transcription_info" in transcript + + assert isinstance(transcript["transcription"], AsrTranscription) + assert isinstance(transcript["segments"], list) + assert all(isinstance(s, AsrSegment) for s in transcript["segments"]) + assert isinstance(transcript["transcription_info"], AsrTranscriptionInfo) diff --git a/notebooks/chat_with_video_demo.ipynb b/notebooks/chat_with_video_demo.ipynb new file mode 100644 index 0000000..e7fb757 --- /dev/null +++ b/notebooks/chat_with_video_demo.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Chat with Video Demo\n", + "This notebook demonstrates how to run and use the example Chat with Video application, a multimodal chat application that allows users to upload a video and ask questions about its content 
based on the visual and audio information." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first step is to start the application by running the following command:\n", + "\n", + "```bash\n", + "HF_TOKEN=\"\" CUDA_VISIBLE_DEVICES=\"0\" aana deploy aana_chat_with_video.app:aana_app\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [API documentation](http://127.0.0.1:8000/redoc) describes the available endpoints.\n", + "\n", + "A video can be sent to the indexing endpoint `/video/index_stream`.\n", + "This extracts the audio transcription and frame captions and stores them in the database for later retrieval. The following code snippet shows how to send the video to the indexing endpoint.\n", + "\n", + "The indexing endpoint is a streaming endpoint, which means it responds with a stream of data. Set `stream=True` in the request to get the response as a stream. The endpoint returns captions and the audio transcription as soon as they are available, which allows you to build more responsive applications by showing the captions as they are generated." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'media_id': 'wxN1T1uxQ2g', 'metadata': {'title': 'Everything Everywhere All At Once | Official Trailer HD | A24', 'description': \"SUBSCRIBE: http://bit.ly/A24subscribe\\r\\n\\r\\nA film from Daniels and starring Michelle Yeoh, Ke Huy Quan, Stephanie Hsu, Jenny Slate, Harry Shum Jr., with James Hong and Jamie Lee Curtis. EVERYTHING EVERYWHERE ALL AT ONCE – In Theaters March 25, 2022! #EverythingEverywhere\\r\\n\\r\\nHow to Watch: https://bit.ly/EEAAO-OnDemand\\n\\nRELEASE DATE: March 25, 2022\\r\\nDIRECTOR: Daniels\\r\\nCAST: Michelle Yeoh, Stephanie Hsu, Ke Huy Quan, Jenny Slate, Harry Shum Jr., with James Hong and Jamie Lee Curtis\\r\\n\\r\\nFollow Everything Everywhere on Instagram: https://bit.ly/EverythingEverywhere_IG\\r\\nFollow Everything Everywhere on Twitter: https://bit.ly/EverythingEverywhere_TW\\r\\nLike Everything Everywhere on Facebook: https://bit.ly/EverythingEverywhere_FB\\r\\n\\r\\n------\\r\\n\\r\\nABOUT A24:\\r\\nThe studio behind MOONLIGHT, LADY BIRD, EX MACHINA, THE WITCH, EIGHTH GRADE, HEREDITARY, THE FAREWELL, UNCUT GEMS, MINARI & more.\\r\\n\\r\\nComing Soon: The Tragedy of Macbeth, Everything Everywhere All At Once, X\\r\\n\\r\\nSubscribe to A24's NEWSLETTER: http://bit.ly/A24signup\\r\\nVisit A24 WEBSITE: http://bit.ly/A24filmsdotcom\\r\\nLike A24 on FACEBOOK: http://bit.ly/FBA24\\r\\nFollow A24 on TWITTER: http://bit.ly/TweetA24\\r\\nFollow A24 on INSTAGRAM: http://bit.ly/InstaA24\"}}\n", + "{'transcription': {'text': ' Mrs. Wang. Mrs. Wang. Mrs. Wang, are you with us? I am paying attention. Now you may only see a pile of receipts, but I see a story. I can see where this story is going. It does not look.'}, 'segments': [{'text': ' Mrs. Wang. Mrs. Wang. Mrs. Wang, are you with us? I am paying attention. Now you may only see a pile of receipts, but I see a story. I can see where this story is going. It does not look.', 'time_interval': {'start': 2.929, 'end': 26.798}, 'confidence': None, 'no_speech_confidence': None, 'words': []}], 'info': {'language': 'en', 'language_confidence': 0.98486328125}}\n", + "{'transcription': {'text': \" What's happening? Evelyn, I'm not your husband. I'm another version of us from another universe. 
I'm here because we need your help. I'm very busy today. I don't have time to help you. Across the multiverse, I've seen thousands of Evelyn's.\"}, 'segments': [{'text': \" What's happening? Evelyn, I'm not your husband. I'm another version of us from another universe. I'm here because we need your help. I'm very busy today. I don't have time to help you. Across the multiverse, I've seen thousands of Evelyn's.\", 'time_interval': {'start': 31.99, 'end': 58.709}, 'confidence': None, 'no_speech_confidence': None, 'words': []}], 'info': {'language': 'en', 'language_confidence': 0.98486328125}}\n", + "{'transcription': {'text': \" Evil is spreading throughout the mini-verses. And you... may be your only chance of stopping it. Don't make me fight you. I am really good. I don't believe you. Wow, that was really good.\"}, 'segments': [{'text': \" Evil is spreading throughout the mini-verses. And you... may be your only chance of stopping it. Don't make me fight you. I am really good. I don't believe you. Wow, that was really good.\", 'time_interval': {'start': 67.639, 'end': 93.47}, 'confidence': None, 'no_speech_confidence': None, 'words': []}], 'info': {'language': 'en', 'language_confidence': 0.98486328125}}\n", + "{'transcription': {'text': ' The universe is so much bigger than you realize. Of all the places I could be, I just want to share with you. Remember our mission concerning the fate of every single world of our infinite multiverse.'}, 'segments': [{'text': ' The universe is so much bigger than you realize. Of all the places I could be, I just want to share with you. Remember our mission concerning the fate of every single world of our infinite multiverse.', 'time_interval': {'start': 97.093, 'end': 126.491}, 'confidence': None, 'no_speech_confidence': None, 'words': []}], 'info': {'language': 'en', 'language_confidence': 0.98486328125}}\n", + "{'transcription': {'text': \" There is no way I am the Evelyn you are looking for. Every rejection, every disappointment has led you here. To this moment. Don't let anything distract you from it.\"}, 'segments': [{'text': \" There is no way I am the Evelyn you are looking for. Every rejection, every disappointment has led you here. To this moment. Don't let anything distract you from it.\", 'time_interval': {'start': 128.987, 'end': 148.88}, 'confidence': None, 'no_speech_confidence': None, 'words': []}], 'info': {'language': 'en', 'language_confidence': 0.98486328125}}\n", + "{'captions': [\"a black background with a white clock on it's face and a person standing in front of it in the dark. the clock is on the left side of the image\", \"a washing machine with a black and white image of a person inside it's door frame and a white door handle on the side of the machine. the image is taken from the inside of the machine\", 'a woman is standing in a store with a pink shirt on and a pink purse in her hand. she is looking at the camera with a smile on her face', 'a blurry image of a person sitting on a chair in a room with a chair and a person sitting on the chair next to it. The person is wearing a green shirt and white pants', \"a woman in an office with a stack of papers and a computer in the background - office stock videos & royalty-free footageI'm not sure if this is a good thing or a bad thing.\", 'three people sitting at a desk with papers and papers on the table in front of them. one of the people is talking to the other two people. 
one of the people is wearing a brown shirt and the other is wearing a white shirt', 'a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo design by a2a logo', 'a woman standing in front of a desk full of papers and books on it. the woman is looking at the desk and the papers on it. the woman is looking at the desk and the papers on it'], 'timestamps': [0.0, 3.003000020980835, 3.6700000762939453, 6.25600004196167, 6.631999969482422, 8.550999641418457, 10.928000450134277, 12.512999534606934]}\n", + "{'captions': ['a woman sitting at a table with a laptop and a television in the background, looking at a computer screen and a television screen in the background, with a lot of clutter around her', 'an older woman wearing glasses and a yellow sweater in an office setting with a computer screen in the background and a phone on the desk in front of her face.', 'a woman and two men standing in front of a washing machine in a laundromat with a woman looking at the machines in the background. the woman is wearing a pink shirt and a black jacket', 'a man and woman are walking down a hallway with a bottle of water in their hands and a bottle of water on the floor in front of them. the man is holding a cane', \"a woman is holding onto a man's arm in a hallway with a woman in a blue shirt and a man in a white shirt standing next to her in the background\", \"a woman in a laundromat looking at something on the counter next to her in the background of a scene in a movie scene in the movie 'the great wall'\", 'a woman in a blue shirt is sitting in front of a washing machine in a laundromat. she is looking at the camera. the woman is wearing a denim jacket', 'a woman is standing in a room with a purple bag on the floor next to her and a woman standing behind her with a phone in her hand and a woman standing behind her with a phone in her hand'], 'timestamps': [14.013999938964844, 14.973999977111816, 16.808000564575195, 17.350000381469727, 17.93400001525879, 18.60099983215332, 19.768999099731445, 20.854000091552734]}\n", + "{'captions': ['a woman standing on a chair in a room with a television on the wall and a man standing on a chair in the corner of the room with a bunch of books', 'a woman is standing in front of a car in a parking lot with a blue car in the background and a blue car in the foreground with a blue car in the background', \"a woman is sitting in the driver's seat of a car with her eyes closed and her hands on the steering wheel, looking out the window at something in the distance\", 'a woman is working on a desk with papers and pens on it. a man is sitting at the desk next to her. there is a phone on the desk', 'an older woman wearing glasses and a yellow sweater in an office setting with a computer and papers on the desk in the background. the woman is looking at the camera', 'an asian woman sitting in an office with a headset on her head and a man sitting next to her with a headset on his head and a headset on his head', 'an older woman wearing glasses and a yellow sweater in an office setting with a computer and a phone on the desk in the background. she is looking at something on the screen', 'a woman is talking to a man in an office cubicle with other people around her. 
the woman is wearing a yellow sweater and has a yellow scarf around her neck'], 'timestamps': [21.479000091552734, 21.89699935913086, 22.731000900268555, 23.690000534057617, 24.482999801635742, 25.775999069213867, 26.985000610351562, 27.986000061035156]}\n", + "{'captions': ['a woman is sitting on an office chair and throwing her arms up in the air in the middle of an office space with other people in the background. the woman is wearing a purple shirt and purple pants', 'a woman with long hair and a face covered in blood is standing in a room with a light on in the background. the woman is holding her hands up in the air', 'a woman is looking up at the camera in an office building with a blurry background behind her. the woman is wearing a red shirt and has her hair pulled back', 'a woman is looking at her reflection in a mirror in an office building with a man standing behind her in the background. the woman is wearing a red shirt and has her hair pulled back', 'a woman is looking at a computer screen in an office building with a blurry image of her face on it. the woman is wearing a red shirt and has a red scarf', 'a film from danielles on a black background with the words a film from danielles on it in white letters on it in black letters on it in black letters', 'a man and woman are sitting in an airport terminal together with a man in glasses and a woman in glasses sitting next to them on a bench in the background.', 'a man is talking to another person in a kitchen area with a knife in his hand and a knife in his hand in the background. the man is holding a knife in his hand'], 'timestamps': [29.11199951171875, 31.11400032043457, 31.947999954223633, 33.03300094604492, 38.45500183105469, 38.49700164794922, 42.54199981689453, 44.12699890136719]}\n", + "{'captions': [\"a woman is talking to a man in a kitchen scene in the movie 'the dark tower' 2018-12-10-1-1-1-1\", 'a man and woman standing in a room with shelves and bottles on the wall behind them, one of them is looking at the other one with a smile on their face', \"a man and woman in a room with shelves and bottles on the floor next to them, one of them is touching the other's shoulder and the other is looking at the camera\", 'a person looking at a computer screen in a dark room with a light on it in the background. the person is looking at the screen with a light on it', 'a network of dots and lines on a black background with a red dot in the center of the image. stock footage, animation, video, video stock footage, animation, video stock footage, animation, video stock footage, animation, video stock footage, animation, video stock footage, animation, video stock footage, animation,', 'a woman in red shirt looking at the camera in a movie scene with a red background and a red background behind her and a red background in the background behind her', 'a woman is screaming in the woods with trees in the background and a sun in the sky behind her. the woman is wearing a blue shirt and has her mouth open', 'a woman with her mouth open in a city street scene with a man in a red jacket and a woman in a red jacket standing next to her with her mouth open'], 'timestamps': [45.16999816894531, 46.42100143432617, 49.04899978637695, 52.051998138427734, 53.095001220703125, 53.92900085449219, 54.76300048828125, 54.93000030517578]}\n", + "{'captions': ['a woman in a hat holding a pizza in front of a street sign with a car in the background. 
the woman is smiling and holding a pizza in her hand', 'a woman with her mouth open in a forest scene with trees in the background and a tree in the foreground with a mouth open and a tree in the background with a mouth open', \"a woman is holding a knife in her hand in a scene from the movie 'the handmaid's tale' | hulu | hulu | hulu | hulu\", 'a woman with headphones on her face looking at something on the screen with her eyes closed and her mouth open in a smiley face. the woman is wearing a black shirt and black pants', 'a person standing in a hallway with their back to the camera, and another person standing in the same hallway with their back to the camera, and a third person standing in the hallway with their back to the camera', 'a blurry image of people in a restaurant with a blue blanket on the floor and a man in a blue shirt sitting on the floor in the background of the photo', 'a woman in a chef hat is preparing food in a restaurant kitchen with other people watching her work on the countertop in the background. the woman is wearing a blue apron and a chef hat', 'a woman sitting in the back seat of a car with her eyes closed and a light shining on her face. she is wearing a white dress and has her hair pulled back'], 'timestamps': [55.18000030517578, 55.43000030517578, 55.638999938964844, 57.93299865722656, 58.599998474121094, 59.058998107910156, 60.518001556396484, 61.22800064086914]}\n", + "{'captions': ['a vase with birds and flowers on it is on display in a store window.', 'a woman is holding a white bag in the woods with her hands outstretched and a tree in the background behind her. she is smiling and holding a white bag', \"a woman with long hair and a red shirt is looking at the camera in a scene from the movie 'the witch' [2018] [hd] [720p]\", 'two women are fighting in a dark room with one of them holding a knife to her throat and the other one holding a gun to her head. one of the women is holding a knife to her throat and the other one is holding a gun to her head', 'the man is standing in front of a door in a room with a red sign on it. the man is looking at something on the wall behind him. the man is wearing a black shirt and black pants', 'a man in a police uniform is pointing a gun at someone in a hallway in a movie theater. the man is wearing a blue shirt and black pants. the man is holding a gun', 'a woman in an orange shirt and glasses walking through an office building with a computer screen in the background. the woman is looking at something on the screen. the woman is wearing glasses and a yellow shirt', 'a man in a plaid jacket standing in an office with people around him watching him talk to them on a phone or computer screen. the man is wearing a hat and a black jacket'], 'timestamps': [62.0620002746582, 62.3120002746582, 63.14699935913086, 65.44000244140625, 66.10700225830078, 67.0250015258789, 67.9010009765625, 69.0270004272461]}\n", + "{'captions': [\"a man in a hat standing next to a van with graffiti on it's side door and a cigarette in his mouth. the man is wearing a black hat and black jacket\", 'a man in a hat is sitting in a car with a camera on it and a truck behind him in the background. the truck is driving down the road and the man is driving', 'a man in a hat is holding a cigarette in front of a van with graffiti on it. the man is wearing a black hat and black jacket. the graffiti is on the side of the van', 'a man in a hoodie sitting at a bar with a gun in his hand and a bottle of alcohol in front of him. 
the man is looking at the camera', 'a woman is holding a knife while another woman is standing next to her holding a phone and a phone book in her hand. one of the women is holding a phone and a phone book', 'a man with pink hair talking to a woman in a hallway with a police officer behind him in the background. the man is wearing a police uniform and the woman is wearing a pink wig', \"the woman is being escorted by two police officers in a hallway in the movie 'the man from nowhere' | movieclips.com | movieclips.com\", 'a man with purple hair and a police officer in a hallway with a woman in a pink wig and purple hair on her head and a pink shirt on her back'], 'timestamps': [70.23600006103516, 70.52799987792969, 71.02899932861328, 71.15399932861328, 71.6969985961914, 72.697998046875, 73.74099731445312, 74.53299713134766]}\n", + "{'captions': ['a woman with pink hair standing next to a police officer in a room with confetti falling from the ceiling above her head and a man in a suit standing next to her', 'a woman is sitting in a hallway with her hands on her face and her eyes closed, looking at something on the floor in front of her. she is wearing a red shirt and has dark hair', 'a group of people on an escalator with one man holding a camera and another man holding a camera in the air and another man holding a camera in the air', 'a group of people on a stairway with one man holding a gun and another man holding a knife in his hand and another man holding a gun and another man holding a knife', 'two women are standing in a hallway with a white wall behind them and a door in the background, one of them is pointing at something with her hand and the other is pointing at something with her hand', 'a woman holding a knife in a bathroom with a man standing next to her holding a lighter in his hand and a woman standing next to him holding a lighter in her hand', 'a woman in a pink shirt and white apron is talking to another woman in a white shirt and pink apron in a hospital room with a door open and a plant', 'a group of people in a room with a man in a suit and a woman in a dress standing in front of them with a gun in her hand and a man in a suit standing behind them'], 'timestamps': [75.20099639892578, 75.70099639892578, 77.0770034790039, 78.87100219726562, 79.28800201416016, 80.83100128173828, 82.04000091552734, 82.58200073242188]}\n", + "{'captions': ['a woman is standing in front of a door with smoke coming out of it and her head is down on the floor next to her hand on the phone. 
the woman is looking at something on the phone', 'a blurry image of a man walking down the street with a skateboard in his hand and a skateboard in his hand on the sidewalk in front of him with a car in the background', 'the man and woman are standing in a room with smoke coming out of the windows and the man is looking at the woman with a cigarette in his mouth and the woman is looking at him', 'a group of people dancing in a room with smoke coming out of the windows and people standing in the middle of the room with their arms up in the air and one person dancing', 'a group of people in a room with a knife and a gun in the air, one person is holding a gun and another is holding a knife in the air', 'the man is standing in the smoke with his hands on his head and his hands are in the air as he walks through the smoke in the scene from guardians of the galaxy', 'a group of people dancing in a room with smoke coming out of the windows and a man in a suit holding a gun in his hand and a woman in a white dress', 'a group of people playing video games in a room with a green light on the wall behind them. one person is holding a controller and another is holding a controller'], 'timestamps': [83.54199981689453, 84.16699981689453, 84.95999908447266, 86.46099853515625, 86.75299835205078, 87.46199798583984, 87.83699798583984, 88.62999725341797]}\n", + "{'captions': ['two people in a room with smoke coming out of the windows and a man in the middle of the room with a knife in his hand and a gun in his hand', 'a man and woman are looking at something on a computer screen together in a room with a desk and a chair in the background. the man is wearing glasses and the woman is wearing a pink shirt', \"a person with tattoos on their hands is holding a hand up to someone else's hand in a dark room with a camera in the background. the person is wearing a white dress\", 'a woman with long hair is looking at the camera in a room with a mirror and a door in the background. the woman is pointing at something with her hand', \"a close up of a person's feet with a light shining on them from behind them. stock footage, video, and royalty-free footage, b-roll, and motion graphics\", 'the man in tuxedo is walking down the stairs in a crowded room with people watching him from the side of the room. there is a crowd of people in the background', 'a man in a tuxedo is walking down a hallway with people watching him from behind him and a woman in a dress is walking in front of him with a camera', 'a woman is standing in front of a laundry room with red lanterns hanging from the ceiling and a chinese flag hanging from the wall behind her. the woman is looking at a washing machine'], 'timestamps': [89.75599670410156, 92.21700286865234, 93.302001953125, 94.01100158691406, 94.84500122070312, 98.8489990234375, 99.0989990234375, 99.9749984741211]}\n", + "{'captions': ['a woman in a red dress standing in the woods with trees in the background and fog in the air around her. the woman is wearing a red cloak and has long hair', 'a person with blue nails is touching a metal object in a room with a person standing next to it. the person is wearing a plaid shirt and has a blue nail', 'a woman laying on the floor with her hands on her face and a man standing over her with a knife in his hand and a knife in his hand next to her', 'a blurry image of a person in a city street with their hands outstretched in front of them and a blurry background behind them. 
the image is blurry and blurry', 'a woman with her hands outstretched in the air in a garden scene in a movie scene with a man and a dog in the background, with the words \"the girl with the dragon tattoo\"', 'a man in a suit and tie sitting in the audience with popcorn and a woman in a black dress and glasses watching a movie with popcorn in her hand and a man in a suit and tie sitting next to her', 'a woman is sitting on a desk with papers and a television in the background. the woman is looking at something on the desk. the woman is looking at something on the desk', 'a man in a suit is playing with a remote control in an office building lobby area with a man on the floor and another man standing up in the air with his hands in the air'], 'timestamps': [102.97799682617188, 104.56300354003906, 106.77400207519531, 108.73400115966797, 109.7770004272461, 110.6520004272461, 111.11100006103516, 111.90299987792969]}\n", + "{'captions': ['a man with a beard and a tie is making a face with his mouth open in a room with a lot of people around him. he is wearing a white shirt and a tie', 'two men in suits are fighting in an office building with a man on the floor and a woman on the floor next to him with a knife in her hand and a man in a suit standing next to her', 'a woman peeking over the cubicle wall in an office building with other people in the background. the woman is looking at the camera. the woman is peeking over the cubicle wall', 'a man holding a woman up in the air at night on a street at night with lights on the street behind him and cars driving by him and lights on the street', 'a woman with short hair and a white shirt is looking at something on the countertop in a laundry room with a washing machine in the background. the woman is wearing a white shirt and black pants', 'two women are fighting in a room with a wooden door and a wooden cabinet in the background. one of the women is holding a knife and the other is holding a gun', 'a woman in red jacket dancing in a room with other people in the background. one of them is holding a gun. the other is holding a knife. the woman is wearing a red jacket', 'a woman in a red dress is standing in front of a camera and a man is sitting on a chair in the background of the photo shoot. the photo is blurry'], 'timestamps': [112.65399932861328, 113.27999877929688, 113.98899841308594, 114.947998046875, 115.8239974975586, 116.61699676513672, 117.24199676513672, 117.32599639892578]}\n", + "{'captions': ['a group of people in a room with a camera and a man in the middle of it all holding a camera up to the camera and pointing it at the camera', 'a woman sitting in the back seat of a car at night with her eyes closed and a smile on her face. the woman is looking at something in the distance', \"a woman in red is looking at the camera in a scene from the movie 'the year of the snake' in 2019, china, china, china, china, china, china, china, china, china, china, china, china, china,\", \"a man and woman standing in front of a restaurant window with chinese characters on the wall behind them. the man is holding the woman's hand and she is looking at him\", 'a woman is hugging another woman in front of a laundromat at night time with neon lights on the wall behind them. the woman is wearing a red shirt', 'two people are walking through a building with lights on them and plants in the background. one is holding a flashlight and the other is walking through the building. 
one is holding a flashlight and the other is walking through the building', 'a woman is sitting in a pool with plants around her and a man is standing next to her with a gun in his hand. the woman is looking at the camera', \"a man and woman in a kitchen talking to each other in the dark, with the man looking at the woman's face and the woman looking at the man's face\"], 'timestamps': [117.40899658203125, 117.90899658203125, 118.9520034790039, 119.8280029296875, 120.66200256347656, 121.66300201416016, 122.49700164794922, 124.54100036621094]}\n", + "{'captions': [\"a black background with a white clock on it's face and a black background behind it with a black clock on it's face and a black background behind it with a black clock on it's face\", 'a group of people standing in a church with white robes on them and one person is holding a cross in the air in front of them. they are all wearing white robes', 'two women in traditional clothing are standing in a hallway with red lights on them and a man in a white robe standing behind them with a red light on his head', 'two women in green hoods and white robes are standing in a hallway with green lights on them and a man in a white robe is standing behind them with a green light on his face', 'a dog in a cage with its mouth open and its eyes open wide in the dark room with the light shining on it. the dog is looking at the camera', 'a woman in a chinese costume with a red headpiece and a pink dress on her head and arms outstretched in front of her face with her eyes closed', 'a man is looking at something in a room with a shelf full of items on it. the man is smiling and looking at something on the shelf. the man is wearing a black shirt and has a black beard', 'a woman in green jacket standing in a dark alleyway at night with neon lights behind her and a neon sign in the background that says \"the night is dark and full of terrors\"'], 'timestamps': [125.66699981689453, 128.08599853515625, 129.04600524902344, 131.13099670410156, 132.3820037841797, 133.3000030517578, 134.00900268554688, 135.177001953125]}\n", + "{'captions': ['a man in a suit and tie walking down the street at night with lights in the background. the man is looking at something on the ground. the man is wearing glasses and a suit', 'a woman in an office with a computer and a man standing next to her in the middle of the room with a computer monitor in the background. the woman is standing on a desk', 'a bald man in a white shirt and tie holding a gun in his hand while another man is holding a gun in his hand next to him in a scene from the movie', 'a man in a hat and a gun in a room with other people in it. one of them is holding a gun and the other is holding a gun and a gun', \"an older man is looking at something on his phone in a scene from the movie 'the man from nowhere' | movieclips.com | movieclips.com\", 'a rock with eyes in the desert with a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain in the background and a mountain', 'two women are standing in a room with candles and a lamp on the table in the background. one is holding a candle and the other is standing next to her', 'two women are dancing in a living room with candles lit up around them and a lamp on the table in the background. 
one of the women is wearing a white shirt and the other is wearing a black shirt'], 'timestamps': [135.968994140625, 136.97000122070312, 137.6790008544922, 138.13800048828125, 138.5970001220703, 139.63900756835938, 140.38999938964844, 141.14100646972656]}\n", + "{'captions': ['a woman with short hair and a white shirt is talking to someone in a room with candles and a lamp on the table in the background. the woman is holding a piece of food', \"a woman with a blue headband is looking at the camera in a scene from the movie 'the girl with the dragon tattoo' | movie stills | movie stills\", 'a woman is standing in front of a washing machine in a dark room with a man in the background looking at her backside. the woman is wearing a red shirt and black pants', 'a man in glasses is standing in a crowded room with other people in the background. he is looking at something on the ground. he is wearing a red sweater and a red tie', 'the man is holding a gun in his mouth in the movie, ghost in the shell 2: Innocence trailer 2, 2018, 1080p, 60fps', 'two women in traditional clothing are standing next to each other in a hallway with a woman in a white dress and a man in a white hat on the left side', 'apple and red apple in the air in a dark room with a blue ceiling and a blue wall behind it. the apple is upside down and the red apple is on top of it', 'an image of two planets in space with a bright light shining on them from behind them and a star in the background. stock video footage and royalty-free stock footage'], 'timestamps': [141.30799865722656, 142.01699829101562, 142.5590057373047, 143.05999755859375, 143.43499755859375, 143.8520050048828, 144.1439971923828, 144.85299682617188]}\n", + "{'captions': [\"a black background with a white clock on it's face and a person standing in front of it in the dark. the clock is on the left side of the image\", \"a woman with blood on her face and a knife in her hand in a scene from the movie, 'the grudge' - screenshot from the movie 'the grudge'\", \"a woman in a red jacket is looking at the camera in a city street scene in the movie 'the secret life of pets' - screenshot 1/2 - 0\", 'an asian woman in a blue shirt is looking at the camera in a scene from star trek online 2.0 trailer 2 - screenshot 1 - screenshot 2 - screenshot 3', 'a woman in a headscarf and another woman in a white dress and headscarf in the background of the scene with the woman in the headscarf', 'an asian woman in an office with headphones on and a red light on her face looking at the camera with a surprised look on her face. the woman is wearing a vest and a red shirt', 'a woman in a dark room with a light shining on her face and a man in the background looking at her with a smile on his face and a light shining on his face', 'a man in a tank top is looking at the camera in a dark room with a light bulb above him. he is wearing a black tank top and black shorts'], 'timestamps': [144.97799682617188, 149.4409942626953, 150.31700134277344, 150.44200134277344, 150.56700134277344, 150.69200134277344, 150.7760009765625, 150.85899353027344]}\n", + "{'captions': ['a woman with black hair standing in front of a city street with neon lights and people in the background. the woman is looking at the camera with a sad expression', 'two women in traditional clothing are standing in a hallway with other people in the background. 
one woman is wearing a head scarf and the other is wearing a head scarf and a veil', 'a woman is looking at something in the dark with candles in the background and a candle in her hand in the foreground. the woman is wearing a white shirt and has short hair', 'a woman with long black hair is looking at the camera in a red jacket and black shirt with trees in the background and water in the foreground and a lake in the background', 'a woman with her eyes closed in front of fire and smoke coming out of her mouth and eyes open in the air above her head and a red jacket on her', 'a woman with her eyes closed in front of a fire and smoke coming out of her mouth and eyes open in the air above her head and her hair is red', 'two people wearing gas masks and hoods in a room with a wall behind them and a window in front of them. one person is looking at the camera and the other is looking away', 'a woman wearing a hat and holding a pizza on the street corner with a truck behind her. the woman is holding a pizza in her hand and wearing a hat'], 'timestamps': [150.9429931640625, 151.0679931640625, 151.1929931640625, 151.4010009765625, 151.48500061035156, 151.5679931640625, 151.6929931640625, 151.8179931640625]}\n", + "{'captions': ['a woman is looking up at the camera while standing in front of trees and trees are in the background of the shot. the woman is wearing a blue shirt and has her eyes closed', 'a woman in a chinese costume singing and dancing in front of a mirror with a red headpiece on her head and a red dress on her body with a red headband', \"a woman in a red jacket is looking at the camera in a city street scene in a movie trailer for the movie 'the secret life of pets' - screenshot\", 'two women in traditional clothing are standing in a hallway with other people in the background. 
one woman is wearing a head scarf and the other is wearing a head scarf and a veil', 'a man and woman standing in front of a red door with a man in a red shirt and a woman in a red sweater standing next to him in the background', 'a white and black background with a black and white image of a bird flying in the air with a note on it in the middle of the image with a white background', 'everything everywhere - the video game - screenshot thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail thumbnail'], 'timestamps': [152.02699279785156, 152.23500061035156, 152.48599243164062, 152.6529998779297, 152.81900024414062, 153.0279998779297, 153.9040069580078]}\n", + "{'transcription_id': 1, 'caption_ids': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159]}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "import requests\n", + "\n", + "video = {\n", + " \"url\": \"https://www.youtube.com/watch?v=wxN1T1uxQ2g\", # Video URL, Aana SDK supports URLs (including YouTube), file paths or even raw video data\n", + " \"media_id\": \"wxN1T1uxQ2g\", # Media ID, so we can ask questions about the video later by using this ID\n", + "}\n", + "\n", + "data = {\n", + " \"video_params\": {\n", + " \"fast_mode_enabled\": True, # Enable fast mode, which only processes keyframes\n", + " },\n", + " \"video\": video,\n", + "}\n", + "\n", + "url = \"http://127.0.0.1:8000/video/index_stream\"\n", + "response = requests.post(url, data={\"body\": json.dumps(data)}, stream=True)\n", + "\n", + "for chunk in response.iter_content(chunk_size=None):\n", + " json_data = json.loads(chunk)\n", + " if \"error\" in json_data:\n", + " print(json_data[\"error\"])\n", + " print(json_data[\"message\"])\n", + " print(json_data[\"stacktrace\"])\n", + " break\n", + " else:\n", + " print(json_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the video is indexed, you can start asking questions about the video content. The following code snippet shows how to send a question to the `/video/chat_stream` endpoint. The endpoint is also a streaming endpoint and returns tokens of the answer as it's being generated. This allows you to show the answer as it's being generated for a more responsive user experience." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Based on the analysis, this video appears to be a trailer for a film, specifically the official trailer for \"Everything Everywhere All At Once\" directed by Daniel Kwan and Daniel Scheinert. The visuals depict a multitude of scenes showcasing various characters and settings, which are likely meant to represent different parallel universes or alternate realities. The audio transcript contains cryptic and surreal statements that seem to tie into the film's theme of exploring the multiverse and alternate realities. The trailer appears to be building up to a mysterious and intriguing narrative that explores the idea of multiple versions of a person and the consequences of their choices." + ] + } + ], + "source": [ + "media_id = video[\"media_id\"]\n", + "\n", + "question = (\n", + " \"What is happening in this video? \"\n", + " \"Focus on highlighting the story, the narrative, emotionally and thematically significant aspects. \"\n", + " \"Do not use bulleted list in your answer.\"\n", + ")\n", + "\n", + "data = {\n", + " \"question\": question,\n", + " \"media_id\": media_id,\n", + " \"sampling_params\": {\n", + " \"temperature\": 0.9,\n", + " \"max_tokens\": 1024,\n", + " },\n", + "}\n", + "\n", + "url = \"http://127.0.0.1:8000/video/chat_stream\"\n", + "\n", + "response = requests.post(url, data={\"body\": json.dumps(data)}, stream=True)\n", + "for chunk in response.iter_content(chunk_size=None):\n", + " chunk_dict = json.loads(chunk)\n", + " if \"error\" in chunk_dict:\n", + " print(chunk_dict[\"error\"])\n", + " print(chunk_dict[\"message\"])\n", + " print(chunk_dict[\"stacktrace\"])\n", + " break\n", + " token = chunk_dict[\"completion\"]\n", + " print(token, end=\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aana-vIr3-B0u-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index ce6d657..6953766 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "aana_chat_with_video" version = "0.1.0" -description = "" -authors = ["mobiusml"] +description = "A multimodal chat application that allows users to upload a video and ask questions about the video content based on the visual and audio information" +authors = ["Mobius Labs GmbH "] readme = "README.md" [tool.poetry.dependencies] python = "^3.10" - +aana = { git = "https://github.com/mobiusml/aana_sdk.git", branch = "remove_projects" } [tool.poetry.group.dev.dependencies] ipykernel = "^6.29.4" From 0f6af772d482897bece8b2c6d2a059b81082a953 Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Mon, 29 Jul 2024 10:44:29 +0000 Subject: [PATCH 6/9] Fixed tests --- .vscode/settings.json | 13 +++++++++++++ aana_chat_with_video/tests/conftest.py | 8 +++++--- 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..0e36a60 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,13 @@ +{ + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.packageIndexDepths": [ + { + "name": "aana", + "depth": 10, + } + ], +} \ No newline at end of file diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py index b6ef17a..7cba801 100644 --- a/aana_chat_with_video/tests/conftest.py +++ b/aana_chat_with_video/tests/conftest.py @@ -8,8 +8,6 @@ import pytest import requests -from pydantic import ValidationError - from aana.api.api_generation import Endpoint from aana.configs.db import DbSettings, SQLiteConfig from aana.exceptions.runtime import EmptyMigrationsException @@ -17,7 +15,8 @@ from aana.storage.op import DbType from aana.storage.session import get_session from aana.utils.json import jsonify -from aana_chat_with_video.configs.settings import settings +from pydantic import ValidationError + from aana_chat_with_video.storage.op import ( run_alembic_migrations as run_app_alembic_migrations, ) @@ -35,11 +34,14 @@ def db_session(): # Reload the settings to update the database path import aana.configs.settings + import aana_chat_with_video.configs.settings importlib.reload(aana.configs.settings) importlib.reload(aana_chat_with_video.configs.settings) + from aana_chat_with_video.configs.settings import settings + # Run migrations to set up the schema try: run_app_alembic_migrations(settings) From fba88ebd9bf08769fb8a4425c3da00ee7ec4051d Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Mon, 29 Jul 2024 11:50:22 +0000 Subject: [PATCH 7/9] Fixed engine in db session fixture --- aana_chat_with_video/tests/conftest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py index 7cba801..a82a103 100644 --- a/aana_chat_with_video/tests/conftest.py +++ b/aana_chat_with_video/tests/conftest.py @@ -8,15 +8,15 @@ import pytest import requests +from pydantic import ValidationError +from sqlalchemy.orm import Session + from aana.api.api_generation import Endpoint from aana.configs.db import DbSettings, SQLiteConfig from aana.exceptions.runtime import EmptyMigrationsException from aana.sdk import AanaSDK from aana.storage.op import DbType -from aana.storage.session import get_session from aana.utils.json import jsonify -from pydantic import ValidationError - from aana_chat_with_video.storage.op import ( run_alembic_migrations as run_app_alembic_migrations, ) @@ -34,7 +34,6 @@ def db_session(): # Reload the settings to update the database path import aana.configs.settings - import aana_chat_with_video.configs.settings importlib.reload(aana.configs.settings) @@ -50,7 +49,8 @@ def db_session(): run_app_alembic_migrations(settings) # Create a new session - with get_session() as session: + engine = settings.db_config.get_engine() + with Session(engine) as session: yield session From 37fe8136f8db43ba868f8f6c3032a9c1598d7e09 Mon Sep 17 00:00:00 2001 From: Aleksandr Movchan Date: Mon, 29 Jul 2024 13:45:09 +0000 Subject: [PATCH 8/9] Reuse more functionality of aana SDK --- ...69bf375e7_init.py => 5ad873484aa3_init.py} | 72 +---------- .../d93a90261ee5_added_extended_video.py | 65 ++++++++++ aana_chat_with_video/storage/op.py | 54 +-------- aana_chat_with_video/tests/conftest.py | 112 +----------------- 4 files changed, 77 insertions(+), 226 deletions(-) rename aana_chat_with_video/alembic/versions/{7da69bf375e7_init.py => 5ad873484aa3_init.py} (76%) create mode 100644 
aana_chat_with_video/alembic/versions/d93a90261ee5_added_extended_video.py diff --git a/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py b/aana_chat_with_video/alembic/versions/5ad873484aa3_init.py similarity index 76% rename from aana_chat_with_video/alembic/versions/7da69bf375e7_init.py rename to aana_chat_with_video/alembic/versions/5ad873484aa3_init.py index df110da..8983342 100644 --- a/aana_chat_with_video/alembic/versions/7da69bf375e7_init.py +++ b/aana_chat_with_video/alembic/versions/5ad873484aa3_init.py @@ -1,8 +1,8 @@ """init. -Revision ID: 7da69bf375e7 +Revision ID: 5ad873484aa3 Revises: -Create Date: 2024-07-25 13:28:12.907560 +Create Date: 2024-07-25 13:09:44.450321 """ from collections.abc import Sequence @@ -11,7 +11,7 @@ from alembic import op # revision identifiers, used by Alembic. -revision: str = "7da69bf375e7" +revision: str = "5ad873484aa3" down_revision: str | None = None branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None @@ -222,78 +222,12 @@ def upgrade() -> None: sa.ForeignKeyConstraint(["id"], ["media.id"], name=op.f("fk_video_id_media")), sa.PrimaryKeyConstraint("id", name=op.f("pk_video")), ) - op.create_table( - "extended_video", - sa.Column("id", sa.String(length=36), nullable=False), - sa.Column( - "duration", sa.Float(), nullable=True, comment="Video duration in seconds" - ), - sa.Column( - "status", - sa.Enum( - "CREATED", - "RUNNING", - "COMPLETED", - "FAILED", - name="videoprocessingstatus", - ), - nullable=False, - comment="Processing status", - ), - sa.ForeignKeyConstraint( - ["id"], ["video.id"], name=op.f("fk_extended_video_id_video") - ), - sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video")), - ) - op.create_table( - "extended_video_caption", - sa.Column("id", sa.Integer(), nullable=False), - sa.Column( - "media_id", - sa.String(length=36), - nullable=False, - comment="Foreign key to video table", - ), - sa.ForeignKeyConstraint( - ["id"], ["caption.id"], name=op.f("fk_extended_video_caption_id_caption") - ), - sa.ForeignKeyConstraint( - ["media_id"], - ["extended_video.id"], - name=op.f("fk_extended_video_caption_media_id_extended_video"), - ), - sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video_caption")), - ) - op.create_table( - "extended_video_transcript", - sa.Column("id", sa.Integer(), nullable=False), - sa.Column( - "media_id", - sa.String(length=36), - nullable=False, - comment="Foreign key to video table", - ), - sa.ForeignKeyConstraint( - ["id"], - ["transcript.id"], - name=op.f("fk_extended_video_transcript_id_transcript"), - ), - sa.ForeignKeyConstraint( - ["media_id"], - ["extended_video.id"], - name=op.f("fk_extended_video_transcript_media_id_extended_video"), - ), - sa.PrimaryKeyConstraint("id", name=op.f("pk_extended_video_transcript")), - ) # ### end Alembic commands ### def downgrade() -> None: """Downgrade database from this revision to previous.""" # ### commands auto generated by Alembic - please adjust! ### - op.drop_table("extended_video_transcript") - op.drop_table("extended_video_caption") - op.drop_table("extended_video") op.drop_table("video") op.drop_table("transcript") op.drop_table("tasks") diff --git a/aana_chat_with_video/alembic/versions/d93a90261ee5_added_extended_video.py b/aana_chat_with_video/alembic/versions/d93a90261ee5_added_extended_video.py new file mode 100644 index 0000000..cb187e8 --- /dev/null +++ b/aana_chat_with_video/alembic/versions/d93a90261ee5_added_extended_video.py @@ -0,0 +1,65 @@ +"""added extended video. 
+ +Revision ID: d93a90261ee5 +Revises: 5ad873484aa3 +Create Date: 2024-07-29 12:41:04.976640 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = 'd93a90261ee5' +down_revision: str | None = '5ad873484aa3' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade database to this revision from previous.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('extended_video', + sa.Column('id', sa.String(length=36), nullable=False), + sa.Column('duration', sa.Float(), nullable=True, comment='Video duration in seconds'), + sa.Column('status', sa.Enum('CREATED', 'RUNNING', 'COMPLETED', 'FAILED', name='videoprocessingstatus'), nullable=False, comment='Processing status'), + sa.ForeignKeyConstraint(['id'], ['video.id'], name=op.f('fk_extended_video_id_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video')) + ) + op.create_table('extended_video_caption', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), + sa.ForeignKeyConstraint(['id'], ['caption.id'], name=op.f('fk_extended_video_caption_id_caption')), + sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_caption_media_id_extended_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_caption')) + ) + op.create_table('extended_video_transcript', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), + sa.ForeignKeyConstraint(['id'], ['transcript.id'], name=op.f('fk_extended_video_transcript_id_transcript')), + sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_transcript_media_id_extended_video')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_transcript')) + ) + with op.batch_alter_table('tasks', schema=None) as batch_op: + batch_op.alter_column('id', + existing_type=sa.NUMERIC(), + type_=sa.UUID(), + existing_nullable=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade database from this revision to previous.""" + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table('tasks', schema=None) as batch_op: + batch_op.alter_column('id', + existing_type=sa.UUID(), + type_=sa.NUMERIC(), + existing_nullable=False) + + op.drop_table('extended_video_transcript') + op.drop_table('extended_video_caption') + op.drop_table('extended_video') + # ### end Alembic commands ### diff --git a/aana_chat_with_video/storage/op.py b/aana_chat_with_video/storage/op.py index e2cea44..c6fc0df 100644 --- a/aana_chat_with_video/storage/op.py +++ b/aana_chat_with_video/storage/op.py @@ -1,55 +1,9 @@ -from pathlib import Path - -from alembic import command -from alembic.config import Config - -from aana.exceptions.runtime import EmptyMigrationsException - - -def get_alembic_config( - app_config, ini_file_path: Path, alembic_data_path: Path -) -> Config: - """Produces an alembic config to run migrations programmatically.""" - engine = app_config.db_config.get_engine() - alembic_config = Config(ini_file_path) - alembic_config.set_main_option("script_location", str(alembic_data_path)) - config_section = alembic_config.get_section(alembic_config.config_ini_section, {}) - config_section["sqlalchemy.url"] = engine.url - - return alembic_config +from aana.storage.op import run_alembic_migrations as run_alembic_migrations_aana +from aana.utils.core import get_module_dir def run_alembic_migrations(settings): """Runs alembic migrations before starting up.""" - # We need the path to aana/alembic and aana/alembic.ini - # This is a hack until we need something better. - current_path = Path(__file__) - aana_app_root = current_path.parent.parent # go up two directories - ini_file_path = aana_app_root / "alembic.ini" - alembic_data_path = aana_app_root / "alembic" - if not alembic_data_path.exists(): - raise RuntimeError("Alembic directory does not exist.") # noqa: TRY003 - versions_path = alembic_data_path / "versions" - # Check if the versions directory is empty (no .py files) - if not versions_path.exists() or not any(Path(versions_path).glob("*.py")): - raise EmptyMigrationsException() - - alembic_config = get_alembic_config(settings, ini_file_path, alembic_data_path) - engine = settings.db_config.get_engine() - with engine.begin() as connection: - alembic_config.attributes["connection"] = connection - command.upgrade(alembic_config, "head") - - -def drop_all_tables(settings): - """Drops all tables in the database.""" - # TODO: only allow this in testing mode - current_path = Path(__file__) - aana_app_root = current_path.parent.parent # go up two directories - ini_file_path = aana_app_root / "alembic.ini" - alembic_data_path = aana_app_root / "alembic" - if not alembic_data_path.exists(): - raise RuntimeError("Alembic directory does not exist.") # noqa: TRY003 + root_path = get_module_dir("aana_chat_with_video") - alembic_config = get_alembic_config(settings, ini_file_path, alembic_data_path) - command.downgrade(alembic_config, "base") + run_alembic_migrations_aana(settings, root_path) diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py index a82a103..d5aa0b4 100644 --- a/aana_chat_with_video/tests/conftest.py +++ b/aana_chat_with_video/tests/conftest.py @@ -1,20 +1,14 @@ # ruff: noqa: S101 import importlib -import json import os import tempfile from pathlib import Path -from typing import Any import pytest -import requests -from pydantic import ValidationError from sqlalchemy.orm import Session -from aana.api.api_generation import Endpoint from aana.configs.db import DbSettings, SQLiteConfig from aana.exceptions.runtime 
-from aana.sdk import AanaSDK
 from aana.storage.op import DbType
 from aana.utils.json import jsonify
 from aana_chat_with_video.storage.op import (
@@ -54,106 +48,10 @@ def db_session():
     yield session


-def send_api_request(
-    endpoint: Endpoint,
-    app: AanaSDK,
-    data: dict[str, Any],
-    timeout: int = 30,
-) -> dict[str, Any] | list[dict[str, Any]]:
-    """Call an endpoint, handling both streaming and non-streaming responses."""
-    url = f"http://localhost:{app.port}{endpoint.path}"
-    payload = {"body": json.dumps(data)}
-
-    if endpoint.is_streaming_response():
-        output = []
-        with requests.post(url, data=payload, timeout=timeout, stream=True) as r:
-            for chunk in r.iter_content(chunk_size=None):
-                chunk_output = json.loads(chunk.decode("utf-8"))
-                output.append(chunk_output)
-                if "error" in chunk_output:
-                    return [chunk_output]
-        return output
-    else:
-        response = requests.post(url, data=payload, timeout=timeout)
-        return response.json()
-
-
-def verify_output(
-    endpoint: Endpoint,
-    response: dict[str, Any] | list[dict[str, Any]],
-    expected_error: str | None = None,
-) -> None:
-    """Verify the output of an endpoint call."""
-    is_streaming = endpoint.is_streaming_response()
-    ResponseModel = endpoint.get_response_model()
-    if expected_error:
-        error = response[0]["error"] if is_streaming else response["error"]
-        assert error == expected_error, response
-    else:
-        try:
-            if is_streaming:
-                for item in response:
-                    ResponseModel.model_validate(item, strict=True)
-            else:
-                ResponseModel.model_validate(response, strict=True)
-        except ValidationError as e:
-            raise AssertionError(  # noqa: TRY003
-                f"Validation failed. Errors:\n{e}\n\nResponse: {response}"
-            ) from e
-
-
 @pytest.fixture(scope="module")
-def app_setup():
-    """Setup Ray Serve app for testing."""
-    # Create a temporary database for testing
-    tmp_database_path = Path(tempfile.mkstemp(suffix=".db")[1])
-    db_config = DbSettings(
-        datastore_type=DbType.SQLITE,
-        datastore_config=SQLiteConfig(path=tmp_database_path),
-    )
-    os.environ["DB_CONFIG"] = jsonify(db_config)
-
-    # Reload the settings to update the database path
-    import aana.configs.settings
-
-    importlib.reload(aana.configs.settings)
-
-    # Start the app
-    from aana_chat_with_video.app import aana_app
-
-    aana_app.connect(port=8000, show_logs=True, num_cpus=10)
-    aana_app.migrate()
-    aana_app.deploy()
-
-    yield aana_app
-
+def app_setup(app_factory):
+    """Setup app for testing."""
+    app, tmp_database_path = app_factory("aana_chat_with_video.app", "aana_app")
+    yield app
     tmp_database_path.unlink()
-    aana_app.shutdown()
-
-
-@pytest.fixture(scope="module")
-def call_endpoint(app_setup):
-    """Call an endpoint and verify the output."""
-    aana_app: AanaSDK = app_setup
-
-    def _call_endpoint(
-        endpoint_path: str,
-        data: dict[str, Any],
-        expected_error: str | None = None,
-    ) -> dict[str, Any] | list[dict[str, Any]]:
-        endpoint = next(
-            (e for e in aana_app.endpoints.values() if e.path == endpoint_path), None
-        )
-        if endpoint is None:
-            raise ValueError(f"Endpoint with path {endpoint_path} not found")  # noqa: TRY003
-
-        response = send_api_request(endpoint=endpoint, app=aana_app, data=data)
-        verify_output(
-            endpoint=endpoint,
-            response=response,
-            expected_error=expected_error,
-        )
-
-        return response
-
-    return _call_endpoint
+    app.shutdown()

From 3409a746592c1fa45210b3b9dbdf3af0b76e353a Mon Sep 17 00:00:00 2001
From: Aleksandr Movchan
Date: Tue, 30 Jul 2024 10:55:06 +0000
Subject: [PATCH 9/9] Updated test fixtures

---
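Reviewer note (not part of the commit): the reworked db_session fixture below stops reloading the settings modules. Instead, it assigns the temporary DB_CONFIG to both the app settings and the framework settings and clears their cached engines (db_config._engine = None), so each test gets a fresh SQLite engine. On EmptyMigrationsException it now falls back to the framework's default run_alembic_migrations rather than retrying the app-level migrations.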
aana_chat_with_video/tests/conftest.py | 58 +++++++++++++------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/aana_chat_with_video/tests/conftest.py b/aana_chat_with_video/tests/conftest.py index d5aa0b4..1e26a8b 100644 --- a/aana_chat_with_video/tests/conftest.py +++ b/aana_chat_with_video/tests/conftest.py @@ -1,16 +1,17 @@ # ruff: noqa: S101 -import importlib import os import tempfile -from pathlib import Path import pytest from sqlalchemy.orm import Session from aana.configs.db import DbSettings, SQLiteConfig +from aana.configs.settings import settings as aana_settings from aana.exceptions.runtime import EmptyMigrationsException -from aana.storage.op import DbType +from aana.storage.op import DbType, run_alembic_migrations +from aana.tests.conftest import app_factory, call_endpoint # noqa: F401 from aana.utils.json import jsonify +from aana_chat_with_video.configs.settings import settings from aana_chat_with_video.storage.op import ( run_alembic_migrations as run_app_alembic_migrations, ) @@ -19,33 +20,30 @@ @pytest.fixture(scope="function") def db_session(): """Creates a new database file and session for each test.""" - tmp_database_path = Path(tempfile.mkstemp(suffix=".db")[1]) - db_config = DbSettings( - datastore_type=DbType.SQLITE, - datastore_config=SQLiteConfig(path=tmp_database_path), - ) - os.environ["DB_CONFIG"] = jsonify(db_config) - - # Reload the settings to update the database path - import aana.configs.settings - import aana_chat_with_video.configs.settings - - importlib.reload(aana.configs.settings) - importlib.reload(aana_chat_with_video.configs.settings) - - from aana_chat_with_video.configs.settings import settings - - # Run migrations to set up the schema - try: - run_app_alembic_migrations(settings) - except EmptyMigrationsException: - print("No versions found in the custom migrations. Using default migrations.") - run_app_alembic_migrations(settings) - - # Create a new session - engine = settings.db_config.get_engine() - with Session(engine) as session: - yield session + with tempfile.NamedTemporaryFile(dir=settings.tmp_data_dir) as tmp: + db_config = DbSettings( + datastore_type=DbType.SQLITE, + datastore_config=SQLiteConfig(path=tmp.name), + ) + os.environ["DB_CONFIG"] = jsonify(db_config) + + settings.db_config = db_config + settings.db_config._engine = None + aana_settings.db_config = db_config + aana_settings.db_config._engine = None + + try: + run_app_alembic_migrations(settings) + except EmptyMigrationsException: + print( + "No versions found in the custom migrations. Using default migrations." + ) + run_alembic_migrations(settings) + + # Create a new session + engine = settings.db_config.get_engine() + with Session(engine) as session: + yield session @pytest.fixture(scope="module")
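A quick way to sanity-check the reworked db_session fixture together with revision d93a90261ee5 is a test along the following lines. This is a minimal sketch, not part of the patch series; it assumes only that the fixture yields a session bound to the freshly migrated SQLite engine, and it uses table names taken from the migration above.

# ruff: noqa: S101
from sqlalchemy import inspect


def test_migrations_create_extended_video_tables(db_session):
    """Sketch: the migrated database should contain the new tables."""
    # db_session is bound to a freshly migrated SQLite database, so the
    # inspector should report the tables created by upgrade() above.
    inspector = inspect(db_session.get_bind())
    tables = set(inspector.get_table_names())
    assert {
        "extended_video",
        "extended_video_caption",
        "extended_video_transcript",
    } <= tables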