From b8cb41384281acb278841331735ff4a4dd9d7ffd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 18 Feb 2025 08:53:01 -0600 Subject: [PATCH 01/49] Add buffered consumer runtime implementation --- .../replays/consumers/buffered/__init__.py | 0 .../replays/consumers/buffered/consumer.py | 45 +++++ src/sentry/replays/consumers/buffered/lib.py | 100 +++++++++++ .../replays/consumers/buffered/platform.py | 158 ++++++++++++++++++ 4 files changed, 303 insertions(+) create mode 100644 src/sentry/replays/consumers/buffered/__init__.py create mode 100644 src/sentry/replays/consumers/buffered/consumer.py create mode 100644 src/sentry/replays/consumers/buffered/lib.py create mode 100644 src/sentry/replays/consumers/buffered/platform.py diff --git a/src/sentry/replays/consumers/buffered/__init__.py b/src/sentry/replays/consumers/buffered/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py new file mode 100644 index 00000000000000..761873edc1318a --- /dev/null +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -0,0 +1,45 @@ +import dataclasses +import time + +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime + + +class Flusher: + + def __init__(self, flags: dict[str, str]) -> None: + self.max_buffer_length = int(flags["max_buffer_length"]) + self.max_buffer_wait = int(flags["max_buffer_wait"]) + self.__last_flushed_at = time.time() + + def can_flush(self) -> bool: + return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at + + def do_flush(self) -> None: + self.__last_flushed_at = time.time() + # TODO: upload the bytes or something. + + +@dataclasses.dataclass +class Item: + recording: bytes + + +def init(flags: dict[str, str]) -> Model[Item]: + flusher = Flusher(flags) + return Model( + buffer=[], + can_flush=flusher.can_flush, + do_flush=flusher.do_flush, + offsets={}, + retries=0, + ) + + +def process_message(message: bytes) -> Item: + return Item(recording=message) + + +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py new file mode 100644 index 00000000000000..7aece6f5012d8e --- /dev/null +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -0,0 +1,100 @@ +"""Buffered RunTime implementation.""" + +from collections.abc import Callable, MutableMapping +from dataclasses import dataclass +from functools import partial +from typing import Generic, TypeVar + +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.platform import Cmd, Commit, Nothing, RunTime, Task + +Item = TypeVar("Item") +T = TypeVar("T") + + +@dataclass +class Model(Generic[Item]): + buffer: list[Item] + can_flush: Callable[[], bool] + do_flush: Callable[[], None] + offsets: MutableMapping[Partition, int] + retries: int + + +class Msg(Generic[Item]): + pass + + +@dataclass(frozen=True) +class Append(Msg[Item]): + item: Item + offset: MutableMapping[Partition, int] + + +class Committed(Msg[Item]): + pass + + +class Flush(Msg[Item]): + pass + + +def process( + process_fn: Callable[[bytes], Item], + model: Model[Item], + message: bytes, + offset: MutableMapping[Partition, int], +) -> Msg[Item] | None: + return Append(item=process_fn(message), offset=offset) + + +def init( + init_fn: Callable[[dict[str, str]], Model[Item]], + flags: dict[str, str], +) -> tuple[Model[Item], Cmd[Msg[Item]]]: + 
return (init_fn(flags), Nothing()) + + +def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: + match msg: + case Append(item=item, offset=offset): + model.buffer.append(item) + model.offsets.update(offset) + if model.can_flush(): + return (model, Task(msg=Flush())) + else: + return (model, None) + case Flush(): + # What should happen if we fail? If you raise an exception the platform will restart + # from the last checkpoint -- which is standard behavior. We could be more clever here + # and provide error handling facilities or we could accept that this problem gets too + # complicated to reasonably abstract and have developers implement their own buffering + # consumer. + model.do_flush() + model.buffer = [] + return (model, Commit(msg=Committed(), offsets=model.offsets)) + case Committed(): + return (model, None) + + # Satisfy mypy. Apparently we don't do exhaustiveness checks in our configuration. + return (model, None) + + +def subscription(model: Model[Item]) -> Msg[Item] | None: + if model.can_flush(): + return Flush() + else: + return None + + +def buffering_runtime( + init_fn: Callable[[dict[str, str]], Model[Item]], + process_fn: Callable[[bytes], Item], +) -> RunTime[Model[Item], Msg[Item]]: + return RunTime( + init=partial(init, init_fn), + process=partial(process, process_fn), + subscription=subscription, + update=update, + ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py new file mode 100644 index 00000000000000..a1c93dd9156199 --- /dev/null +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -0,0 +1,158 @@ +from collections.abc import Callable, Mapping, MutableMapping +from dataclasses import dataclass +from datetime import datetime +from typing import Generic, TypeVar + +from arroyo.backends.kafka.consumer import KafkaPayload +from arroyo.processing.strategies import ( + CommitOffsets, + ProcessingStrategy, + ProcessingStrategyFactory, +) +from arroyo.types import Commit as ArroyoCommit +from arroyo.types import Message, Partition, Value + + +class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): + + def __init__(self, flags: dict[str, str], runtime: "RunTime[Model, Msg]") -> None: + self.flags = flags + self.runtime = runtime + + def create_with_partitions( + self, + commit: ArroyoCommit, + partitions: Mapping[Partition, int], + ) -> ProcessingStrategy[KafkaPayload]: + return PlatformStrategy(commit=commit, flags=self.flags, runtime=self.runtime) + + +class PlatformStrategy(ProcessingStrategy[KafkaPayload]): + + def __init__( + self, + commit: ArroyoCommit, + flags: dict[str, str], + runtime: "RunTime[Model, Msg]", + ) -> None: + # The RunTime is made aware of the commit strategy. It will + # submit the partition, offset mapping it wants committed. 
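+        # Offsets arrive back through `handle_commit_request` (defined below), which wraps them in
+        # an empty-payload message and forwards them to the `CommitOffsets` step for the actual
+        # Kafka commit.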
+ runtime.setup(flags, self.handle_commit_request) + + self.__commit_step = CommitOffsets(commit) + self.__closed = False + self.__runtime = runtime + + def handle_commit_request(self, offsets: Mapping[Partition, int]) -> None: + self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) + + def submit(self, message: Message[KafkaPayload]) -> None: + assert not self.__closed + self.__runtime.submit(message) + + def poll(self) -> None: + assert not self.__closed + self.__commit_step.poll() + + def close(self) -> None: + self.__closed = True + self.__commit_step.close() + + def terminate(self) -> None: + self.__closed = True + self.__commit_step.terminate() + + def join(self, timeout: float | None = None) -> None: + self.__commit_step.join(timeout) + + +CommitProtocol = Callable[[Mapping[Partition, int]], None] +Model = TypeVar("Model") +Msg = TypeVar("Msg") + + +class Cmd(Generic[Msg]): + pass + + +@dataclass(frozen=True) +class BackPressure(Cmd[Msg]): + pass + + +@dataclass(frozen=True) +class Commit(Cmd[Msg]): + msg: Msg + offsets: MutableMapping[Partition, int] + + +class Nothing(Cmd[Msg]): + pass + + +@dataclass(frozen=True) +class Task(Cmd[Msg]): + msg: Msg + + +class RunTime(Generic[Model, Msg]): + + def __init__( + self, + init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], + process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], + subscription: Callable[[Model], Msg | None], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], + ) -> None: + self.init = init + self.process = process + self.subscription = subscription + self.update = update + + self.__commit: CommitProtocol | None = None + self.__model: Model | None = None + + @property + def commit(self) -> CommitProtocol: + assert self.__commit is not None + return self.__commit + + @property + def model(self) -> Model: + assert self.__model is not None + return self.__model + + def poll(self) -> None: + self.__handle_msg(self.subscription(self.model)) + + def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: + self.__commit = commit + + model, cmd = self.init(flags) + self.__model = model + self.__handle_cmd(cmd) + + def submit(self, message: Message[KafkaPayload]) -> None: + self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) + + def __handle_msg(self, msg: Msg | None) -> None: + if msg: + model, cmd = self.update(self.model, msg) + self.__model = model + self.__handle_cmd(cmd) + + def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: + if cmd is None: + return None + + match cmd: + case BackPressure(): + # TODO + ... 
+ case Commit(msg=msg, offsets=offsets): + self.commit(offsets) + return self.__handle_msg(msg) + case Nothing(): + return None + case Task(msg=msg): + return self.__handle_msg(msg) From d82c589b8d961508fb7aea6fef41ebcfcb108bbd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 18 Feb 2025 20:27:34 -0600 Subject: [PATCH 02/49] Begin adding process logic --- .../replays/consumers/buffered/consumer.py | 259 ++++++++++++++++-- src/sentry/replays/consumers/buffered/lib.py | 9 +- .../replays/usecases/ingest/dom_index.py | 5 +- 3 files changed, 246 insertions(+), 27 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 761873edc1318a..103864ef99089c 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,10 +1,96 @@ +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime + +# RunTime implementation. + + +def init(flags: dict[str, str]) -> Model["Item"]: + buffer = Buffer(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) + + +def process_message(message: bytes) -> "Item" | None: + return _process_message(message) + + +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) + + +# Business-logic implementation. import dataclasses +import logging import time +import uuid +import zlib +from typing import cast + +import sentry_sdk +from django.conf import settings +from sentry_kafka_schemas.codecs import Codec, ValidationError +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording + +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.replays.lib.storage import ( + RecordingSegmentStorageMeta, + make_recording_filename, + make_video_filename, + storage_kv, +) +from sentry.replays.usecases.ingest import ( + RecordingIngestMessage, + _report_size_metrics, + process_headers, + recording_post_processor, + track_initial_segment_event, +) +from sentry.replays.usecases.ingest.dom_index import ReplayActionsEvent, _initialize_publisher +from sentry.utils import json + +logger = logging.getLogger(__name__) + +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) + + +@dataclasses.dataclass(frozen=True) +class EventMetadata: + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + replay_id: str + retention_days: int + segment_id: int + + +@dataclasses.dataclass(frozen=True) +class Item: + action_events: list[ReplayActionsEvent] + key_name: str + metadata: EventMetadata + recording: bytes + video: bytes | None + + +@dataclasses.dataclass(frozen=True) +class FilePartRow: + key: str + range_start: int + range_stop: int -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +@dataclasses.dataclass(frozen=True) +class MergedBuffer: + action_events: list[ReplayActionsEvent] + billed_events: list[EventMetadata] + buffer: bytes + buffer_rows: list[FilePartRow] + remote_key: str -class Flusher: + +class Buffer: def __init__(self, flags: dict[str, str]) -> None: self.max_buffer_length = int(flags["max_buffer_length"]) @@ -14,32 +100,163 @@ def __init__(self, flags: dict[str, str]) -> None: def can_flush(self) -> bool: return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at - def do_flush(self) -> None: + def do_flush(self, model: Model[Item]) -> None: + merged_buffer = 
_merge_buffer(model.buffer) + _commit_merged_buffer(merged_buffer) self.__last_flushed_at = time.time() - # TODO: upload the bytes or something. -@dataclasses.dataclass -class Item: - recording: bytes +def _merge_buffer(buffer: list[Item]) -> MergedBuffer: + action_events = [] + billed_events: list[EventMetadata] = [] + parts = b"" + parts_rows = [] + remote_key = uuid.uuid4().hex + + for item in buffer: + # Extend the action events with whatever exists on the buffer item. + action_events.extend(item.action_events) + # Segment 0 events which are not replay-video are billed. + if item.metadata.segment_id == 0 and not item.metadata.is_replay_video: + billed_events.append(item.metadata) -def init(flags: dict[str, str]) -> Model[Item]: - flusher = Flusher(flags) - return Model( - buffer=[], - can_flush=flusher.can_flush, - do_flush=flusher.do_flush, - offsets={}, - retries=0, + # Append the recording and video if applicable to the zipped file. + meta = RecordingSegmentStorageMeta( + project_id=item.metadata.project_id, + replay_id=item.metadata.replay_id, + segment_id=item.metadata.segment_id, + retention_days=item.metadata.retention_days, + ) + + parts, part_row = _append_part(parts, item.recording, key=make_recording_filename(meta)) + parts_rows.append(part_row) + + if item.video: + parts, part_row = _append_part(parts, item.video, key=make_video_filename(meta)) + parts_rows.append(part_row) + + return MergedBuffer( + action_events=action_events, + billed_events=billed_events, + buffer=parts, + buffer_rows=parts_rows, + remote_key=remote_key, ) -def process_message(message: bytes) -> Item: - return Item(recording=message) +def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: + range_start = len(parts) + range_stop = range_start + len(part) - 1 + return (parts + part, FilePartRow(key, range_start, range_stop)) -recording_consumer = buffering_runtime( - init_fn=init, - process_fn=process_message, -) +def _commit_merged_buffer(buffer: MergedBuffer) -> None: + # Empty buffer's are not committed. + if buffer.buffer == b"": + return None + + # Upload recording. + storage_kv.set(buffer.remote_key, buffer.buffer) + + # Write rows. + # TODO: bulk insert rows + + # Emit billing. + for event in buffer.billed_events: + track_initial_segment_event( + event.org_id, + event.project_id, + event.replay_id, + event.key_id, + event.received, + event.is_replay_video, + ) + + # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The + # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed + # before we can consider this buffer fully flushed and commit our local offsets. 
+ publisher = _initialize_publisher(asynchronous=True) + for event in buffer.action_events: + publisher.publish("ingest-replay-events", json.dumps(event)) + publisher.flush() + + return None + + +def _process_message(message: bytes) -> Item | None: + transaction = sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) + }, + ) + + with transaction.start_child( + op="replays.consumers.buffered.process_message", name="ingest_recording" + ): + # set_tag("org_id", message.org_id) + # set_tag("project_id", message.project_id) + + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) + except ValidationError: + logger.exception("Could not decode recording message.") + return None + + message = RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) + + try: + headers, compressed_segment = process_headers(message.payload_with_headers) + except Exception: + logger.exception("Recording headers could not be extracted %s", message.replay_id) + return None + + # Segment is decompressed for further analysis. Packed format expects + # concatenated, uncompressed bytes. + try: + recording_segment = zlib.decompress(compressed_segment) + _report_size_metrics(len(compressed_segment), len(recording_segment)) + except zlib.error: + if compressed_segment[0] == ord("["): + recording_segment = compressed_segment + compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ + else: + logger.exception("Invalid recording body.") + return None + + recording_post_processor( + message, headers, recording_segment, message.replay_event, transaction + ) + + +@sentry_sdk.trace +def _decode_recording_message(message: bytes) -> RecordingIngestMessage | None: + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) + except ValidationError: + logger.exception("Could not decode recording message.") + return None + + return RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 7aece6f5012d8e..c6e52a7b29979b 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -19,7 +19,6 @@ class Model(Generic[Item]): can_flush: Callable[[], bool] do_flush: Callable[[], None] offsets: MutableMapping[Partition, int] - retries: int class Msg(Generic[Item]): @@ -41,12 +40,14 @@ class Flush(Msg[Item]): def process( - process_fn: Callable[[bytes], Item], + process_fn: Callable[[bytes], Item | None], model: Model[Item], message: bytes, offset: 
MutableMapping[Partition, int], ) -> Msg[Item] | None: - return Append(item=process_fn(message), offset=offset) + item = process_fn(message) + if item: + return Append(item=item, offset=offset) def init( @@ -90,7 +91,7 @@ def subscription(model: Model[Item]) -> Msg[Item] | None: def buffering_runtime( init_fn: Callable[[dict[str, str]], Model[Item]], - process_fn: Callable[[bytes], Item], + process_fn: Callable[[bytes], Item | None], ) -> RunTime[Model[Item], Msg[Item]]: return RunTime( init=partial(init, init_fn), diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index f80eb37abac26b..88ec9bb5f9635e 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -233,13 +233,14 @@ def _get_testid(container: dict[str, str]) -> str: ) -def _initialize_publisher() -> KafkaPublisher: +def _initialize_publisher(asynchronous: bool = True) -> KafkaPublisher: global replay_publisher if replay_publisher is None: config = kafka_config.get_topic_definition(Topic.INGEST_REPLAY_EVENTS) replay_publisher = KafkaPublisher( - kafka_config.get_kafka_producer_cluster_options(config["cluster"]) + kafka_config.get_kafka_producer_cluster_options(config["cluster"]), + asynchronous=asynchronous, ) return replay_publisher From e82ce0f84bba0393f1cc4a00cc730934a7469cab Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:10:52 -0600 Subject: [PATCH 03/49] Add tracing to each component of recording processing --- src/sentry/replays/consumers/recording.py | 69 +----- .../replays/consumers/recording_buffered.py | 21 +- .../replays/usecases/ingest/__init__.py | 219 ++++++++++-------- .../replays/usecases/ingest/dom_index.py | 4 + 4 files changed, 141 insertions(+), 172 deletions(-) diff --git a/src/sentry/replays/consumers/recording.py b/src/sentry/replays/consumers/recording.py index 5951dacacefb46..f384f99edc860a 100644 --- a/src/sentry/replays/consumers/recording.py +++ b/src/sentry/replays/consumers/recording.py @@ -1,5 +1,4 @@ import dataclasses -import logging from collections.abc import Mapping from typing import Any @@ -9,19 +8,11 @@ from arroyo.processing.strategies.abstract import ProcessingStrategy, ProcessingStrategyFactory from arroyo.processing.strategies.commit import CommitOffsets from arroyo.types import Commit, Message, Partition -from django.conf import settings -from sentry_kafka_schemas.codecs import Codec, ValidationError -from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry_sdk.tracing import Span -from sentry.conf.types.kafka_definition import Topic, get_topic_codec from sentry.replays.usecases.ingest import ingest_recording from sentry.utils.arroyo import MultiprocessingPool, run_task_with_multiprocessing -logger = logging.getLogger(__name__) - -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) - @dataclasses.dataclass class MessageContext: @@ -86,14 +77,11 @@ def create_with_partitions( ) else: # By default we preserve the previous behavior. 
- return RunTask( - function=initialize_threaded_context, - next_step=RunTaskInThreads( - processing_function=process_message_threaded, - concurrency=self.num_threads, - max_pending_futures=50, - next_step=CommitOffsets(commit), - ), + return RunTaskInThreads( + processing_function=process_message, + concurrency=self.num_threads, + max_pending_futures=50, + next_step=CommitOffsets(commit), ) def shutdown(self) -> None: @@ -101,51 +89,6 @@ def shutdown(self) -> None: self.pool.close() -def initialize_threaded_context(message: Message[KafkaPayload]) -> MessageContext: - """Initialize a Sentry transaction and unpack the message.""" - # TODO-anton: remove sampled here and let traces_sampler decide - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - return MessageContext(message.payload.value, transaction, isolation_scope) - - -def process_message_threaded(message: Message[MessageContext]) -> Any: - """Move the replay payload to permanent storage.""" - context: MessageContext = message.payload - - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(context.message) - except ValidationError: - # TODO: DLQ - logger.exception("Could not decode recording message.") - return None - - ingest_recording(message_dict, context.transaction, context.isolation_scope) - - def process_message(message: Message[KafkaPayload]) -> Any: """Move the replay payload to permanent storage.""" - # TODO-anton: remove sampled here and let traces_sampler decide - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) - except ValidationError: - # TODO: DLQ - logger.exception("Could not decode recording message.") - return None - - ingest_recording(message_dict, transaction, isolation_scope) + ingest_recording(message.payload.value) diff --git a/src/sentry/replays/consumers/recording_buffered.py b/src/sentry/replays/consumers/recording_buffered.py index 40c104414883a1..a4f04d49706078 100644 --- a/src/sentry/replays/consumers/recording_buffered.py +++ b/src/sentry/replays/consumers/recording_buffered.py @@ -65,7 +65,8 @@ ) from sentry.replays.usecases.ingest import ( LOG_SAMPLE_RATE, - process_headers, + DropSilently, + parse_headers, track_initial_segment_event, ) from sentry.replays.usecases.ingest.dom_index import ( @@ -211,7 +212,10 @@ def has_exceeded_last_buffer_commit_time(self) -> bool: return time.time() >= self._buffer_next_commit_time def append(self, message: BaseValue[KafkaPayload]) -> None: - process_message(self, message.payload.value) + try: + process_message(self, message.payload.value) + except DropSilently: + pass def new(self) -> RecordingBuffer: return RecordingBuffer( @@ -233,16 +237,9 @@ def process_message(buffer: RecordingBuffer, message: bytes) -> None: logger.exception("Could not decode recording message.") return None - try: - headers, compressed_segment = process_headers( - cast_payload_bytes(decoded_message["payload"]) - ) - except Exception: - # TODO: DLQ - logger.exception( - "Recording headers 
could not be extracted %s", decoded_message["replay_id"] - ) - return None + headers, compressed_segment = parse_headers( + cast_payload_bytes(decoded_message["payload"]), decoded_message["replay_id"] + ) # Segment is decompressed for further analysis. Packed format expects # concatenated, uncompressed bytes. diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 04bc1dbba4b1eb..af07134025858e 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -4,14 +4,17 @@ import logging import zlib from datetime import datetime, timezone -from typing import TypedDict, cast +from typing import Any, TypedDict, cast +import sentry_sdk import sentry_sdk.scope +from django.conf import settings +from sentry_kafka_schemas.codecs import Codec, ValidationError from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry_sdk import Scope, set_tag -from sentry_sdk.tracing import Span +from sentry_sdk import set_tag from sentry import options +from sentry.conf.types.kafka_definition import Topic, get_topic_codec from sentry.constants import DataCategory from sentry.logging.handlers import SamplingFilter from sentry.models.project import Project @@ -20,7 +23,11 @@ make_recording_filename, storage_kv, ) -from sentry.replays.usecases.ingest.dom_index import log_canvas_size, parse_and_emit_replay_actions +from sentry.replays.usecases.ingest.dom_index import ( + ReplayActionsEvent, + emit_replay_actions, + parse_replay_actions, +) from sentry.replays.usecases.pack import pack from sentry.signals import first_replay_received from sentry.utils import json, metrics @@ -29,11 +36,16 @@ CACHE_TIMEOUT = 3600 COMMIT_FREQUENCY_SEC = 1 LOG_SAMPLE_RATE = 0.01 +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) logger = logging.getLogger("sentry.replays") logger.addFilter(SamplingFilter(LOG_SAMPLE_RATE)) +class DropSilently(Exception): + pass + + class ReplayRecordingSegment(TypedDict): id: str # a uuid that individualy identifies a recording segment chunks: int # the number of chunks for this segment @@ -70,41 +82,39 @@ class RecordingIngestMessage: replay_video: bytes | None -@metrics.wraps("replays.usecases.ingest.ingest_recording") -def ingest_recording( - message_dict: ReplayRecording, transaction: Span, isolation_scope: Scope -) -> None: +def ingest_recording(message: bytes) -> None: """Ingest non-chunked recording messages.""" + isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() + with sentry_sdk.scope.use_isolation_scope(isolation_scope): - with transaction.start_child( - op="replays.usecases.ingest.ingest_recording", - name="ingest_recording", - ): - message = RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), - ) - _ingest_recording(message, transaction) - - -def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> None: + sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr( + 
settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 + ) + }, + ) + + try: + _ingest_recording(message) + except DropSilently: + # The message couldn't be parsed for whatever reason. We shouldn't block the consumer + # so we ignore it. + pass + + +@sentry_sdk.trace +def _ingest_recording(message_bytes: bytes) -> None: """Ingest recording messages.""" + message = parse_recording_message(message_bytes) + set_tag("org_id", message.org_id) set_tag("project_id", message.project_id) - try: - headers, compressed_segment = process_headers(message.payload_with_headers) - except Exception: - # TODO: DLQ - logger.exception("Recording headers could not be extracted %s", message.replay_id) - return None + headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) + compressed_segment, recording_segment = decompress_segment(compressed_segment) # Normalize ingest data into a standardized ingest format. segment_data = RecordingSegmentStorageMeta( @@ -114,19 +124,6 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non retention_days=message.retention_days, ) - # Segment is decompressed for further analysis. Packed format expects - # concatenated, uncompressed bytes. - try: - recording_segment = zlib.decompress(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - except zlib.error: - if compressed_segment[0] == ord("["): - recording_segment = compressed_segment - compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ - else: - logger.exception("Invalid recording body.") - return None - if message.replay_video: # Logging org info for bigquery logger.info( @@ -157,7 +154,7 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non else: storage_kv.set(make_recording_filename(segment_data), compressed_segment) - recording_post_processor(message, headers, recording_segment, message.replay_event, transaction) + recording_post_processor(message, headers, recording_segment, message.replay_event) # The first segment records an accepted outcome. This is for billing purposes. Subsequent # segments are not billed. 
@@ -171,9 +168,8 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non is_replay_video=message.replay_video is not None, ) - transaction.finish() - +@sentry_sdk.trace def track_initial_segment_event( org_id: int, project_id: int, @@ -229,14 +225,6 @@ def should_skip_billing(org_id: int, is_replay_video: bool) -> bool: return is_replay_video and org_id in options.get("replay.replay-video.billing-skip-org-ids") -@metrics.wraps("replays.usecases.ingest.process_headers") -def process_headers(bytes_with_headers: bytes) -> tuple[RecordingSegmentHeaders, bytes]: - recording_headers_json, recording_segment = bytes_with_headers.split(b"\n", 1) - recording_headers = json.loads(recording_headers_json) - assert isinstance(recording_headers.get("segment_id"), int) - return recording_headers, recording_segment - - def replay_recording_segment_cache_id(project_id: int, replay_id: str, segment_id: str) -> str: return f"{project_id}:{replay_id}:{segment_id}" @@ -254,53 +242,18 @@ def _report_size_metrics( ) +@sentry_sdk.trace def recording_post_processor( message: RecordingIngestMessage, headers: RecordingSegmentHeaders, segment_bytes: bytes, replay_event_bytes: bytes | None, - transaction: Span, ) -> None: try: - with metrics.timer("replays.usecases.ingest.decompress_and_parse"): - parsed_segment_data = json.loads(segment_bytes) - parsed_replay_event = json.loads(replay_event_bytes) if replay_event_bytes else None - - # Emit DOM search metadata to Clickhouse. - with transaction.start_child( - op="replays.usecases.ingest.parse_and_emit_replay_actions", - name="parse_and_emit_replay_actions", - ): - project = Project.objects.get_from_cache(id=message.project_id) - parse_and_emit_replay_actions( - retention_days=message.retention_days, - project=project, - replay_id=message.replay_id, - segment_data=parsed_segment_data, - replay_event=parsed_replay_event, - org_id=message.org_id, - ) - - # Log canvas mutations to bigquery. - log_canvas_size( - message.org_id, - message.project_id, - message.replay_id, - parsed_segment_data, - ) - - # Log # of rrweb events to bigquery. 
- logger.info( - "sentry.replays.slow_click", - extra={ - "event_type": "rrweb_event_count", - "org_id": message.org_id, - "project_id": message.project_id, - "replay_id": message.replay_id, - "size": len(parsed_segment_data), - }, - ) - + segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) + actions_event = try_get_replay_actions(message, segment, replay_event) + if actions_event: + emit_replay_actions(actions_event) except Exception: logging.exception( "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", @@ -309,3 +262,75 @@ def recording_post_processor( message.replay_id, headers["segment_id"], ) + + +@sentry_sdk.trace +def parse_recording_message(message: bytes) -> RecordingIngestMessage: + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) + except ValidationError: + logger.exception("Could not decode recording message.") + raise DropSilently() + + return RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) + + +@sentry_sdk.trace +def parse_headers(recording: bytes, replay_id: str) -> tuple[RecordingSegmentHeaders, bytes]: + try: + recording_headers_json, recording_segment = recording.split(b"\n", 1) + recording_headers = json.loads(recording_headers_json) + assert isinstance(recording_headers.get("segment_id"), int) + return recording_headers, recording_segment + except Exception: + logger.exception("Recording headers could not be extracted %s", replay_id) + raise DropSilently() + + +@sentry_sdk.trace +def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: + try: + decompressed_segment = zlib.decompress(segment) + return segment, decompressed_segment + except zlib.error: + if segment[0] == ord("["): + compressed_segment = zlib.compress(segment) + return compressed_segment, segment + else: + logger.exception("Invalid recording body.") + raise DropSilently() + + +@sentry_sdk.trace +def parse_segment_and_replay_data(segment: bytes, replay_event: bytes | None) -> tuple[Any, Any]: + parsed_segment_data = json.loads(segment) + parsed_replay_event = json.loads(replay_event) if replay_event else None + return parsed_segment_data, parsed_replay_event + + +@sentry_sdk.trace +def try_get_replay_actions( + message: RecordingIngestMessage, + parsed_segment_data: Any, + parsed_replay_event: Any | None, +) -> ReplayActionsEvent | None: + project = Project.objects.get_from_cache(id=message.project_id) + + return parse_replay_actions( + project=project, + replay_id=message.replay_id, + retention_days=message.retention_days, + segment_data=parsed_segment_data, + replay_event=parsed_replay_event, + org_id=message.org_id, + ) diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index f80eb37abac26b..27a1114fd98a9a 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -8,6 +8,8 @@ from hashlib import md5 from typing import Any, Literal, TypedDict +import sentry_sdk + from sentry import options from sentry.conf.types.kafka_definition import Topic from sentry.models.project import Project @@ -77,11 
+79,13 @@ def parse_and_emit_replay_actions( emit_replay_actions(message) +@sentry_sdk.trace def emit_replay_actions(action: ReplayActionsEvent) -> None: publisher = _initialize_publisher() publisher.publish("ingest-replay-events", json.dumps(action)) +@sentry_sdk.trace def parse_replay_actions( project: Project, replay_id: str, From 55cc3afae5878c1e5621ce074daad4cb6be7de47 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:15:06 -0600 Subject: [PATCH 04/49] Delete unused function --- src/sentry/replays/usecases/ingest/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index af07134025858e..699ac534bc79be 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -225,10 +225,6 @@ def should_skip_billing(org_id: int, is_replay_video: bool) -> bool: return is_replay_video and org_id in options.get("replay.replay-video.billing-skip-org-ids") -def replay_recording_segment_cache_id(project_id: int, replay_id: str, segment_id: str) -> str: - return f"{project_id}:{replay_id}:{segment_id}" - - def _report_size_metrics( size_compressed: int | None = None, size_uncompressed: int | None = None ) -> None: From a243bdbd4760c2939f08d5c15d322764288b7e9f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:16:37 -0600 Subject: [PATCH 05/49] Report size metrics --- src/sentry/replays/usecases/ingest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index af07134025858e..8d15bfecb56b30 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -115,6 +115,7 @@ def _ingest_recording(message_bytes: bytes) -> None: headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) compressed_segment, recording_segment = decompress_segment(compressed_segment) + _report_size_metrics(len(compressed_segment), len(recording_segment)) # Normalize ingest data into a standardized ingest format. 
segment_data = RecordingSegmentStorageMeta( From 890f17027047ea6243cda9a058f8c636a946906b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 10:13:36 -0600 Subject: [PATCH 06/49] Separate IO from processing --- .../replays/usecases/ingest/__init__.py | 173 ++++++++++++------ 1 file changed, 120 insertions(+), 53 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index e72b1a81e784ae..444f590cfb1412 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -108,6 +108,30 @@ def ingest_recording(message: bytes) -> None: @sentry_sdk.trace def _ingest_recording(message_bytes: bytes) -> None: """Ingest recording messages.""" + processed_recording = process_recording_message(message_bytes) + commit_recording_message(processed_recording) + track_recording_metadata(processed_recording) + + +@dataclasses.dataclass +class ProcessedRecordingMessage: + actions_event: ReplayActionsEvent | None + filedata: bytes + filename: str + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + recording_size_uncompressed: int + recording_size: int + replay_id: str + segment_id: int + video_size: int | None + + +@sentry_sdk.trace +def process_recording_message(message_bytes: bytes) -> ProcessedRecordingMessage: message = parse_recording_message(message_bytes) set_tag("org_id", message.org_id) @@ -115,58 +139,103 @@ def _ingest_recording(message_bytes: bytes) -> None: headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) compressed_segment, recording_segment = decompress_segment(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - # Normalize ingest data into a standardized ingest format. - segment_data = RecordingSegmentStorageMeta( + actions_event = parse_recording_actions( + message, + headers, + recording_segment, + message.replay_event, + ) + + filename = make_recording_filename( + RecordingSegmentStorageMeta( + project_id=message.project_id, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + retention_days=message.retention_days, + ) + ) + + if message.replay_video: + filedata = zlib.compress(pack(rrweb=recording_segment, video=message.replay_video)) + video_size = len(message.replay_video) + else: + filedata = compressed_segment + video_size = None + + return ProcessedRecordingMessage( + actions_event, + filedata, + filename, + is_replay_video=message.replay_video is not None, + key_id=message.key_id, + org_id=message.org_id, project_id=message.project_id, + received=message.received, + recording_size_uncompressed=len(recording_segment), + recording_size=len(compressed_segment), replay_id=message.replay_id, segment_id=headers["segment_id"], - retention_days=message.retention_days, + video_size=video_size, ) - if message.replay_video: + +@sentry_sdk.trace +def commit_recording_message(recording: ProcessedRecordingMessage) -> None: + # Write to GCS. + storage_kv.set(recording.filename, recording.filedata) + + # Write to billing consumer if its a billable event. + if recording.segment_id == 0: + track_initial_segment_event( + recording.org_id, + recording.project_id, + recording.replay_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) + + # Write to replay-event consumer. 
+ if recording.actions_event: + emit_replay_actions(recording.actions_event) + + +@sentry_sdk.trace +def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: + # Report size metrics to determine usage patterns. + _report_size_metrics( + size_compressed=recording.recording_size, + size_uncompressed=recording.recording_size_uncompressed, + ) + + if recording.video_size: # Logging org info for bigquery logger.info( "sentry.replays.slow_click", extra={ "event_type": "mobile_event", - "org_id": message.org_id, - "project_id": message.project_id, - "size": len(message.replay_video), + "org_id": recording.org_id, + "project_id": recording.project_id, + "size": len(recording.filedata), }, ) - # Record video size for COGS analysis. + # Track the number of replay-video events we receive. metrics.incr("replays.recording_consumer.replay_video_count") + + # Record video size for COGS analysis. metrics.distribution( "replays.recording_consumer.replay_video_size", - len(message.replay_video), + recording.video_size, unit="byte", ) - dat = zlib.compress(pack(rrweb=recording_segment, video=message.replay_video)) - storage_kv.set(make_recording_filename(segment_data), dat) - - # Track combined payload size. + # Track combined payload size for COGs analysis. metrics.distribution( - "replays.recording_consumer.replay_video_event_size", len(dat), unit="byte" - ) - else: - storage_kv.set(make_recording_filename(segment_data), compressed_segment) - - recording_post_processor(message, headers, recording_segment, message.replay_event) - - # The first segment records an accepted outcome. This is for billing purposes. Subsequent - # segments are not billed. - if headers["segment_id"] == 0: - track_initial_segment_event( - message.org_id, - message.project_id, - message.replay_id, - message.key_id, - message.received, - is_replay_video=message.replay_video is not None, + "replays.recording_consumer.replay_video_event_size", + len(recording.filedata), + unit="byte", ) @@ -239,28 +308,6 @@ def _report_size_metrics( ) -@sentry_sdk.trace -def recording_post_processor( - message: RecordingIngestMessage, - headers: RecordingSegmentHeaders, - segment_bytes: bytes, - replay_event_bytes: bytes | None, -) -> None: - try: - segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) - actions_event = try_get_replay_actions(message, segment, replay_event) - if actions_event: - emit_replay_actions(actions_event) - except Exception: - logging.exception( - "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", - message.org_id, - message.project_id, - message.replay_id, - headers["segment_id"], - ) - - @sentry_sdk.trace def parse_recording_message(message: bytes) -> RecordingIngestMessage: try: @@ -308,6 +355,26 @@ def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: raise DropSilently() +@sentry_sdk.trace +def parse_recording_actions( + message: RecordingIngestMessage, + headers: RecordingSegmentHeaders, + segment_bytes: bytes, + replay_event_bytes: bytes | None, +) -> ReplayActionsEvent | None: + try: + segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) + return try_get_replay_actions(message, segment, replay_event) + except Exception: + logging.exception( + "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", + message.org_id, + message.project_id, + message.replay_id, + headers["segment_id"], + ) + + @sentry_sdk.trace def parse_segment_and_replay_data(segment: bytes, replay_event: bytes | None) -> 
tuple[Any, Any]: parsed_segment_data = json.loads(segment) From 2dcfaecfe254716edc196c42005734ca934966ca Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 10:53:20 -0600 Subject: [PATCH 07/49] Add explicit return --- src/sentry/replays/usecases/ingest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 444f590cfb1412..ab3ab8901c18b5 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -373,6 +373,7 @@ def parse_recording_actions( message.replay_id, headers["segment_id"], ) + return None @sentry_sdk.trace From db5fa11ba714e6b64d23c603a3127779102d7264 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 15:42:56 -0600 Subject: [PATCH 08/49] Add buffer managers --- .../consumers/buffered/buffer_managers.py | 144 +++++++++ .../replays/consumers/buffered/consumer.py | 293 ++++-------------- src/sentry/replays/consumers/buffered/lib.py | 4 +- .../replays/usecases/ingest/__init__.py | 44 ++- 4 files changed, 241 insertions(+), 244 deletions(-) create mode 100644 src/sentry/replays/consumers/buffered/buffer_managers.py diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py new file mode 100644 index 00000000000000..c3504a278107a8 --- /dev/null +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -0,0 +1,144 @@ +import dataclasses +import uuid +from concurrent.futures import ThreadPoolExecutor + +import sentry_sdk + +from sentry.replays.lib.storage import storage_kv +from sentry.replays.usecases.ingest import ( + ProcessedRecordingMessage, + commit_recording_message, + emit_replay_actions, + track_initial_segment_event, + track_recording_metadata, +) +from sentry.replays.usecases.ingest.dom_index import _initialize_publisher + + +@dataclasses.dataclass(frozen=True) +class FilePartRow: + key: str + range_start: int + range_stop: int + + +@dataclasses.dataclass(frozen=True) +class MergedBuffer: + buffer: bytes + buffer_rows: list[FilePartRow] + remote_key: str + + +class BatchedBufferManager: + """Batched buffer manager. + + The batched buffer manager takes a list of messages and merges them into a single file before + committing them to permanent storage. We store filename and byte locations in a PostgreSQL + table. + """ + + def commit(self, messages: list[ProcessedRecordingMessage]) -> None: + merged_buffer = self.merge_buffer(messages) + self.commit_merged_buffer(merged_buffer) + self.bulk_track_initial_segment_events(messages) + self.bulk_emit_action_events(messages) + self.bulk_track_recording_metadata(messages) + + @sentry_sdk.trace + def merge_buffer(self, buffer: list[ProcessedRecordingMessage]) -> MergedBuffer: + def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: + range_start = len(parts) + range_stop = range_start + len(part) - 1 + return (parts + part, FilePartRow(key, range_start, range_stop)) + + parts = b"" + parts_rows = [] + remote_key = uuid.uuid4().hex + + for item in buffer: + # The recording data is appended to the buffer and a row for tracking is + # returned. 
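+            # For example, a 10-byte recording appended to an empty buffer is tracked as the
+            # inclusive byte range [0, 9]; the next part then begins at offset 10.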
+            parts, part_row = _append_part(parts, item.filedata, key=item.filename)
+            parts_rows.append(part_row)
+
+        return MergedBuffer(
+            buffer=parts,
+            buffer_rows=parts_rows,
+            remote_key=remote_key,
+        )
+
+    @sentry_sdk.trace
+    def commit_merged_buffer(self, buffer: MergedBuffer) -> None:
+        if buffer.buffer != b"":
+            storage_kv.set(buffer.remote_key, buffer.buffer)
+            # TODO: bulk insert rows
+
+    @sentry_sdk.trace
+    def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None:
+        # Each item in the buffer needs to record a billing outcome. Not every segment is billed,
+        # but we defer that decision to the `track_initial_segment_event` function.
+        for item in items:
+            track_initial_segment_event(
+                item.org_id,
+                item.project_id,
+                item.replay_id,
+                item.segment_id,
+                item.key_id,
+                item.received,
+                item.is_replay_video,
+            )
+
+    @sentry_sdk.trace
+    def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> None:
+        # The action events need to be emitted to Snuba. We do it asynchronously so it's fast. The
+        # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed
+        # before we can consider this buffer fully flushed and commit our local offsets.
+        for item in items:
+            if item.actions_event:
+                emit_replay_actions(item.actions_event)
+
+        # Ensure the replay-actions were committed to the broker before we commit this batch.
+        publisher = _initialize_publisher()
+        publisher.flush()
+
+    @sentry_sdk.trace
+    def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None:
+        # Each item in the buffer needs to record metrics about itself.
+        for item in items:
+            track_recording_metadata(item)
+
+
+class ThreadedBufferManager:
+    """Threaded buffer manager.
+
+    The threaded buffer manager uploads files the same way we originally did, except in an
+    application-managed thread pool. We iterate over each processed recording, commit it in a
+    thread, and finally return None when all the tasks complete. It requires no changes elsewhere
+    in the code to accommodate.
+
+    The goal of this class is to _not_ be clever. We want to imitate current production behavior
+    as closely as possible. The reason is that a consumer refactor is a difficult task to feature
+    flag and we want to gradually roll out the behavior over time after assuring ourselves that
+    there are no defects in the `BatchedBufferManager`.
+    """
+
+    # Wrapped in staticmethod so `self.fn` resolves to the plain function rather than a bound
+    # method (which would otherwise receive the instance as its first argument).
+    fn = staticmethod(commit_recording_message)
+    max_workers = 100
+
+    def commit(self, messages: list[ProcessedRecordingMessage]) -> None:
+        # Use as many workers as necessary up to a limit. We don't want to start thousands of
+        # worker threads.
+        max_workers = min(len(messages), self.max_workers)
+
+        # We apply whatever function is defined on the class to each message in the list. This is
+        # useful for testing reasons (dependency injection).
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            pool.map(self.fn, messages)
+
+        # Recording metadata is not tracked in the threadpool. This is because this function will
+        # log. Logging will acquire a lock and make our threading less useful due to the speed of
+        # the I/O we do in this step.
+ for message in messages: + track_recording_metadata(message) + + return None diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 103864ef99089c..e692156af5ac6e 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,262 +1,93 @@ -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime - -# RunTime implementation. - - -def init(flags: dict[str, str]) -> Model["Item"]: - buffer = Buffer(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - - -def process_message(message: bytes) -> "Item" | None: - return _process_message(message) - - -recording_consumer = buffering_runtime( - init_fn=init, - process_fn=process_message, -) - - -# Business-logic implementation. -import dataclasses -import logging import time -import uuid -import zlib -from typing import cast import sentry_sdk from django.conf import settings -from sentry_kafka_schemas.codecs import Codec, ValidationError -from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry.conf.types.kafka_definition import Topic, get_topic_codec -from sentry.replays.lib.storage import ( - RecordingSegmentStorageMeta, - make_recording_filename, - make_video_filename, - storage_kv, +from sentry.replays.consumers.buffered.buffer_managers import ( + BatchedBufferManager, + ThreadedBufferManager, ) +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( - RecordingIngestMessage, - _report_size_metrics, - process_headers, - recording_post_processor, - track_initial_segment_event, + DropSilently, + ProcessedRecordingMessage, + process_recording_message, ) -from sentry.replays.usecases.ingest.dom_index import ReplayActionsEvent, _initialize_publisher -from sentry.utils import json - -logger = logging.getLogger(__name__) - -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) - - -@dataclasses.dataclass(frozen=True) -class EventMetadata: - is_replay_video: bool - key_id: int | None - org_id: int - project_id: int - received: int - replay_id: str - retention_days: int - segment_id: int - -@dataclasses.dataclass(frozen=True) -class Item: - action_events: list[ReplayActionsEvent] - key_name: str - metadata: EventMetadata - recording: bytes - video: bytes | None +class BufferManager: + """Buffer manager. -@dataclasses.dataclass(frozen=True) -class FilePartRow: - key: str - range_start: int - range_stop: int + Determines if and when a buffer should flush. The buffer accepts only one argument (`flags`) + which can contain any number of configuration options. Currently we only care about the time + the buffer has been active and the length of the buffer. But we could update this function to + extract an option thats concerned with the bytesize of the buffer if we thought that was a + useful metric for committing. - -@dataclasses.dataclass(frozen=True) -class MergedBuffer: - action_events: list[ReplayActionsEvent] - billed_events: list[EventMetadata] - buffer: bytes - buffer_rows: list[FilePartRow] - remote_key: str - - -class Buffer: + The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its + methods as callbacks to the Model. The state contained within the method's instance is implicit + and unknown to the RunTime. 
We could model this state inside the RunTime but the state is + simple enough that I don't feel the need to over-engineer the buffering RunTime. For more + complex use-cases you would want to formalize state transformations in the RunTime. Especially + if you wanted to expose the state across more locations in the application. + """ def __init__(self, flags: dict[str, str]) -> None: - self.max_buffer_length = int(flags["max_buffer_length"]) - self.max_buffer_wait = int(flags["max_buffer_wait"]) - self.__last_flushed_at = time.time() - - def can_flush(self) -> bool: - return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at - - def do_flush(self, model: Model[Item]) -> None: - merged_buffer = _merge_buffer(model.buffer) - _commit_merged_buffer(merged_buffer) + self.__max_buffer_length = int(flags["max_buffer_length"]) + self.__max_buffer_wait = int(flags["max_buffer_wait"]) self.__last_flushed_at = time.time() - -def _merge_buffer(buffer: list[Item]) -> MergedBuffer: - action_events = [] - billed_events: list[EventMetadata] = [] - parts = b"" - parts_rows = [] - remote_key = uuid.uuid4().hex - - for item in buffer: - # Extend the action events with whatever exists on the buffer item. - action_events.extend(item.action_events) - - # Segment 0 events which are not replay-video are billed. - if item.metadata.segment_id == 0 and not item.metadata.is_replay_video: - billed_events.append(item.metadata) - - # Append the recording and video if applicable to the zipped file. - meta = RecordingSegmentStorageMeta( - project_id=item.metadata.project_id, - replay_id=item.metadata.replay_id, - segment_id=item.metadata.segment_id, - retention_days=item.metadata.retention_days, + def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: + return ( + len(model.buffer) >= self.__max_buffer_length + or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) - parts, part_row = _append_part(parts, item.recording, key=make_recording_filename(meta)) - parts_rows.append(part_row) - - if item.video: - parts, part_row = _append_part(parts, item.video, key=make_video_filename(meta)) - parts_rows.append(part_row) - - return MergedBuffer( - action_events=action_events, - billed_events=billed_events, - buffer=parts, - buffer_rows=parts_rows, - remote_key=remote_key, - ) - - -def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: - range_start = len(parts) - range_stop = range_start + len(part) - 1 - return (parts + part, FilePartRow(key, range_start, range_stop)) - - -def _commit_merged_buffer(buffer: MergedBuffer) -> None: - # Empty buffer's are not committed. - if buffer.buffer == b"": - return None - - # Upload recording. - storage_kv.set(buffer.remote_key, buffer.buffer) - - # Write rows. - # TODO: bulk insert rows - - # Emit billing. - for event in buffer.billed_events: - track_initial_segment_event( - event.org_id, - event.project_id, - event.replay_id, - event.key_id, - event.received, - event.is_replay_video, - ) + def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: + # During the transitionary period most organizations will commit following a traditional + # commit pattern... + threaded = list(filter(lambda i: i.org_id != 1, model.buffer)) + threaded_buffer = ThreadedBufferManager() + threaded_buffer.commit(threaded) - # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The - # Kafka publisher is asynchronous. 
We need to flush to make sure the data is fully committed - # before we can consider this buffer fully flushed and commit our local offsets. - publisher = _initialize_publisher(asynchronous=True) - for event in buffer.action_events: - publisher.publish("ingest-replay-events", json.dumps(event)) - publisher.flush() + # ...But others will be opted in to the batched version of uploading. This will give us a + # chance to test the feature with as minimal risk as possible. + batched = list(filter(lambda i: i.org_id == 1, model.buffer)) + batched_buffer = BatchedBufferManager() + batched_buffer.commit(batched) - return None + # Update the buffer manager with the new time so we don't continuously commit in a loop! + self.__last_flushed_at = time.time() -def _process_message(message: bytes) -> Item | None: - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) +def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: + buffer = BufferManager(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - with transaction.start_child( - op="replays.consumers.buffered.process_message", name="ingest_recording" - ): - # set_tag("org_id", message.org_id) - # set_tag("project_id", message.project_id) - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) - except ValidationError: - logger.exception("Could not decode recording message.") - return None +def process_message(message: bytes) -> ProcessedRecordingMessage | None: + isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - message = RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), + with sentry_sdk.scope.use_isolation_scope(isolation_scope): + transaction = sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr( + settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 + ) + }, ) try: - headers, compressed_segment = process_headers(message.payload_with_headers) - except Exception: - logger.exception("Recording headers could not be extracted %s", message.replay_id) + return process_recording_message(message) + except DropSilently: return None + finally: + transaction.finish() - # Segment is decompressed for further analysis. Packed format expects - # concatenated, uncompressed bytes. 
- try: - recording_segment = zlib.decompress(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - except zlib.error: - if compressed_segment[0] == ord("["): - recording_segment = compressed_segment - compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ - else: - logger.exception("Invalid recording body.") - return None - - recording_post_processor( - message, headers, recording_segment, message.replay_event, transaction - ) - -@sentry_sdk.trace -def _decode_recording_message(message: bytes) -> RecordingIngestMessage | None: - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) - except ValidationError: - logger.exception("Could not decode recording message.") - return None - - return RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), - ) +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index c6e52a7b29979b..da21bbe041ebd4 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -16,8 +16,8 @@ @dataclass class Model(Generic[Item]): buffer: list[Item] - can_flush: Callable[[], bool] - do_flush: Callable[[], None] + can_flush: Callable[["Model[Item]"], bool] + do_flush: Callable[["Model[Item]"], None] offsets: MutableMapping[Partition, int] diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index ab3ab8901c18b5..250fbbee3860be 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -7,6 +7,7 @@ from typing import Any, TypedDict, cast import sentry_sdk +import sentry_sdk.consts import sentry_sdk.scope from django.conf import settings from sentry_kafka_schemas.codecs import Codec, ValidationError @@ -186,15 +187,15 @@ def commit_recording_message(recording: ProcessedRecordingMessage) -> None: storage_kv.set(recording.filename, recording.filedata) # Write to billing consumer if its a billable event. - if recording.segment_id == 0: - track_initial_segment_event( - recording.org_id, - recording.project_id, - recording.replay_id, - recording.key_id, - recording.received, - is_replay_video=recording.is_replay_video, - ) + track_initial_segment_event( + recording.org_id, + recording.project_id, + recording.replay_id, + recording.segment_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) # Write to replay-event consumer. if recording.actions_event: @@ -239,11 +240,32 @@ def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: ) -@sentry_sdk.trace def track_initial_segment_event( org_id: int, project_id: int, - replay_id, + replay_id: str, + segment_id: int, + key_id: int | None, + received: int, + is_replay_video: bool, +) -> None: + # Only segment 0 is billed. 
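+    # Segments after the first are skipped entirely here; no outcome is emitted for them.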
+ if segment_id == 0: + _track_initial_segment_event( + org_id, + project_id, + replay_id, + key_id, + received, + is_replay_video, + ) + + +@sentry_sdk.trace +def _track_initial_segment_event( + org_id: int, + project_id: int, + replay_id: str, key_id: int | None, received: int, is_replay_video: bool, From 7a5a98c6d455da761f7c0a4a854fa70864954bcd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 12:58:49 -0600 Subject: [PATCH 09/49] Write FilePart rows and adopt a new subscription model --- src/sentry/options/defaults.py | 6 ++ .../consumers/buffered/buffer_managers.py | 55 ++++++++--- .../replays/consumers/buffered/consumer.py | 26 +++-- src/sentry/replays/consumers/buffered/lib.py | 61 ++++++++---- .../replays/consumers/buffered/platform.py | 94 ++++++++++++++++--- src/sentry/replays/models.py | 15 +++ 6 files changed, 205 insertions(+), 52 deletions(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 4796fa16a0f7ad..ba016021ccf80a 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -456,6 +456,12 @@ default=None, flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, ) +register( + "replay.consumer.use-file-batching", + type=Sequence, + default=[], + flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, +) # Globally disables replay-video. register( "replay.replay-video.disabled", diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py index c3504a278107a8..c017b7ebb40628 100644 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -1,18 +1,18 @@ import dataclasses import uuid -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk from sentry.replays.lib.storage import storage_kv +from sentry.replays.models import FilePart from sentry.replays.usecases.ingest import ( ProcessedRecordingMessage, commit_recording_message, - emit_replay_actions, track_initial_segment_event, track_recording_metadata, ) -from sentry.replays.usecases.ingest.dom_index import _initialize_publisher +from sentry.replays.usecases.ingest.dom_index import _initialize_publisher, emit_replay_actions @dataclasses.dataclass(frozen=True) @@ -26,7 +26,7 @@ class FilePartRow: class MergedBuffer: buffer: bytes buffer_rows: list[FilePartRow] - remote_key: str + filename: str class BatchedBufferManager: @@ -37,6 +37,7 @@ class BatchedBufferManager: table. """ + @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: merged_buffer = self.merge_buffer(messages) self.commit_merged_buffer(merged_buffer) @@ -53,7 +54,6 @@ def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRo parts = b"" parts_rows = [] - remote_key = uuid.uuid4().hex for item in buffer: # The recording data is appended to the buffer and a row for tracking is @@ -64,14 +64,28 @@ def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRo return MergedBuffer( buffer=parts, buffer_rows=parts_rows, - remote_key=remote_key, + filename=f"90/{uuid.uuid4().hex}", ) @sentry_sdk.trace def commit_merged_buffer(self, buffer: MergedBuffer) -> None: - if buffer.buffer != b"": - storage_kv.set(buffer.remote_key, buffer.buffer) - # TODO: bulk insert rows + if buffer.buffer == b"": + return None + + # Storage goes first. Headers go second. 
If we were to store the headers first we could + # expose a record that does not exist to the user which would ultimately 404. + storage_kv.set(buffer.filename, buffer.buffer) + + # Bulk insert the filepart. + FilePart.objects.bulk_create( + FilePart( + filename=buffer.filename, + key=row.key, + range_start=row.range_start, + range_stop=row.range_stop, + ) + for row in buffer.buffer_rows + ) @sentry_sdk.trace def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None: @@ -102,7 +116,7 @@ def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> Non publisher.flush() @sentry_sdk.trace - def bulk_track_recording_metadata(items: list[ProcessedRecordingMessage]) -> None: + def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None: # Each item in the buffer needs to record metrics about itself. for item in items: track_recording_metadata(item) @@ -122,18 +136,29 @@ class ThreadedBufferManager: ourselves that there are no defects in the `BatchedBufferManager`. """ - fn = commit_recording_message - max_workers = 100 + def __init__(self) -> None: + self.fn = commit_recording_message + self.max_workers = 100 + @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: # Use as many workers as necessary up to a limit. We don't want to start thousands of # worker threads. max_workers = min(len(messages), self.max_workers) - # We apply whatever function is defined on the class to each message in the list. This is - # useful for testing reasons (dependency injection). with ThreadPoolExecutor(max_workers=max_workers) as pool: - pool.map(self.fn, messages) + # We apply whatever function is defined on the class to each message in the list. This + # is useful for testing reasons (dependency injection). + futures = [pool.submit(self.fn, message) for message in messages] + + # Tasks can fail. We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + if exc is not None: + raise exc # Recording metadata is not tracked in the threadpool. This is because this function will # log. Logging will acquire a lock and make our threading less useful due to the speed of diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index e692156af5ac6e..7b2c738fae5fe4 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -3,6 +3,7 @@ import sentry_sdk from django.conf import settings +from sentry import options from sentry.replays.consumers.buffered.buffer_managers import ( BatchedBufferManager, ThreadedBufferManager, @@ -43,16 +44,21 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) + @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: # During the transitionary period most organizations will commit following a traditional # commit pattern... - threaded = list(filter(lambda i: i.org_id != 1, model.buffer)) + # + # TODO: Would be nice to cache this value. 
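+        # The option holds a sequence of org ids; it is re-read on every flush (hence the TODO
+        # above) and cast to a set for fast membership checks.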
+ org_ids = set(options.get("replay.consumer.use-file-batching")) + + threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) threaded_buffer = ThreadedBufferManager() threaded_buffer.commit(threaded) # ...But others will be opted in to the batched version of uploading. This will give us a # chance to test the feature with as minimal risk as possible. - batched = list(filter(lambda i: i.org_id == 1, model.buffer)) + batched = list(filter(lambda i: i.org_id in org_ids, model.buffer)) batched_buffer = BatchedBufferManager() batched_buffer.commit(batched) @@ -60,12 +66,12 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: self.__last_flushed_at = time.time() -def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: - buffer = BufferManager(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - - def process_message(message: bytes) -> ProcessedRecordingMessage | None: + """Message processing function. + + Accepts an unstructured type and returns a structured one. Other than tracing the goal is to + have no I/O here. We'll commit the I/O on flush. + """ isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() with sentry_sdk.scope.use_isolation_scope(isolation_scope): @@ -87,6 +93,12 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: transaction.finish() +def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: + """Return the initial state of the application.""" + buffer = BufferManager(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) + + recording_consumer = buffering_runtime( init_fn=init, process_fn=process_message, diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index da21bbe041ebd4..ff89587a54ce7b 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -7,7 +7,16 @@ from arroyo.types import Partition -from sentry.replays.consumers.buffered.platform import Cmd, Commit, Nothing, RunTime, Task +from sentry.replays.consumers.buffered.platform import ( + Cmd, + Commit, + Join, + Nothing, + Poll, + RunTime, + Sub, + Task, +) Item = TypeVar("Item") T = TypeVar("T") @@ -21,24 +30,36 @@ class Model(Generic[Item]): offsets: MutableMapping[Partition, int] -class Msg(Generic[Item]): - pass - - @dataclass(frozen=True) -class Append(Msg[Item]): +class Append(Generic[Item]): + """Append the item to the buffer and update the offsets.""" + item: Item offset: MutableMapping[Partition, int] -class Committed(Msg[Item]): +class Committed: + """The platform committed offsets. Our buffer is now completely done.""" + pass -class Flush(Msg[Item]): +class Flush: + """Our application hit the flush threshold and has been instructed to flush.""" + + pass + + +class Polled: + """Our application was polled by the platform.""" + pass +# A "Msg" is the union of all application messages our RunTime will accept. 
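+# The update function pattern-matches over these variants to decide the next command.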
+Msg = Append[Item] | Committed | Flush | Polled + + def process( process_fn: Callable[[bytes], Item | None], model: Model[Item], @@ -48,6 +69,8 @@ def process( item = process_fn(message) if item: return Append(item=item, offset=offset) + else: + return None def init( @@ -62,7 +85,7 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite case Append(item=item, offset=offset): model.buffer.append(item) model.offsets.update(offset) - if model.can_flush(): + if model.can_flush(model): return (model, Task(msg=Flush())) else: return (model, None) @@ -72,21 +95,23 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite # and provide error handling facilities or we could accept that this problem gets too # complicated to reasonably abstract and have developers implement their own buffering # consumer. - model.do_flush() + model.do_flush(model) model.buffer = [] return (model, Commit(msg=Committed(), offsets=model.offsets)) case Committed(): return (model, None) - - # Satisfy mypy. Apparently we don't do exhaustiveness checks in our configuration. - return (model, None) + case Polled(): + if model.can_flush(model): + return (model, Task(msg=Flush())) + else: + return (model, None) -def subscription(model: Model[Item]) -> Msg[Item] | None: - if model.can_flush(): - return Flush() - else: - return None +def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: + return [ + Join(msg=Flush()), + Poll(msg=Polled()), + ] def buffering_runtime( diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index a1c93dd9156199..252377c3f3b2a2 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -52,6 +52,7 @@ def submit(self, message: Message[KafkaPayload]) -> None: def poll(self) -> None: assert not self.__closed + self.__runtime.publish("poll") self.__commit_step.poll() def close(self) -> None: @@ -63,6 +64,8 @@ def terminate(self) -> None: self.__commit_step.terminate() def join(self, timeout: float | None = None) -> None: + self.__runtime.publish("join") + self.__commit_step.close() self.__commit_step.join(timeout) @@ -71,37 +74,81 @@ def join(self, timeout: float | None = None) -> None: Msg = TypeVar("Msg") -class Cmd(Generic[Msg]): - pass +class BackPressure: + """Instructs the RunTime to back-pressure the platform. + Does not accept a `msg` argument as we do not expect the platform to fail. + """ -@dataclass(frozen=True) -class BackPressure(Cmd[Msg]): pass @dataclass(frozen=True) -class Commit(Cmd[Msg]): +class Commit(Generic[Msg]): + """Instructs the RunTime to commit the platform. + + Because the RunTime is based on a Kafka platform there is an expectation that the application + should trigger commits when necessary. While the application can't be sure its running Kafka + it can be sure its running on a Platform that requires offsets be committed. + """ + msg: Msg offsets: MutableMapping[Partition, int] -class Nothing(Cmd[Msg]): +class Nothing: + """Instructs the RunTime to do nothing. Equivalent to a null command.""" + pass @dataclass(frozen=True) -class Task(Cmd[Msg]): +class Task(Generic[Msg]): + """Instructs the RunTime to emit an application message back to the application.""" + msg: Msg +# A "Cmd" is the union of all the commands an application can issue to the RunTime. +Cmd = BackPressure | Commit[Msg] | Nothing | Task[Msg] + + +@dataclass(frozen=True) +class Join(Generic[Msg]): + """Join subscription class. 
+ + The platform may need to quit. When this happens the RunTime needs to know. The application + may or may not need to know. The Join subscription allows aplications to subscribe to join + events and handle them in the way they see fit. + """ + + msg: Msg + name = "join" + + +@dataclass(frozen=True) +class Poll(Generic[Msg]): + """Poll subscription class. + + The platform will periodically poll the RunTime. The application may or may not subscribe to + these events and choose to act on them. + """ + + msg: Msg + name = "poll" + + +# A "Sub" is the union of all the events an application can subscribe to. +Sub = Join[Msg] | Poll[Msg] + + class RunTime(Generic[Model, Msg]): def __init__( self, init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], - subscription: Callable[[Model], Msg | None], + subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: self.init = init @@ -111,6 +158,7 @@ def __init__( self.__commit: CommitProtocol | None = None self.__model: Model | None = None + self.__subscriptions: dict[str, Sub[Msg]] = {} @property def commit(self) -> CommitProtocol: @@ -122,19 +170,37 @@ def model(self) -> Model: assert self.__model is not None return self.__model - def poll(self) -> None: - self.__handle_msg(self.subscription(self.model)) - def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: self.__commit = commit model, cmd = self.init(flags) self.__model = model self.__handle_cmd(cmd) + self.__register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) + def publish(self, sub_name: str) -> None: + # For each new subscription event we re-register the subscribers in case anything within + # the application has changed. I.e. the model is in some new state and that means we care + # about a new subscription or don't care about an old one. + self.__register_subscriptions() + + # Using the subscription's name look for the subscription in the registry. + sub = self.__subscriptions.get(sub_name) + if sub is None: + return None + + # Match of the various subscription types and emit a mesasge to the RunTime. Right now + # there's no need to match. The name disambiguates enough already but in the future more + # subscriptions might do more complex things. + match sub: + case Join(msg=msg): + return self.__handle_msg(msg) + case Poll(msg=msg): + return self.__handle_msg(msg) + def __handle_msg(self, msg: Msg | None) -> None: if msg: model, cmd = self.update(self.model, msg) @@ -148,7 +214,7 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: match cmd: case BackPressure(): # TODO - ... 
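+                # Back-pressure is not implemented yet; the command is accepted and ignored.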
+ return None case Commit(msg=msg, offsets=offsets): self.commit(offsets) return self.__handle_msg(msg) @@ -156,3 +222,7 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: return None case Task(msg=msg): return self.__handle_msg(msg) + + def __register_subscriptions(self) -> None: + for sub in self.subscription(self.model): + self.__subscriptions[sub.name] = sub diff --git a/src/sentry/replays/models.py b/src/sentry/replays/models.py index 1d4099d0155728..b20cb45efe928e 100644 --- a/src/sentry/replays/models.py +++ b/src/sentry/replays/models.py @@ -46,3 +46,18 @@ def delete(self, *args, **kwargs): rv = super().delete(*args, **kwargs) return rv + + +@region_silo_model +class FilePart(Model): + __relocation_scope__ = RelocationScope.Excluded + + date_added = models.DateTimeField(default=timezone.now, null=False) + filename = models.CharField(null=False) + key = models.CharField(db_index=True, null=False) + range_start = models.IntegerField(null=False) + range_stop = models.IntegerField(null=False) + + class Meta: + app_label = "replays" + db_table = "replays_filepart" From 44c899bb70218382a85837002685c7f4bf4b8f12 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 14:52:15 -0600 Subject: [PATCH 10/49] Add unit tests --- .../sentry/replays/unit/consumers/__init__.py | 0 .../unit/consumers/test_buffering_runtime.py | 108 ++++++++++++++++++ .../replays/unit/consumers/test_helpers.py | 27 +++++ .../replays/unit/consumers/test_recording.py | 33 ++++++ .../replays/unit/consumers/test_runtime.py | 84 ++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 tests/sentry/replays/unit/consumers/__init__.py create mode 100644 tests/sentry/replays/unit/consumers/test_buffering_runtime.py create mode 100644 tests/sentry/replays/unit/consumers/test_helpers.py create mode 100644 tests/sentry/replays/unit/consumers/test_recording.py create mode 100644 tests/sentry/replays/unit/consumers/test_runtime.py diff --git a/tests/sentry/replays/unit/consumers/__init__.py b/tests/sentry/replays/unit/consumers/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py new file mode 100644 index 00000000000000..a0c5f267991604 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -0,0 +1,108 @@ +from arroyo.types import Partition, Topic + +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +from tests.sentry.replays.unit.consumers.helpers import MockCommit, MockSink, make_kafka_message + + +def buffer_runtime(sink): + return buffering_runtime( + init_fn=lambda _: Model( + buffer=[], + can_flush=lambda m: len(m.buffer) == 5, + do_flush=lambda m: sink.accept(m.buffer), + offsets={}, + ), + process_fn=lambda m: int(m) if int(m) < 5 else None, + ) + + +def test_runtime_setup(): + runtime = buffer_runtime(MockSink()) + runtime.setup({}, commit=MockCommit()) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + + +def test_buffering_runtime_submit(): + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert three valid messages buffered. 
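+    # The process_fn parses values below five, so these grow the buffer without triggering a flush.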
+ runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert two invalid messages not buffered. + runtime.submit(make_kafka_message(b"5")) + runtime.submit(make_kafka_message(b"5")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert an additional message is buffered. + runtime.submit(make_kafka_message(b"2")) + assert runtime.model.buffer == [1, 1, 1, 2] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert the buffer is flushed to the sink. + runtime.submit(make_kafka_message(b"3")) + assert runtime.model.buffer == [] + assert sink.accepted == [1, 1, 1, 2, 3] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + +def test_buffering_runtime_publish(): + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + + # Assert three valid messages buffered. + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + + # Assert join subscriptions eagerly flush the buffer. + runtime.publish("join") + assert runtime.model.buffer == [] + assert sink.accepted == [1, 1, 1] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + # Reset the mocks. + mock_commit.commit = {} + sink.accepted = [] + + # Assert poll does not automatically empty the buffer. + runtime.submit(make_kafka_message(b"1")) + runtime.publish("poll") + assert runtime.model.buffer == [1] + assert sink.accepted == [] + + # Dirty mutation warning! We're forcing ourselves to be in a flushable state by mutating the + # state outside the explicit state machine interfaces. + runtime.model.buffer = [1, 2, 3, 4, 5] + assert runtime.model.buffer == [1, 2, 3, 4, 5] + assert sink.accepted == [] + + # Assert poll flushes if the buffer is ready. 
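+    # The forced five-item buffer satisfies can_flush, so the poll event triggers a flush.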
+ runtime.publish("poll") + assert runtime.model.buffer == [] + assert sink.accepted == [1, 2, 3, 4, 5] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py new file mode 100644 index 00000000000000..1f4eecbc060674 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -0,0 +1,27 @@ +from collections.abc import Mapping +from datetime import datetime + +from arroyo.backends.kafka import KafkaPayload +from arroyo.types import BrokerValue, Message, Partition, Topic + + +class MockCommit: + def __init__(self): + self.commit = {} + + def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> None: + self.commit.update(offsets) + + +class MockSink: + def __init__(self): + self.accepted = [] + + def accept(self, buffer): + self.accepted.extend(buffer) + + +def make_kafka_message(message: bytes): + return Message( + BrokerValue(KafkaPayload(b"k", message, []), Partition(Topic("a"), 1), 1, datetime.now()) + ) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py new file mode 100644 index 00000000000000..68f39fa70bf917 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -0,0 +1,33 @@ +import time +import uuid + +import msgpack + +from sentry.replays.consumers.buffered.consumer import process_message + +# from sentry.replays.usecases.ingest import ProcessedRecordingMessage + + +def test_process_message_invalid(): + result = process_message(msgpack.packb(b"hello, world!")) + assert result is None + + +def test_process_message(): + result = process_message( + msgpack.packb( + { + "type": "replay_recording_not_chunked", + "replay_id": uuid.uuid4().hex, + "org_id": 1, + "key_id": 3, + "project_id": 2, + "received": int(time.time()), + "retention_days": 30, + "payload": b'{"segment_id":0}\n[]', + "replay_event": None, + "replay_video": None, + } + ) + ) + assert result is None diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py new file mode 100644 index 00000000000000..07c26ae0e4e7f7 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -0,0 +1,84 @@ +from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime +from tests.sentry.replays.unit.consumers.helpers import MockCommit, make_kafka_message + + +def counter_runtime() -> RunTime[int, str]: + def init(_): + return (22, None) + + def process(_model, message, _offsets): + if message == b"incr": + return "incr" + elif message == b"decr": + return "decr" + else: + return None + + def update(model, msg): + if msg == "incr": + return (model + 1, None) + elif msg == "decr": + return (model - 1, None) + elif msg == "join": + return (-10, None) + elif msg == "poll": + return (99, None) + else: + raise ValueError("Unknown msg") + + def subscription(_): + return [ + Join(msg="join"), + Poll(msg="poll"), + ] + + return RunTime( + init=init, + process=process, + update=update, + subscription=subscription, + ) + + +def test_runtime_setup(): + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + +def test_runtime_submit(): + # RunTime defaults to a start point of 22. + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + # Two incr commands increase the count by 2. 
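+    # Each submitted message is decoded by process() into a msg and applied by update().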
+ runtime.submit(make_kafka_message(b"incr")) + runtime.submit(make_kafka_message(b"incr")) + assert runtime.model == 24 + + # Four decr commands decrease the count by 4. + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + assert runtime.model == 20 + + # Messages which the application does not understand do nothing to the model. + runtime.submit(make_kafka_message(b"other")) + assert runtime.model == 20 + + +def test_runtime_publish(): + # RunTime defaults to a start point of 22. + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + # A join event updates the model and sets it to -10. + runtime.publish("join") + assert runtime.model == -10 + + # A poll event updates the model and sets it to 99. + runtime.publish("poll") + assert runtime.model == 99 From de76ad0c585518df38599bd6b583e94c628f920b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 14:52:49 -0600 Subject: [PATCH 11/49] Add contextual errors --- .../replays/consumers/buffered/consumer.py | 2 +- src/sentry/replays/usecases/ingest/__init__.py | 18 +++++++++++++++--- .../unit/consumers/test_buffering_runtime.py | 6 +++++- .../replays/unit/consumers/test_runtime.py | 2 +- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 7b2c738fae5fe4..d8709b4ef354a5 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -99,7 +99,7 @@ def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) -recording_consumer = buffering_runtime( +recording_runtime = buffering_runtime( init_fn=init, process_fn=process_message, ) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 250fbbee3860be..03dc3b565aa617 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -47,6 +47,18 @@ class DropSilently(Exception): pass +class CouldNotDecode(DropSilently): + pass + + +class CouldNotParseHeaders(DropSilently): + pass + + +class CouldNotDecompress(DropSilently): + pass + + class ReplayRecordingSegment(TypedDict): id: str # a uuid that individualy identifies a recording segment chunks: int # the number of chunks for this segment @@ -336,7 +348,7 @@ def parse_recording_message(message: bytes) -> RecordingIngestMessage: message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) except ValidationError: logger.exception("Could not decode recording message.") - raise DropSilently() + raise CouldNotDecode() return RecordingIngestMessage( replay_id=message_dict["replay_id"], @@ -360,7 +372,7 @@ def parse_headers(recording: bytes, replay_id: str) -> tuple[RecordingSegmentHea return recording_headers, recording_segment except Exception: logger.exception("Recording headers could not be extracted %s", replay_id) - raise DropSilently() + raise CouldNotParseHeaders() @sentry_sdk.trace @@ -374,7 +386,7 @@ def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: return compressed_segment, segment else: logger.exception("Invalid recording body.") - raise DropSilently() + raise CouldNotDecompress() @sentry_sdk.trace diff --git 
a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index a0c5f267991604..3ee061e764f3bb 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -1,7 +1,11 @@ from arroyo.types import Partition, Topic from sentry.replays.consumers.buffered.lib import Model, buffering_runtime -from tests.sentry.replays.unit.consumers.helpers import MockCommit, MockSink, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import ( + MockCommit, + MockSink, + make_kafka_message, +) def buffer_runtime(sink): diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index 07c26ae0e4e7f7..179d5cf5760471 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,5 +1,5 @@ from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime -from tests.sentry.replays.unit.consumers.helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message def counter_runtime() -> RunTime[int, str]: From abf22cf92f91b1dba1578e872239284e9491dbc2 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 19:29:31 -0600 Subject: [PATCH 12/49] Misc test updates --- .../replays/unit/consumers/test_recording.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 68f39fa70bf917..22bb3701c74f7a 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -4,8 +4,7 @@ import msgpack from sentry.replays.consumers.buffered.consumer import process_message - -# from sentry.replays.usecases.ingest import ProcessedRecordingMessage +from sentry.replays.usecases.ingest import ProcessedRecordingMessage def test_process_message_invalid(): @@ -30,4 +29,19 @@ def test_process_message(): } ) ) - assert result is None + assert result is not None + assert result == ProcessedRecordingMessage( + actions_event=[], + filedata=b"[]", + filename=result.filename, + is_replay_video=False, + key_id=3, + org_id=1, + project_id=2, + received=result.received, + recording_size_uncompressed=2, + recording_size=result.recording_size, + replay_id=result.replay_id, + segment_id=0, + video_size=None, + ) From 2e16a0111b451e0439b8b4e73bdf847cd4504909 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 13:51:39 -0600 Subject: [PATCH 13/49] Fully separate compute and io within the recording consumer --- .../replays/usecases/ingest/__init__.py | 203 +++++++++++++++++- 1 file changed, 196 insertions(+), 7 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 28180ce7118b75..d975120052f187 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -89,7 +89,7 @@ class RecordingIngestMessage: replay_video: bytes | None -def ingest_recording(message: bytes) -> None: +def ingest_recording(message_bytes: bytes) -> None: """Ingest non-chunked recording messages.""" isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() @@ -105,7 +105,11 @@ def ingest_recording(message: bytes) -> None: ) try: - 
_ingest_recording(message) + message = parse_recording_message(message_bytes) + if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): + _ingest_recording_separated_io_compute(message) + else: + _ingest_recording(message) except DropSilently: # The message couldn't be parsed for whatever reason. We shouldn't block the consumer # so we ignore it. @@ -115,10 +119,8 @@ def ingest_recording(message: bytes) -> None: @sentry_sdk.trace -def _ingest_recording(message_bytes: bytes) -> None: +def _ingest_recording(message: RecordingIngestMessage) -> None: """Ingest recording messages.""" - message = parse_recording_message(message_bytes) - set_tag("org_id", message.org_id) set_tag("project_id", message.project_id) @@ -190,6 +192,9 @@ def track_initial_segment_event( ) -> None: try: project = Project.objects.get_from_cache(id=project_id) + return _track_initial_segment_event( + org_id, project, replay_id, key_id, received, is_replay_video + ) except Project.DoesNotExist: logger.warning( "Recording segment was received for a project that does not exist.", @@ -200,6 +205,15 @@ def track_initial_segment_event( ) return None + +def _track_initial_segment_event( + org_id: int, + project: Project, + replay_id, + key_id: int | None, + received: int, + is_replay_video: bool, +) -> None: if not project.flags.has_replays: first_replay_received.send_robust(project=project, sender=Project) @@ -208,7 +222,7 @@ def track_initial_segment_event( metrics.incr("replays.billing-outcome-skipped") track_outcome( org_id=org_id, - project_id=project_id, + project_id=project.id, key_id=key_id, outcome=Outcome.ACCEPTED, reason=None, @@ -220,7 +234,7 @@ def track_initial_segment_event( else: track_outcome( org_id=org_id, - project_id=project_id, + project_id=project.id, key_id=key_id, outcome=Outcome.ACCEPTED, reason=None, @@ -400,3 +414,178 @@ def emit_replay_events( log_option_events(event_meta, project.id, replay_id) report_hydration_error(event_meta, project, replay_id, replay_event) report_rage_click(event_meta, project, replay_id, replay_event) + + +# Separated I/O and compute branch. 
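+#
+# process_recording_message performs the compute-only transformation, commit_recording_message
+# handles the I/O, and track_recording_metadata emits metrics and logs after the fact.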
+ + +class CouldNotFindProject(DropSilently): + pass + + +@sentry_sdk.trace +def _ingest_recording_separated_io_compute(message: RecordingIngestMessage) -> None: + """Ingest recording messages.""" + processed_recording = process_recording_message(message) + commit_recording_message(processed_recording) + track_recording_metadata(processed_recording) + + +@dataclasses.dataclass +class ProcessedRecordingMessage: + actions_event: ParsedEventMeta | None + filedata: bytes + filename: str + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + recording_size_uncompressed: int + recording_size: int + replay_event: dict[str, Any] | None + replay_id: str + segment_id: int + video_size: int | None + + +@sentry_sdk.trace +def process_recording_message(message: RecordingIngestMessage) -> ProcessedRecordingMessage: + set_tag("org_id", message.org_id) + set_tag("project_id", message.project_id) + + headers, segment_bytes = parse_headers(message.payload_with_headers, message.replay_id) + segment = decompress_segment(segment_bytes) + + replay_events = parse_replay_events(message, headers, segment.decompressed) + + with sentry_sdk.start_span(name="Parse replay event"): + replay_event = json.loads(message.replay_event) if message.replay_event else None + + filename = make_recording_filename( + RecordingSegmentStorageMeta( + project_id=message.project_id, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + retention_days=message.retention_days, + ) + ) + + if message.replay_video: + with sentry_sdk.start_span(name="Compress video event"): + filedata = zlib.compress(pack(rrweb=segment.decompressed, video=message.replay_video)) + video_size = len(message.replay_video) + else: + filedata = segment.compressed + video_size = None + + return ProcessedRecordingMessage( + replay_events, + filedata, + filename, + is_replay_video=message.replay_video is not None, + key_id=message.key_id, + org_id=message.org_id, + project_id=message.project_id, + received=message.received, + recording_size_uncompressed=len(segment.decompressed), + recording_size=len(segment.compressed), + replay_event=replay_event, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + video_size=video_size, + ) + + +@sentry_sdk.trace +def commit_recording_message(recording: ProcessedRecordingMessage) -> None: + # Write to GCS. + storage_kv.set(recording.filename, recording.filedata) + + try: + project = Project.objects.get_from_cache(id=recording.project_id) + assert isinstance(project, Project) + except Project.DoesNotExist: + logger.warning( + "Recording segment was received for a project that does not exist.", + extra={ + "project_id": recording.project_id, + "replay_id": recording.replay_id, + }, + ) + raise CouldNotFindProject() + + # Write to billing consumer if its a billable event. + if recording.segment_id == 0: + _track_initial_segment_event( + recording.org_id, + project, + recording.replay_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) + + # Write to replay-event consumer. + if recording.actions_event: + emit_replay_events( + recording.actions_event, + recording.org_id, + project, + recording.replay_id, + recording.replay_event, + ) + + +@sentry_sdk.trace +def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: + # Report size metrics to determine usage patterns. 
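+    # Both the compressed and uncompressed sizes are emitted.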
+ _report_size_metrics( + size_compressed=recording.recording_size, + size_uncompressed=recording.recording_size_uncompressed, + ) + + if recording.video_size: + # Logging org info for bigquery + logger.info( + "sentry.replays.slow_click", + extra={ + "event_type": "mobile_event", + "org_id": recording.org_id, + "project_id": recording.project_id, + "size": len(recording.filedata), + }, + ) + + # Track the number of replay-video events we receive. + metrics.incr("replays.recording_consumer.replay_video_count") + + # Record video size for COGS analysis. + metrics.distribution( + "replays.recording_consumer.replay_video_size", + recording.video_size, + unit="byte", + ) + + # Track combined payload size for COGs analysis. + metrics.distribution( + "replays.recording_consumer.replay_video_event_size", + len(recording.filedata), + unit="byte", + ) + + +def parse_replay_events( + message: RecordingIngestMessage, headers: RecordingSegmentHeaders, segment_bytes: bytes +) -> ParsedEventMeta | None: + try: + return parse_events(json.loads(segment_bytes)) + except Exception: + logging.exception( + "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", + message.org_id, + message.project_id, + message.replay_id, + headers["segment_id"], + ) + return None From 208360fc0696951eed2f26388e4a9240e518ba17 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 19:23:33 -0600 Subject: [PATCH 14/49] Configure max workers --- src/sentry/replays/consumers/buffered/buffer_managers.py | 4 ++-- src/sentry/replays/consumers/buffered/consumer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py index c017b7ebb40628..4cc4ecf8037040 100644 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -136,9 +136,9 @@ class ThreadedBufferManager: ourselves that there are no defects in the `BatchedBufferManager`. """ - def __init__(self) -> None: + def __init__(self, max_workers: int) -> None: self.fn = commit_recording_message - self.max_workers = 100 + self.max_workers = max_workers @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index d8709b4ef354a5..89f2a15408aa9c 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -53,7 +53,7 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: org_ids = set(options.get("replay.consumer.use-file-batching")) threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) - threaded_buffer = ThreadedBufferManager() + threaded_buffer = ThreadedBufferManager(max_workers=32) threaded_buffer.commit(threaded) # ...But others will be opted in to the batched version of uploading. 
This will give us a From fc8df9cd1858751e156c6ee15f499f286b6bb31d Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 19:49:04 -0600 Subject: [PATCH 15/49] Remove conditional branch as its moved further up the hierarchy --- .../replays/usecases/ingest/__init__.py | 59 ++++++++----------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 0b9ae88ba6c9bb..8d362e9a20b509 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -273,42 +273,29 @@ def recording_post_processor( try: segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) - # Conditionally use the new separated event parsing and logging logic. This way we can - # feature flag access and fix any issues we find. - if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): - event_meta = parse_events(segment) - project = Project.objects.get_from_cache(id=message.project_id) - emit_replay_events( - event_meta, - message.org_id, - project, - message.replay_id, - replay_event, - ) - else: - actions_event = try_get_replay_actions(message, segment, replay_event) - if actions_event: - emit_replay_actions(actions_event) - - # Log canvas mutations to bigquery. - log_canvas_size_old( - message.org_id, - message.project_id, - message.replay_id, - segment, - ) - - # Log # of rrweb events to bigquery. - logger.info( - "sentry.replays.slow_click", - extra={ - "event_type": "rrweb_event_count", - "org_id": message.org_id, - "project_id": message.project_id, - "replay_id": message.replay_id, - "size": len(segment), - }, - ) + actions_event = try_get_replay_actions(message, segment, replay_event) + if actions_event: + emit_replay_actions(actions_event) + + # Log canvas mutations to bigquery. + log_canvas_size_old( + message.org_id, + message.project_id, + message.replay_id, + segment, + ) + + # Log # of rrweb events to bigquery. + logger.info( + "sentry.replays.slow_click", + extra={ + "event_type": "rrweb_event_count", + "org_id": message.org_id, + "project_id": message.project_id, + "replay_id": message.replay_id, + "size": len(segment), + }, + ) except Exception: logging.exception( "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", From 536c250e6f15856b1a8e8a9f51141655d1a3df92 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 20:00:11 -0600 Subject: [PATCH 16/49] Use context manager --- .../replays/consumers/buffered/consumer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 89f2a15408aa9c..a4ea7138505236 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -73,9 +73,8 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: have no I/O here. We'll commit the I/O on flush. 
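+
+    Exceptions other than DropSilently are allowed to propagate so the platform can restart from
+    the last committed offset.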
""" isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): - transaction = sentry_sdk.start_transaction( + with sentry_sdk.start_transaction( name="replays.consumer.process_recording", op="replays.consumer", custom_sampling_context={ @@ -83,14 +82,11 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 ) }, - ) - - try: - return process_recording_message(message) - except DropSilently: - return None - finally: - transaction.finish() + ): + try: + return process_recording_message(message) + except DropSilently: + return None def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: From ea571fa3f682803338205fea052f83fcab248c1c Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 20:26:32 -0600 Subject: [PATCH 17/49] Simplify buffer flushing (for now) --- .../replays/consumers/buffered/consumer.py | 53 +++++++++++-------- .../replays/usecases/ingest/__init__.py | 1 - 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index a4ea7138505236..b69e83893f5c57 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,18 +1,16 @@ import time +from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk from django.conf import settings -from sentry import options -from sentry.replays.consumers.buffered.buffer_managers import ( - BatchedBufferManager, - ThreadedBufferManager, -) from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( DropSilently, ProcessedRecordingMessage, + commit_recording_message, process_recording_message, + track_recording_metadata, ) @@ -46,26 +44,39 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - # During the transitionary period most organizations will commit following a traditional - # commit pattern... - # - # TODO: Would be nice to cache this value. - org_ids = set(options.get("replay.consumer.use-file-batching")) - - threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) - threaded_buffer = ThreadedBufferManager(max_workers=32) - threaded_buffer.commit(threaded) - - # ...But others will be opted in to the batched version of uploading. This will give us a - # chance to test the feature with as minimal risk as possible. - batched = list(filter(lambda i: i.org_id in org_ids, model.buffer)) - batched_buffer = BatchedBufferManager() - batched_buffer.commit(batched) - + flush_buffer(model, max_workers=32) # Update the buffer manager with the new time so we don't continuously commit in a loop! self.__last_flushed_at = time.time() +def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) -> None: + # Use as many workers as necessary up to a limit. We don't want to start thousands of + # worker threads. + max_workers = min(len(model.buffer), max_workers) + + with ThreadPoolExecutor(max_workers=max_workers) as pool: + # We apply whatever function is defined on the class to each message in the list. This + # is useful for testing reasons (dependency injection). + futures = [pool.submit(commit_recording_message, message) for message in model.buffer] + + # Tasks can fail. 
We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + if exc is not None: + raise exc + + # Recording metadata is not tracked in the threadpool. This is because this function will + # log. Logging will acquire a lock and make our threading less useful due to the speed of + # the I/O we do in this step. + for message in model.buffer: + track_recording_metadata(message) + + return None + + def process_message(message: bytes) -> ProcessedRecordingMessage | None: """Message processing function. diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 8d362e9a20b509..05a4ddec7a267b 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -483,7 +483,6 @@ def process_recording_message(message: RecordingIngestMessage) -> ProcessedRecor @sentry_sdk.trace def commit_recording_message(recording: ProcessedRecordingMessage) -> None: - # Write to GCS. storage_kv.set(recording.filename, recording.filedata) try: From 373388636b6359ee3249d40981d4136744a4333e Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 08:31:02 -0600 Subject: [PATCH 18/49] Update tracing logic --- .../replays/consumers/buffered/consumer.py | 42 +++++++++---------- .../replays/usecases/ingest/__init__.py | 41 +++++++++--------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index b69e83893f5c57..f6df66284447f8 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,8 +1,8 @@ +import contextlib import time from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk -from django.conf import settings from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( @@ -10,6 +10,7 @@ ProcessedRecordingMessage, commit_recording_message, process_recording_message, + sentry_tracing, track_recording_metadata, ) @@ -42,14 +43,16 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) - @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - flush_buffer(model, max_workers=32) - # Update the buffer manager with the new time so we don't continuously commit in a loop! - self.__last_flushed_at = time.time() + with sentry_tracing("replays.consumers.buffered.flush_buffer"): + flush_buffer(model, max_workers=8) + # Update the buffer manager with the new time so we don't continuously commit in a + # loop! + self.__last_flushed_at = time.time() -def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) -> None: +@sentry_sdk.trace +def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: # Use as many workers as necessary up to a limit. We don't want to start thousands of # worker threads. max_workers = min(len(model.buffer), max_workers) @@ -57,7 +60,7 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) with ThreadPoolExecutor(max_workers=max_workers) as pool: # We apply whatever function is defined on the class to each message in the list. 
This # is useful for testing reasons (dependency injection). - futures = [pool.submit(commit_recording_message, message) for message in model.buffer] + futures = [pool.submit(flush_message, message) for message in model.buffer] # Tasks can fail. We check the done set for any failures. We will wait for all the # futures to complete before running this step or eagerly run this step if any task @@ -77,27 +80,22 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) return None +@sentry_sdk.trace +def flush_message(message: ProcessedRecordingMessage) -> None: + """Message flushing function.""" + with contextlib.suppress(DropSilently): + commit_recording_message(message) + + def process_message(message: bytes) -> ProcessedRecordingMessage | None: """Message processing function. Accepts an unstructured type and returns a structured one. Other than tracing the goal is to have no I/O here. We'll commit the I/O on flush. """ - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): - with sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr( - settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 - ) - }, - ): - try: - return process_recording_message(message) - except DropSilently: - return None + with sentry_tracing("replays.consumers.buffered.process_message"): + with contextlib.suppress(DropSilently): + return process_recording_message(message) def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index ea5c479318dde3..104e9b5a44cc5c 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -4,6 +4,7 @@ import logging import time import zlib +from contextlib import contextmanager from datetime import datetime, timezone from typing import Any, TypedDict, cast @@ -91,30 +92,30 @@ class RecordingIngestMessage: replay_video: bytes | None -def ingest_recording(message_bytes: bytes) -> None: - """Ingest non-chunked recording messages.""" +@contextmanager +def sentry_tracing(name: str): + sample_rate = getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): with sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr( - settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 - ) - }, + name=name, op="replays.consumer", custom_sampling_context={"sample_rate": sample_rate} ): - try: - message = parse_recording_message(message_bytes) - if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): - _ingest_recording_separated_io_compute(message) - else: - _ingest_recording(message) - except DropSilently: - # The message couldn't be parsed for whatever reason. We shouldn't block the consumer - # so we ignore it. 
- pass + yield + + +def ingest_recording(message_bytes: bytes) -> None: + """Ingest non-chunked recording messages.""" + with sentry_tracing("replays.consumer.process_recording"): + try: + message = parse_recording_message(message_bytes) + if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): + _ingest_recording_separated_io_compute(message) + else: + _ingest_recording(message) + except DropSilently: + # The message couldn't be parsed for whatever reason. We shouldn't block the consumer + # so we ignore it. + pass @sentry_sdk.trace From 88747c123d9129bd174ca025e864879c648f23a6 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:06:15 -0600 Subject: [PATCH 19/49] Soften flag requirements and minor fixes --- .../replays/consumers/buffered/consumer.py | 18 +++++++++++------- .../sentry/replays/consumers/test_recording.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index f6df66284447f8..9a4fa0339de0ed 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -9,6 +9,7 @@ DropSilently, ProcessedRecordingMessage, commit_recording_message, + parse_recording_message, process_recording_message, sentry_tracing, track_recording_metadata, @@ -33,8 +34,11 @@ class BufferManager: """ def __init__(self, flags: dict[str, str]) -> None: - self.__max_buffer_length = int(flags["max_buffer_length"]) - self.__max_buffer_wait = int(flags["max_buffer_wait"]) + # Flags are safely extracted and default arguments are used. + self.__max_buffer_length = int(flags.get("max_buffer_length", 8)) + self.__max_buffer_wait = int(flags.get("max_buffer_wait", 1)) + self.__max_workers = int(flags.get("max_workers", 8)) + self.__last_flushed_at = time.time() def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: @@ -45,7 +49,7 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: with sentry_tracing("replays.consumers.buffered.flush_buffer"): - flush_buffer(model, max_workers=8) + flush_buffer(model, max_workers=self.__max_workers) # Update the buffer manager with the new time so we don't continuously commit in a # loop! self.__last_flushed_at = time.time() @@ -53,9 +57,8 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: @sentry_sdk.trace def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: - # Use as many workers as necessary up to a limit. We don't want to start thousands of - # worker threads. - max_workers = min(len(model.buffer), max_workers) + if len(model.buffer) == 0: + return None with ThreadPoolExecutor(max_workers=max_workers) as pool: # We apply whatever function is defined on the class to each message in the list. This @@ -87,7 +90,7 @@ def flush_message(message: ProcessedRecordingMessage) -> None: commit_recording_message(message) -def process_message(message: bytes) -> ProcessedRecordingMessage | None: +def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: """Message processing function. Accepts an unstructured type and returns a structured one. 
Other than tracing the goal is to @@ -95,6 +98,7 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: """ with sentry_tracing("replays.consumers.buffered.process_message"): with contextlib.suppress(DropSilently): + message = parse_recording_message(message_bytes) return process_recording_message(message) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index e2958e54dd7a88..90f551113adedf 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -581,3 +581,15 @@ def test_invalid_message(self): {"replay.consumer.separate-compute-and-io-org-ids": [self.organization.id]} ): super().test_invalid_message() + + +from sentry.replays.consumers.buffered.consumer import recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory + + +class RunTimeConsumerTestCase(RecordingTestCase): + def processing_factory(self): + return PlatformStrategyFactory( + flags={"max_buffer_length": 8, "max_buffer_wait": 1}, + runtime=recording_runtime, + ) From 74e811f939510a89b24eeea400eabff5b2ed45b0 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:27:55 -0600 Subject: [PATCH 20/49] Remove buffer managers module --- .../consumers/buffered/buffer_managers.py | 169 ------------------ src/sentry/replays/models.py | 15 -- 2 files changed, 184 deletions(-) delete mode 100644 src/sentry/replays/consumers/buffered/buffer_managers.py diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py deleted file mode 100644 index 4cc4ecf8037040..00000000000000 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ /dev/null @@ -1,169 +0,0 @@ -import dataclasses -import uuid -from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait - -import sentry_sdk - -from sentry.replays.lib.storage import storage_kv -from sentry.replays.models import FilePart -from sentry.replays.usecases.ingest import ( - ProcessedRecordingMessage, - commit_recording_message, - track_initial_segment_event, - track_recording_metadata, -) -from sentry.replays.usecases.ingest.dom_index import _initialize_publisher, emit_replay_actions - - -@dataclasses.dataclass(frozen=True) -class FilePartRow: - key: str - range_start: int - range_stop: int - - -@dataclasses.dataclass(frozen=True) -class MergedBuffer: - buffer: bytes - buffer_rows: list[FilePartRow] - filename: str - - -class BatchedBufferManager: - """Batched buffer manager. - - The batched buffer manager takes a list of messages and merges them into a single file before - committing them to permanent storage. We store filename and byte locations in a PostgreSQL - table. 
- """ - - @sentry_sdk.trace - def commit(self, messages: list[ProcessedRecordingMessage]) -> None: - merged_buffer = self.merge_buffer(messages) - self.commit_merged_buffer(merged_buffer) - self.bulk_track_initial_segment_events(messages) - self.bulk_emit_action_events(messages) - self.bulk_track_recording_metadata(messages) - - @sentry_sdk.trace - def merge_buffer(self, buffer: list[ProcessedRecordingMessage]) -> MergedBuffer: - def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: - range_start = len(parts) - range_stop = range_start + len(part) - 1 - return (parts + part, FilePartRow(key, range_start, range_stop)) - - parts = b"" - parts_rows = [] - - for item in buffer: - # The recording data is appended to the buffer and a row for tracking is - # returned. - parts, part_row = _append_part(parts, item.filedata, key=item.filename) - parts_rows.append(part_row) - - return MergedBuffer( - buffer=parts, - buffer_rows=parts_rows, - filename=f"90/{uuid.uuid4().hex}", - ) - - @sentry_sdk.trace - def commit_merged_buffer(self, buffer: MergedBuffer) -> None: - if buffer.buffer == b"": - return None - - # Storage goes first. Headers go second. If we were to store the headers first we could - # expose a record that does not exist to the user which would ultimately 404. - storage_kv.set(buffer.filename, buffer.buffer) - - # Bulk insert the filepart. - FilePart.objects.bulk_create( - FilePart( - filename=buffer.filename, - key=row.key, - range_start=row.range_start, - range_stop=row.range_stop, - ) - for row in buffer.buffer_rows - ) - - @sentry_sdk.trace - def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None: - # Each item in the buffer needs to record a billing outcome. Not every segment will bill - # but we defer that behavior to the `track_initial_segment_event` function. - for item in items: - track_initial_segment_event( - item.org_id, - item.project_id, - item.replay_id, - item.segment_id, - item.key_id, - item.received, - item.is_replay_video, - ) - - @sentry_sdk.trace - def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> None: - # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The - # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed - # before we can consider this buffer fully flushed and commit our local offsets. - for item in items: - if item.actions_event: - emit_replay_actions(item.actions_event) - - # Ensure the replay-actions were committed to the broker before we commit this batch. - publisher = _initialize_publisher() - publisher.flush() - - @sentry_sdk.trace - def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None: - # Each item in the buffer needs to record metrics about itself. - for item in items: - track_recording_metadata(item) - - -class ThreadedBufferManager: - """Threaded buffer manager. - - The threaded buffer manager is the original way we uploaded files except in an application - managed thread-pool. We iterate over each processed recording, commit them in a thread, and - finally return null when all the tasks complete. It requires no changes elsewhere in the code - to accomodate. - - The goal of this class is to _not_ be clever. We want to as closely as possible immitate - current production behavior. 
The reason being is that a consumer refactor is a difficult task - to feature flag and we want to gradually rollout the behavior over time after assuring - ourselves that there are no defects in the `BatchedBufferManager`. - """ - - def __init__(self, max_workers: int) -> None: - self.fn = commit_recording_message - self.max_workers = max_workers - - @sentry_sdk.trace - def commit(self, messages: list[ProcessedRecordingMessage]) -> None: - # Use as many workers as necessary up to a limit. We don't want to start thousands of - # worker threads. - max_workers = min(len(messages), self.max_workers) - - with ThreadPoolExecutor(max_workers=max_workers) as pool: - # We apply whatever function is defined on the class to each message in the list. This - # is useful for testing reasons (dependency injection). - futures = [pool.submit(self.fn, message) for message in messages] - - # Tasks can fail. We check the done set for any failures. We will wait for all the - # futures to complete before running this step or eagerly run this step if any task - # errors. - done, _ = wait(futures, return_when=FIRST_EXCEPTION) - for future in done: - exc = future.exception() - if exc is not None: - raise exc - - # Recording metadata is not tracked in the threadpool. This is because this function will - # log. Logging will acquire a lock and make our threading less useful due to the speed of - # the I/O we do in this step. - for message in messages: - track_recording_metadata(message) - - return None diff --git a/src/sentry/replays/models.py b/src/sentry/replays/models.py index b20cb45efe928e..1d4099d0155728 100644 --- a/src/sentry/replays/models.py +++ b/src/sentry/replays/models.py @@ -46,18 +46,3 @@ def delete(self, *args, **kwargs): rv = super().delete(*args, **kwargs) return rv - - -@region_silo_model -class FilePart(Model): - __relocation_scope__ = RelocationScope.Excluded - - date_added = models.DateTimeField(default=timezone.now, null=False) - filename = models.CharField(null=False) - key = models.CharField(db_index=True, null=False) - range_start = models.IntegerField(null=False) - range_stop = models.IntegerField(null=False) - - class Meta: - app_label = "replays" - db_table = "replays_filepart" From 8f8e5ce2cd5a89ccd156c8399ff0006ba4ba2540 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:30:51 -0600 Subject: [PATCH 21/49] Test clean up --- tests/sentry/replays/consumers/test_recording.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 90f551113adedf..8f5c6073b774a4 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -13,6 +13,8 @@ from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.models.organizationonboardingtask import OnboardingTask, OnboardingTaskStatus +from sentry.replays.consumers.buffered.consumer import recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory from sentry.replays.consumers.recording import ProcessReplayRecordingStrategyFactory from sentry.replays.consumers.recording_buffered import ( RecordingBufferedStrategyFactory, @@ -583,13 +585,9 @@ def test_invalid_message(self): super().test_invalid_message() -from sentry.replays.consumers.buffered.consumer import recording_runtime -from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory - - -class 
RunTimeConsumerTestCase(RecordingTestCase): +class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): return PlatformStrategyFactory( - flags={"max_buffer_length": 8, "max_buffer_wait": 1}, + flags={"max_buffer_length": 8, "max_buffer_wait": 1, "max_workers": 8}, runtime=recording_runtime, ) From fac3204d830a4a615bf5b3a6557c97ad8e28f817 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:40:39 -0600 Subject: [PATCH 22/49] Fix unit test --- tests/sentry/replays/unit/consumers/test_recording.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 22bb3701c74f7a..0872ee2184c2c2 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -1,10 +1,12 @@ import time import uuid +import zlib import msgpack from sentry.replays.consumers.buffered.consumer import process_message from sentry.replays.usecases.ingest import ProcessedRecordingMessage +from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta def test_process_message_invalid(): @@ -29,10 +31,12 @@ def test_process_message(): } ) ) + + expected_event_metadata = ParsedEventMeta([], [], [], [], [], []) assert result is not None assert result == ProcessedRecordingMessage( - actions_event=[], - filedata=b"[]", + actions_event=expected_event_metadata, + filedata=zlib.compress(b"[]"), filename=result.filename, is_replay_video=False, key_id=3, @@ -41,7 +45,9 @@ def test_process_message(): received=result.received, recording_size_uncompressed=2, recording_size=result.recording_size, + retention_days=30, replay_id=result.replay_id, segment_id=0, video_size=None, + replay_event=None, ) From ae59eb9439f8cc9d6881edeb2285d75dcb753e02 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:41:39 -0600 Subject: [PATCH 23/49] Add explicit return --- src/sentry/replays/consumers/buffered/consumer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 9a4fa0339de0ed..2220960d4813d2 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -100,6 +100,7 @@ def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: with contextlib.suppress(DropSilently): message = parse_recording_message(message_bytes) return process_recording_message(message) + return None def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: From a6a32e52b3531eba361b7d5e57ef7531a387dce9 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:42:57 -0600 Subject: [PATCH 24/49] Fix typing --- tests/sentry/replays/consumers/test_recording.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 8f5c6073b774a4..55ddb7d1cc79c1 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -588,6 +588,6 @@ def test_invalid_message(self): class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): return PlatformStrategyFactory( - flags={"max_buffer_length": 8, "max_buffer_wait": 1, "max_workers": 8}, + flags={"max_buffer_length": "8", "max_buffer_wait": "1", "max_workers": "8"}, 
runtime=recording_runtime, ) From f1026ca2574f8fc79a312581ff67e1ffc4419b9b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:54:32 -0600 Subject: [PATCH 25/49] Remove unused option --- src/sentry/options/defaults.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 210801aa64d3dd..85cd3215d0792a 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -456,12 +456,6 @@ default=None, flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, ) -register( - "replay.consumer.use-file-batching", - type=Sequence, - default=[], - flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, -) # Globally disables replay-video. register( "replay.replay-video.disabled", From b363596c99423c60eec401cee343e8f701846cff Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:56:15 -0600 Subject: [PATCH 26/49] Reset dom_index module to align with master --- .../replays/usecases/ingest/dom_index.py | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index ecf0f5eb0ae340..b54ba8535ddf63 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -63,22 +63,6 @@ class ReplayActionsEvent(TypedDict): type: Literal["replay_event"] -def parse_and_emit_replay_actions( - project: Project, - replay_id: str, - retention_days: int, - segment_data: list[dict[str, Any]], - replay_event: dict[str, Any] | None, - org_id: int | None = None, -) -> None: - with metrics.timer("replays.usecases.ingest.dom_index.parse_and_emit_replay_actions"): - message = parse_replay_actions( - project, replay_id, retention_days, segment_data, replay_event, org_id=org_id - ) - if message is not None: - emit_replay_actions(message) - - @sentry_sdk.trace def emit_replay_actions(action: ReplayActionsEvent) -> None: publisher = _initialize_publisher() @@ -237,14 +221,13 @@ def _get_testid(container: dict[str, str]) -> str: ) -def _initialize_publisher(asynchronous: bool = True) -> KafkaPublisher: +def _initialize_publisher() -> KafkaPublisher: global replay_publisher if replay_publisher is None: config = kafka_config.get_topic_definition(Topic.INGEST_REPLAY_EVENTS) replay_publisher = KafkaPublisher( - kafka_config.get_kafka_producer_cluster_options(config["cluster"]), - asynchronous=asynchronous, + kafka_config.get_kafka_producer_cluster_options(config["cluster"]) ) return replay_publisher From c656420452f32a4e0fe7a02ed2e8aa100b34260f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 10:17:02 -0600 Subject: [PATCH 27/49] Update buffering run-time coverage --- .../unit/consumers/test_buffering_runtime.py | 60 +++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 3ee061e764f3bb..6802ef3f9c7ab7 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -1,3 +1,4 @@ +import pytest from arroyo.types import Partition, Topic from sentry.replays.consumers.buffered.lib import Model, buffering_runtime @@ -11,16 +12,27 @@ def buffer_runtime(sink): return buffering_runtime( init_fn=lambda _: Model( + # The buffer is an arbitrary list of items but 
it could be anything. For example a
+            # string which you could perpetually append to.
             buffer=[],
+            # Commit when the buffer has 5 messages in it.
             can_flush=lambda m: len(m.buffer) == 5,
+            # Flush in this case is just pushing the buffer into a mock class which holds the
+            # messages in memory. This is easy for us to test and we don't need to mock out any
+            # service code which is both unknown to the RunTime and can be tested independently.
             do_flush=lambda m: sink.accept(m.buffer),
+            # This is the offsets tracking object. The RunTime manages appending.
             offsets={},
         ),
+        # Our process function is a simple lambda which checks that the message is a valid
+        # integer and is less than 5. There's no exception handling here. That could crash the
+        # consumer! It's up to the application developer to manage.
         process_fn=lambda m: int(m) if int(m) < 5 else None,
     )


 def test_runtime_setup():
+    """Test RunTime initialization."""
     runtime = buffer_runtime(MockSink())
     runtime.setup({}, commit=MockCommit())
     assert runtime.model.buffer == []
@@ -28,6 +40,11 @@ def test_runtime_setup():


 def test_buffering_runtime_submit():
+    """Test a message received from the platform.
+
+    Messages are buffered up until a point (see the RunTime definition at the top of the module).
+    When we cross the threshold, the RunTime should order a buffer flush action.
+    """
     mock_commit = MockCommit()
     sink = MockSink()

@@ -66,7 +83,29 @@ def test_buffering_runtime_submit():
     assert mock_commit.commit == {Partition(Topic("a"), 1): 2}


-def test_buffering_runtime_publish():
+def test_buffering_runtime_submit_invalid_message():
+    """Test an invalid message received from the platform."""
+    mock_commit = MockCommit()
+    sink = MockSink()
+
+    runtime = buffer_runtime(sink)
+    runtime.setup({}, mock_commit)
+    assert runtime.model.buffer == []
+    assert runtime.model.offsets == {}
+    assert sink.accepted == []
+    assert mock_commit.commit == {}
+
+    # Assert exceptional behavior is not handled.
+    with pytest.raises(ValueError):
+        runtime.submit(make_kafka_message(b"hello"))
+
+
+def test_buffering_runtime_join():
+    """Test the RunTime receiving a join message from the platform.
+
+    When the consumer is ordered to shut down, a "join" message will be submitted to the RunTime.
+    When that happens we rush to flush the buffer and commit whatever we can.
+    """
     mock_commit = MockCommit()
     sink = MockSink()

@@ -89,9 +128,22 @@ def test_buffering_runtime_publish():
     assert sink.accepted == [1, 1, 1]
     assert mock_commit.commit == {Partition(Topic("a"), 1): 2}

-    # Reset the mocks.
-    mock_commit.commit = {}
-    sink.accepted = []
+
+def test_buffering_runtime_poll():
+    """Test the RunTime receiving a poll message from the platform.
+
+    This will happen periodically. If the consumer does not receive messages for some interval,
+    a poll message will be published.
+    """
+    mock_commit = MockCommit()
+    sink = MockSink()
+
+    runtime = buffer_runtime(sink)
+    runtime.setup({}, mock_commit)
+    assert runtime.model.buffer == []
+    assert runtime.model.offsets == {}
+    assert sink.accepted == []
+    assert mock_commit.commit == {}

     # Assert poll does not automatically empty the buffer.
runtime.submit(make_kafka_message(b"1")) From 29bb826ddfbfc322f65d473a631e38bc2e6aea88 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 10:52:15 -0600 Subject: [PATCH 28/49] Update test ordering --- tests/sentry/replays/unit/consumers/test_recording.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 0872ee2184c2c2..162dfdf065d633 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -9,11 +9,6 @@ from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -def test_process_message_invalid(): - result = process_message(msgpack.packb(b"hello, world!")) - assert result is None - - def test_process_message(): result = process_message( msgpack.packb( @@ -51,3 +46,8 @@ def test_process_message(): video_size=None, replay_event=None, ) + + +def test_process_message_invalid(): + result = process_message(msgpack.packb(b"hello, world!")) + assert result is None From b4c64775df4ba1d4c9b69848b7f155834e7a0909 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 11:47:49 -0600 Subject: [PATCH 29/49] Add offset committing test --- .../unit/consumers/test_buffering_runtime.py | 20 +++++++++++++++++++ .../replays/unit/consumers/test_helpers.py | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 6802ef3f9c7ab7..4f68108fc9e332 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -162,3 +162,23 @@ def test_buffering_runtime_poll(): assert runtime.model.buffer == [] assert sink.accepted == [1, 2, 3, 4, 5] assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + +def test_buffering_runtime_committed_offsets(): + """Test offset commit.""" + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert committed offsets are correct. 
+ runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1", topic="b")) + runtime.submit(make_kafka_message(b"1", offset=2)) + runtime.publish("join") + assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 1f4eecbc060674..3543885e55b07b 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -21,7 +21,9 @@ def accept(self, buffer): self.accepted.extend(buffer) -def make_kafka_message(message: bytes): +def make_kafka_message(message: bytes, topic: str = "a", index: int = 1, offset: int = 1): return Message( - BrokerValue(KafkaPayload(b"k", message, []), Partition(Topic("a"), 1), 1, datetime.now()) + BrokerValue( + KafkaPayload(b"k", message, []), Partition(Topic(topic), index), offset, datetime.now() + ) ) From 2572955d773d2d71f7b32c3719ed8285e19a611d Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 14:37:19 -0600 Subject: [PATCH 30/49] Add docs --- src/sentry/replays/consumers/buffered/lib.py | 6 ++- .../replays/consumers/buffered/platform.py | 42 ++++++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index ff89587a54ce7b..2b69094f109aa9 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -1,4 +1,8 @@ -"""Buffered RunTime implementation.""" +"""Buffered RunTime implementation. + +The Buffering RunTime eagerly processes messages as they are received and waits until its buffer +is full before flushing those messages. The goal being to achieve efficiencies from batched I/O. +""" from collections.abc import Callable, MutableMapping from dataclasses import dataclass diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 252377c3f3b2a2..fda10dfa8579f9 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -70,22 +70,20 @@ def join(self, timeout: float | None = None) -> None: CommitProtocol = Callable[[Mapping[Partition, int]], None] -Model = TypeVar("Model") -Msg = TypeVar("Msg") - -class BackPressure: - """Instructs the RunTime to back-pressure the platform. - - Does not accept a `msg` argument as we do not expect the platform to fail. - """ +# A Model represents the state of your application. It is a type variable and the RunTime is +# generic over it. Your state can be anything from a simple integer to a large class with many +# fields. +Model = TypeVar("Model") - pass +# A Msg represents the commands an application can issue to itself. These commands update the state +# and optionally issue commands to the RunTime. +Msg = TypeVar("Msg") @dataclass(frozen=True) class Commit(Generic[Msg]): - """Instructs the RunTime to commit the platform. + """Instructs the RunTime to commit the message offsets. Because the RunTime is based on a Kafka platform there is an expectation that the application should trigger commits when necessary. While the application can't be sure its running Kafka @@ -109,8 +107,10 @@ class Task(Generic[Msg]): msg: Msg -# A "Cmd" is the union of all the commands an application can issue to the RunTime. 
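The Cmd and Sub unions documented in this patch follow an Elm-style contract: the application returns commands from its update function and the RunTime executes them against the platform. A minimal sketch of the application side, assuming the platform.py API as of this commit and using made-up string messages ("append", "maybe-flush", "flushed") and an empty offsets mapping as placeholders:

    from sentry.replays.consumers.buffered.platform import Commit, Nothing, Task


    def init(_flags):
        # Start with an empty model and issue no command.
        return ([], Nothing())


    def update(model, msg):
        match msg:
            case "append":
                model.append(1)
                # Schedule a follow-up message for ourselves.
                return (model, Task(msg="maybe-flush"))
            case "maybe-flush" if len(model) >= 10:
                # Ask the platform to commit offsets; "flushed" is delivered back to update.
                return (model, Commit(msg="flushed", offsets={}))
        return (model, None)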
-Cmd = BackPressure | Commit[Msg] | Nothing | Task[Msg]
+# A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime
+# accepts these commands and handles them in some pre-defined way. Commands are fixed and cannot
+# be registered on a per-application basis.
+Cmd = Commit[Msg] | Nothing | Task[Msg]


 @dataclass(frozen=True)
@@ -138,11 +138,22 @@ class Poll(Generic[Msg]):
     name = "poll"


-# A "Sub" is the union of all the events an application can subscribe to.
+# A "Sub" is the union of all the commands the Platform can issue to an application. The Platform
+# will occasionally emit actions which are intercepted by the RunTime. The RunTime translates
+# these actions into a set of predefined subscriptions. These subscriptions are exposed to the
+# application and the developer is free to handle them in the way they see fit.
 Sub = Join[Msg] | Poll[Msg]


 class RunTime(Generic[Model, Msg]):
+    """RunTime object.
+
+    The RunTime is an intermediate data structure which manages communication between the platform
+    and the application. It formalizes state transformations and abstracts the logic of the
+    platform. Commands are declaratively issued rather than defined within the logic of the
+    application. Commands can be issued bidirectionally with "Cmd" types flowing from the
+    application to the platform and "Sub" types flowing from the platform to the application.
+    """

     def __init__(
         self,
@@ -171,6 +182,8 @@ def model(self) -> Model:
         return self.__model

     def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None:
+        # NOTE: Could this be a factory function that produces RunTimes instead? That way we
+        # don't need the assert checks on model and commit.
         self.__commit = commit

         model, cmd = self.init(flags)
@@ -212,9 +225,6 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None:
             return None

         match cmd:
-            case BackPressure():
-                # TODO
-                return None
             case Commit(msg=msg, offsets=offsets):
                 self.commit(offsets)
                 return self.__handle_msg(msg)

From 7ca74623708238103c9954c1377038294690635e Mon Sep 17 00:00:00 2001
From: Colton Allen
Date: Thu, 27 Feb 2025 15:03:37 -0600
Subject: [PATCH 31/49] More docstring fixes

---
 src/sentry/replays/consumers/buffered/consumer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py
index 2220960d4813d2..a0c03264f36692 100644
--- a/src/sentry/replays/consumers/buffered/consumer.py
+++ b/src/sentry/replays/consumers/buffered/consumer.py
@@ -1,3 +1,5 @@
+"""Session Replay recording consumer implementation."""
+
 import contextlib
 import time
 from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
@@ -61,8 +63,6 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N
         return None

     with ThreadPoolExecutor(max_workers=max_workers) as pool:
-        # We apply whatever function is defined on the class to each message in the list. This
-        # is useful for testing reasons (dependency injection).
         futures = [pool.submit(flush_message, message) for message in model.buffer]

         # Tasks can fail. We check the done set for any failures.
We will wait for all the @@ -85,7 +85,6 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N @sentry_sdk.trace def flush_message(message: ProcessedRecordingMessage) -> None: - """Message flushing function.""" with contextlib.suppress(DropSilently): commit_recording_message(message) From fb5731e143827bab2ad9e2667cbfe7fb774648a7 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 15:40:41 -0600 Subject: [PATCH 32/49] Add typing to flags and factory module --- .../replays/consumers/buffered/consumer.py | 18 +++++++---- .../replays/consumers/buffered/factory.py | 26 +++++++++++++++ src/sentry/replays/consumers/buffered/lib.py | 9 +++--- .../replays/consumers/buffered/platform.py | 32 +++++-------------- 4 files changed, 51 insertions(+), 34 deletions(-) create mode 100644 src/sentry/replays/consumers/buffered/factory.py diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index a0c03264f36692..f554538fa189fc 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -3,6 +3,7 @@ import contextlib import time from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait +from typing import TypedDict import sentry_sdk @@ -18,6 +19,12 @@ ) +class Flags(TypedDict): + max_buffer_length: int + max_buffer_wait: int + max_workers: int + + class BufferManager: """Buffer manager. @@ -35,11 +42,10 @@ class BufferManager: if you wanted to expose the state across more locations in the application. """ - def __init__(self, flags: dict[str, str]) -> None: - # Flags are safely extracted and default arguments are used. - self.__max_buffer_length = int(flags.get("max_buffer_length", 8)) - self.__max_buffer_wait = int(flags.get("max_buffer_wait", 1)) - self.__max_workers = int(flags.get("max_workers", 8)) + def __init__(self, flags: Flags) -> None: + self.__max_buffer_length = flags["max_buffer_length"] + self.__max_buffer_wait = flags["max_buffer_wait"] + self.__max_workers = flags["max_workers"] self.__last_flushed_at = time.time() @@ -102,7 +108,7 @@ def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: return None -def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: +def init(flags: Flags) -> Model[ProcessedRecordingMessage]: """Return the initial state of the application.""" buffer = BufferManager(flags) return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py new file mode 100644 index 00000000000000..036ee4868342ad --- /dev/null +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -0,0 +1,26 @@ +from collections.abc import Mapping + +from arroyo.backends.kafka.consumer import KafkaPayload +from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory +from arroyo.types import Commit as ArroyoCommit +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.consumer import Flags, recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategy + + +class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): + + def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: int) -> None: + self.flags: Flags = { + "max_buffer_length": max_buffer_length, + "max_buffer_wait": max_buffer_wait, + "max_workers": max_workers, + } + + def 
create_with_partitions( + self, + commit: ArroyoCommit, + partitions: Mapping[Partition, int], + ) -> ProcessingStrategy[KafkaPayload]: + return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_runtime) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 2b69094f109aa9..9c464415465ad1 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -14,6 +14,7 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, Commit, + Flags, Join, Nothing, Poll, @@ -78,8 +79,8 @@ def process( def init( - init_fn: Callable[[dict[str, str]], Model[Item]], - flags: dict[str, str], + init_fn: Callable[[Flags], Model[Item]], + flags: Flags, ) -> tuple[Model[Item], Cmd[Msg[Item]]]: return (init_fn(flags), Nothing()) @@ -119,9 +120,9 @@ def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: def buffering_runtime( - init_fn: Callable[[dict[str, str]], Model[Item]], + init_fn: Callable[[Flags], Model[Item]], process_fn: Callable[[bytes], Item | None], -) -> RunTime[Model[Item], Msg[Item]]: +) -> RunTime[Model[Item], Msg[Item], Flags]: return RunTime( init=partial(init, init_fn), process=partial(process, process_fn), diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index fda10dfa8579f9..f1819d6ee7fe7c 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -4,36 +4,18 @@ from typing import Generic, TypeVar from arroyo.backends.kafka.consumer import KafkaPayload -from arroyo.processing.strategies import ( - CommitOffsets, - ProcessingStrategy, - ProcessingStrategyFactory, -) +from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy from arroyo.types import Commit as ArroyoCommit from arroyo.types import Message, Partition, Value -class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): - - def __init__(self, flags: dict[str, str], runtime: "RunTime[Model, Msg]") -> None: - self.flags = flags - self.runtime = runtime - - def create_with_partitions( - self, - commit: ArroyoCommit, - partitions: Mapping[Partition, int], - ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=self.runtime) - - class PlatformStrategy(ProcessingStrategy[KafkaPayload]): def __init__( self, commit: ArroyoCommit, - flags: dict[str, str], - runtime: "RunTime[Model, Msg]", + flags: "Flags", + runtime: "RunTime[Model, Msg, Flags]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. @@ -80,6 +62,8 @@ def join(self, timeout: float | None = None) -> None: # and optionally issue commands to the RunTime. Msg = TypeVar("Msg") +Flags = TypeVar("Flags") + @dataclass(frozen=True) class Commit(Generic[Msg]): @@ -145,7 +129,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg]): +class RunTime(Generic[Model, Msg, Flags]): """RunTime object. 
The RunTime is an intermediate data structure which manages communication between the platform @@ -157,7 +141,7 @@ class RunTime(Generic[Model, Msg]): def __init__( self, - init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], + init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], @@ -181,7 +165,7 @@ def model(self) -> Model: assert self.__model is not None return self.__model - def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: + def setup(self, flags: Flags, commit: CommitProtocol) -> None: # NOTE: Could this be a factory function that produces RunTimes instead? That way we # don't need the assert checks on model and commit. self.__commit = commit From 17eb0118857bf955ecb090da924e8fe7e8a19041 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 19:55:13 -0600 Subject: [PATCH 33/49] Adopt buffered strategy in callsite --- bin/mock-replay-recording | 75 ++++++++++++++++++++++++++++++++ src/sentry/consumers/__init__.py | 25 +++-------- 2 files changed, 81 insertions(+), 19 deletions(-) create mode 100755 bin/mock-replay-recording diff --git a/bin/mock-replay-recording b/bin/mock-replay-recording new file mode 100755 index 00000000000000..dd4fd11c9d577d --- /dev/null +++ b/bin/mock-replay-recording @@ -0,0 +1,75 @@ +#!/usr/bin/env python +""". + +Helpful commands: + + - Check if offsets are committed correctly. + - `docker exec -it kafka-kafka-1 kafka-get-offsets --bootstrap-server localhost:9092 --topic ingest-replay-recordings` + - Returns in the form of :: +""" +from sentry.runner import configure + +configure() +import logging +import os +import time +import uuid + +import click +from arroyo import Topic as ArroyoTopic +from arroyo.backends.kafka import KafkaPayload, KafkaProducer, build_kafka_configuration +from sentry_kafka_schemas.codecs import Codec +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sentry.conf.server") + +import django + +django.setup() + +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition + +logger = logging.getLogger(__name__) + + +def get_producer() -> KafkaProducer: + cluster_name = get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["cluster"] + producer_config = get_kafka_producer_cluster_options(cluster_name) + return KafkaProducer(build_kafka_configuration(default_config=producer_config)) + + +RECORDING_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) + + +@click.command() +@click.option("--organization-id", type=int, required=True, help="Organization ID") +@click.option("--project-id", type=int, required=True, help="Project ID") +def main(organization_id: int, project_id: int) -> None: + """Produce a mock uptime result message to the INGEST_REPLAYS_RECORDINGS topic.""" + message: ReplayRecording = { + "key_id": None, + "org_id": organization_id, + "payload": b'{"segment_id":10}\n[]', + "project_id": project_id, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + + producer = get_producer() + topic = 
get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["real_topic_name"] + payload = KafkaPayload(None, RECORDING_CODEC.encode(message), []) + + producer.produce(ArroyoTopic(topic), payload) + producer.close() + + logger.info("Successfully produced message to %s", topic) + + +if __name__ == "__main__": + main() diff --git a/src/sentry/consumers/__init__.py b/src/sentry/consumers/__init__.py index f3e5aa93484434..785a2c452877e5 100644 --- a/src/sentry/consumers/__init__.py +++ b/src/sentry/consumers/__init__.py @@ -84,24 +84,11 @@ def ingest_replay_recordings_options() -> list[click.Option]: def ingest_replay_recordings_buffered_options() -> list[click.Option]: """Return a list of ingest-replay-recordings-buffered options.""" - options = [ - click.Option( - ["--max-buffer-message-count", "max_buffer_message_count"], - type=int, - default=100, - ), - click.Option( - ["--max-buffer-size-in-bytes", "max_buffer_size_in_bytes"], - type=int, - default=2_500_000, - ), - click.Option( - ["--max-buffer-time-in-seconds", "max_buffer_time_in_seconds"], - type=int, - default=1, - ), + return [ + click.Option(["--max-buffer-length", "max_buffer_length"], type=int, default=8), + click.Option(["--max-buffer-wait", "max_buffer_wait"], type=int, default=1), + click.Option(["--max-workers", "max_workers"], type=int, default=8), ] - return options def ingest_monitors_options() -> list[click.Option]: @@ -269,8 +256,8 @@ def ingest_transactions_options() -> list[click.Option]: }, "ingest-replay-recordings": { "topic": Topic.INGEST_REPLAYS_RECORDINGS, - "strategy_factory": "sentry.replays.consumers.recording.ProcessReplayRecordingStrategyFactory", - "click_options": ingest_replay_recordings_options(), + "strategy_factory": "sentry.replays.consumers.buffered.factory.PlatformStrategyFactory", + "click_options": ingest_replay_recordings_buffered_options(), }, "ingest-replay-recordings-buffered": { "topic": Topic.INGEST_REPLAYS_RECORDINGS, From 0c79eac10326f8ca7651316b9cbc191ec7b1e6b2 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:29:14 -0600 Subject: [PATCH 34/49] Add script for mocking recordings --- bin/mock-replay-recording | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/mock-replay-recording b/bin/mock-replay-recording index dd4fd11c9d577d..2f2d27109ace96 100755 --- a/bin/mock-replay-recording +++ b/bin/mock-replay-recording @@ -3,9 +3,10 @@ Helpful commands: + - Run the consumer. + - `sentry run consumer ingest-replay-recordings --consumer-group 0` - Check if offsets are committed correctly. 
- - `docker exec -it kafka-kafka-1 kafka-get-offsets --bootstrap-server localhost:9092 --topic ingest-replay-recordings` - - Returns in the form of :: + - `docker exec -it kafka-kafka-1 kafka-consumer-groups --bootstrap-server localhost:9092 --describe --group 0` """ from sentry.runner import configure @@ -50,7 +51,7 @@ def main(organization_id: int, project_id: int) -> None: message: ReplayRecording = { "key_id": None, "org_id": organization_id, - "payload": b'{"segment_id":10}\n[]', + "payload": b'{"segment_id"', "project_id": project_id, "received": int(time.time()), "replay_event": None, From 30ed5d645bcb5d6f72b11090d86f26b863daedee Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:30:09 -0600 Subject: [PATCH 35/49] Add handling for appending offsets when the message is not buffered --- src/sentry/replays/consumers/buffered/lib.py | 37 ++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 9c464415465ad1..fa6d7f4b0dded8 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -43,6 +43,11 @@ class Append(Generic[Item]): offset: MutableMapping[Partition, int] +@dataclass(frozen=True) +class AppendOffset: + offset: MutableMapping[Partition, int] + + class Committed: """The platform committed offsets. Our buffer is now completely done.""" @@ -61,8 +66,12 @@ class Polled: pass +class TryFlush: + pass + + # A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append[Item] | Committed | Flush | Polled +Msg = Append[Item] | AppendOffset | Committed | Flush | Polled | TryFlush def process( @@ -70,12 +79,12 @@ def process( model: Model[Item], message: bytes, offset: MutableMapping[Partition, int], -) -> Msg[Item] | None: +) -> Msg[Item]: item = process_fn(message) if item: return Append(item=item, offset=offset) else: - return None + return AppendOffset(offset=offset) def init( @@ -90,26 +99,26 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite case Append(item=item, offset=offset): model.buffer.append(item) model.offsets.update(offset) - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) + return (model, Task(msg=TryFlush())) + case AppendOffset(offset=offset): + model.offsets.update(offset) + return (model, Task(msg=TryFlush())) + case Committed(): + return (model, None) case Flush(): - # What should happen if we fail? If you raise an exception the platform will restart - # from the last checkpoint -- which is standard behavior. We could be more clever here - # and provide error handling facilities or we could accept that this problem gets too - # complicated to reasonably abstract and have developers implement their own buffering - # consumer. 
model.do_flush(model) model.buffer = [] return (model, Commit(msg=Committed(), offsets=model.offsets)) - case Committed(): - return (model, None) case Polled(): if model.can_flush(model): return (model, Task(msg=Flush())) else: return (model, None) + case TryFlush(): + if model.can_flush(model): + return (model, Task(msg=Flush())) + else: + return (model, None) def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: From 52d8616519c380d3f42e161a203adf1e2f2a214f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:45:29 -0600 Subject: [PATCH 36/49] Add commit coverage --- .../replays/unit/consumers/test_recording.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 162dfdf065d633..09b49bda310df9 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -3,10 +3,18 @@ import zlib import msgpack +from arroyo.types import Partition +from arroyo.types import Topic as ArroyoTopic +from sentry_kafka_schemas.codecs import Codec +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry.replays.consumers.buffered.consumer import process_message +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.replays.consumers.buffered.consumer import process_message, recording_runtime from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta +from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message + +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) def test_process_message(): @@ -51,3 +59,33 @@ def test_process_message(): def test_process_message_invalid(): result = process_message(msgpack.packb(b"hello, world!")) assert result is None + + +def test_commit_invalid_message(): + """Assert invalid messages have their offsets staged for commit.""" + mock_commit = MockCommit() + + recording_runtime.setup( + {"max_buffer_length": 100, "max_buffer_wait": 100, "max_workers": 1}, mock_commit + ) + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b"invalid", + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + + message = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message) + + recording_runtime.submit(kafka_message) + assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} + assert mock_commit.commit == {} From b3344007c66dd3e809d364fc4d3f7ea5e39c5633 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:49:33 -0600 Subject: [PATCH 37/49] Assert messages are committed regardless of if they're appended to the buffer --- .../sentry/replays/unit/consumers/test_buffering_runtime.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 4f68108fc9e332..ce65f9ed076318 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -178,7 +178,9 @@ def 
test_buffering_runtime_committed_offsets(): # Assert committed offsets are correct. runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1", topic="b")) - runtime.submit(make_kafka_message(b"1", offset=2)) + runtime.submit(make_kafka_message(b"10", topic="b")) + runtime.submit(make_kafka_message(b"100", offset=2)) + assert runtime.model.buffer == [1] + assert runtime.model.offsets == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} runtime.publish("join") assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} From 8618e296c9ff7eff5abe92b5ae7654e31f1fb310 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 10:22:50 -0600 Subject: [PATCH 38/49] Fix typing --- tests/sentry/replays/consumers/test_recording.py | 8 ++------ tests/sentry/replays/unit/consumers/test_recording.py | 9 ++++----- tests/sentry/replays/unit/consumers/test_runtime.py | 8 ++++---- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 55ddb7d1cc79c1..627de49ac7046d 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -13,8 +13,7 @@ from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.models.organizationonboardingtask import OnboardingTask, OnboardingTaskStatus -from sentry.replays.consumers.buffered.consumer import recording_runtime -from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory +from sentry.replays.consumers.buffered.factory import PlatformStrategyFactory from sentry.replays.consumers.recording import ProcessReplayRecordingStrategyFactory from sentry.replays.consumers.recording_buffered import ( RecordingBufferedStrategyFactory, @@ -587,7 +586,4 @@ def test_invalid_message(self): class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): - return PlatformStrategyFactory( - flags={"max_buffer_length": "8", "max_buffer_wait": "1", "max_workers": "8"}, - runtime=recording_runtime, - ) + return PlatformStrategyFactory(max_buffer_length=8, max_buffer_wait=1, max_workers=8) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 09b49bda310df9..fbbb9f50839a2c 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -5,7 +5,6 @@ import msgpack from arroyo.types import Partition from arroyo.types import Topic as ArroyoTopic -from sentry_kafka_schemas.codecs import Codec from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec @@ -14,7 +13,7 @@ from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) +RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) def test_process_message(): @@ -72,7 +71,7 @@ def test_commit_invalid_message(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"invalid", + "payload": b"invalid", # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -83,8 +82,8 @@ def test_commit_invalid_message(): "version": 1, } 
- message = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message) + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) recording_runtime.submit(kafka_message) assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index 179d5cf5760471..c8ecfdbefabb64 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -2,7 +2,7 @@ from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message -def counter_runtime() -> RunTime[int, str]: +def counter_runtime() -> RunTime[int, str, None]: def init(_): return (22, None) @@ -42,14 +42,14 @@ def subscription(_): def test_runtime_setup(): runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 def test_runtime_submit(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 # Two incr commands increase the count by 2. @@ -72,7 +72,7 @@ def test_runtime_submit(): def test_runtime_publish(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 # A join event updates the model and sets it to -10. From 0346a3c96b57d0c06dbaea582bf5280ea484b72e Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:11:57 -0600 Subject: [PATCH 39/49] Docstrings --- .../replays/consumers/buffered/consumer.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index f554538fa189fc..040298ee917cba 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,4 +1,15 @@ -"""Session Replay recording consumer implementation.""" +"""Session Replay recording consumer implementation. + +This module has two parts. A processing component and a buffer flushing component. The processing +component is straight-forward. It accepts a message and performs some work on it. After it +completes it instructs the runtime to append the message to the buffer. This is abstracted by the +buffering runtime library so we just return the transformed data in this module. + +The second part is the flushing of the buffer. The buffering runtime library has no idea when to +flush this buffer so it constantly asks us if it can flush. We control flushing behavior through a +stateful "BufferManager" class. If we can_flush then we do_flush. After the flush completes the +RunTime will commit the offsets. +""" import contextlib import time @@ -28,18 +39,9 @@ class Flags(TypedDict): class BufferManager: """Buffer manager. - Determines if and when a buffer should flush. The buffer accepts only one argument (`flags`) - which can contain any number of configuration options. Currently we only care about the time - the buffer has been active and the length of the buffer. But we could update this function to - extract an option thats concerned with the bytesize of the buffer if we thought that was a - useful metric for committing. 
- The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its methods as callbacks to the Model. The state contained within the method's instance is implicit - and unknown to the RunTime. We could model this state inside the RunTime but the state is - simple enough that I don't feel the need to over-engineer the buffering RunTime. For more - complex use-cases you would want to formalize state transformations in the RunTime. Especially - if you wanted to expose the state across more locations in the application. + and unknown to the RunTime. """ def __init__(self, flags: Flags) -> None: @@ -50,6 +52,8 @@ def __init__(self, flags: Flags) -> None: self.__last_flushed_at = time.time() def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: + # TODO: time.time is stateful and hard to test. We should enable the RunTime to perform + # managed effects so we can properly test this behavior. return ( len(model.buffer) >= self.__max_buffer_length or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at @@ -58,8 +62,7 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: with sentry_tracing("replays.consumers.buffered.flush_buffer"): flush_buffer(model, max_workers=self.__max_workers) - # Update the buffer manager with the new time so we don't continuously commit in a - # loop! + # TODO: time.time again. Should be declarative for testing purposes. self.__last_flushed_at = time.time() @@ -78,6 +81,8 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N for future in done: exc = future.exception() if exc is not None: + # TODO: Why raise? Can I do something more meaningful here than reject the whole + # batch? Raising is certainly the easiest way of handling failures... raise exc # Recording metadata is not tracked in the threadpool. This is because this function will From 4ab0ff25dc11201d828e196edee24d92bb1cbb26 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:16:47 -0600 Subject: [PATCH 40/49] More docstrings --- src/sentry/replays/consumers/buffered/consumer.py | 3 +++ src/sentry/replays/consumers/buffered/lib.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 040298ee917cba..35a1b450ef90e4 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,5 +1,8 @@ """Session Replay recording consumer implementation. +To understand how the buffering works visit the `lib.py` module and inspect the source of the +buffering runtime. + This module has two parts. A processing component and a buffer flushing component. The processing component is straight-forward. It accepts a message and performs some work on it. After it completes it instructs the runtime to append the message to the buffer. 
This is abstracted by the diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index fa6d7f4b0dded8..9945c1253407ed 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -16,7 +16,6 @@ Commit, Flags, Join, - Nothing, Poll, RunTime, Sub, @@ -45,6 +44,8 @@ class Append(Generic[Item]): @dataclass(frozen=True) class AppendOffset: + """Update the offsets; no item needs to be appended to the buffer.""" + offset: MutableMapping[Partition, int] @@ -67,6 +68,8 @@ class Polled: class TryFlush: + """Instruct the application to flush the buffer if its time.""" + pass @@ -91,7 +94,7 @@ def init( init_fn: Callable[[Flags], Model[Item]], flags: Flags, ) -> tuple[Model[Item], Cmd[Msg[Item]]]: - return (init_fn(flags), Nothing()) + return (init_fn(flags), None) def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: From 77c2971fd6f57ee6899db2425fc7bc26e676809b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:19:16 -0600 Subject: [PATCH 41/49] Yet more docstrings --- src/sentry/replays/consumers/buffered/factory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 036ee4868342ad..2d90798f1ee210 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -1,3 +1,9 @@ +"""Session Replay recording consumer strategy factory. + +This module exists solely to abstract the bootstraping process of the application and runtime in +`sentry/consumers/__init__.py`. +""" + from collections.abc import Mapping from arroyo.backends.kafka.consumer import KafkaPayload From b956ad9f42f1378ad54bdcd417f2c3e30f99e3c6 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:06:38 -0600 Subject: [PATCH 42/49] Implement declarative effect management --- .../replays/consumers/buffered/consumer.py | 241 ++++++++----- .../replays/consumers/buffered/factory.py | 4 +- src/sentry/replays/consumers/buffered/lib.py | 143 -------- .../replays/consumers/buffered/platform.py | 102 ++++-- .../unit/consumers/test_buffering_runtime.py | 186 ---------- .../replays/unit/consumers/test_helpers.py | 22 +- .../replays/unit/consumers/test_recording.py | 336 +++++++++++++++--- 7 files changed, 528 insertions(+), 506 deletions(-) delete mode 100644 src/sentry/replays/consumers/buffered/lib.py delete mode 100644 tests/sentry/replays/unit/consumers/test_buffering_runtime.py diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 35a1b450ef90e4..fe5448563e6155 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,37 +1,37 @@ -"""Session Replay recording consumer implementation. - -To understand how the buffering works visit the `lib.py` module and inspect the source of the -buffering runtime. - -This module has two parts. A processing component and a buffer flushing component. The processing -component is straight-forward. It accepts a message and performs some work on it. After it -completes it instructs the runtime to append the message to the buffer. This is abstracted by the -buffering runtime library so we just return the transformed data in this module. - -The second part is the flushing of the buffer. 
The buffering runtime library has no idea when to -flush this buffer so it constantly asks us if it can flush. We control flushing behavior through a -stateful "BufferManager" class. If we can_flush then we do_flush. After the flush completes the -RunTime will commit the offsets. -""" +"""Session Replay recording consumer implementation.""" import contextlib import time +from collections.abc import MutableMapping from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait +from dataclasses import dataclass from typing import TypedDict import sentry_sdk - -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.platform import ( + Cmd, + Commit, + Effect, + Join, + Nothing, + Poll, + RunTime, + Sub, + Task, +) from sentry.replays.usecases.ingest import ( DropSilently, ProcessedRecordingMessage, commit_recording_message, parse_recording_message, process_recording_message, - sentry_tracing, track_recording_metadata, ) +# Types. + class Flags(TypedDict): max_buffer_length: int @@ -39,90 +39,163 @@ class Flags(TypedDict): max_workers: int -class BufferManager: - """Buffer manager. +@dataclass +class Model: + buffer: list[ProcessedRecordingMessage] + last_flushed_at: float + max_buffer_length: int + max_buffer_wait: int + max_workers: int + offsets: MutableMapping[Partition, int] + + +@dataclass(frozen=True) +class Append: + """Append the item to the buffer and update the offsets.""" - The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its - methods as callbacks to the Model. The state contained within the method's instance is implicit - and unknown to the RunTime. - """ + item: ProcessedRecordingMessage + offset: MutableMapping[Partition, int] - def __init__(self, flags: Flags) -> None: - self.__max_buffer_length = flags["max_buffer_length"] - self.__max_buffer_wait = flags["max_buffer_wait"] - self.__max_workers = flags["max_workers"] - self.__last_flushed_at = time.time() +@dataclass(frozen=True) +class AppendOffset: + """Update the offsets; no item needs to be appended to the buffer.""" - def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: - # TODO: time.time is stateful and hard to test. We should enable the RunTime to perform - # managed effects so we can properly test this behavior. - return ( - len(model.buffer) >= self.__max_buffer_length - or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at - ) + offset: MutableMapping[Partition, int] - def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - with sentry_tracing("replays.consumers.buffered.flush_buffer"): - flush_buffer(model, max_workers=self.__max_workers) - # TODO: time.time again. Should be declarative for testing purposes. - self.__last_flushed_at = time.time() +class Committed: + """The platform committed offsets. Our buffer is now completely done.""" -@sentry_sdk.trace -def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: - if len(model.buffer) == 0: - return None - with ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = [pool.submit(flush_message, message) for message in model.buffer] +class Flush: + """Our application hit the flush threshold and has been instructed to flush.""" - # Tasks can fail. We check the done set for any failures. We will wait for all the - # futures to complete before running this step or eagerly run this step if any task - # errors. 
- done, _ = wait(futures, return_when=FIRST_EXCEPTION) - for future in done: - exc = future.exception() - if exc is not None: - # TODO: Why raise? Can I do something more meaningful here than reject the whole - # batch? Raising is certainly the easiest way of handling failures... - raise exc - # Recording metadata is not tracked in the threadpool. This is because this function will - # log. Logging will acquire a lock and make our threading less useful due to the speed of - # the I/O we do in this step. - for message in model.buffer: - track_recording_metadata(message) +@dataclass(frozen=True) +class Flushed: + """Our application successfully flushed.""" - return None + now: float -@sentry_sdk.trace -def flush_message(message: ProcessedRecordingMessage) -> None: - with contextlib.suppress(DropSilently): - commit_recording_message(message) +@dataclass(frozen=True) +class TryFlush: + """Instruct the application to flush the buffer if its time.""" + + now: float -def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: - """Message processing function. +# A "Msg" is the union of all application messages our RunTime will accept. +Msg = Append | AppendOffset | Committed | Flush | Flushed | TryFlush - Accepts an unstructured type and returns a structured one. Other than tracing the goal is to - have no I/O here. We'll commit the I/O on flush. - """ - with sentry_tracing("replays.consumers.buffered.process_message"): - with contextlib.suppress(DropSilently): - message = parse_recording_message(message_bytes) - return process_recording_message(message) - return None +# State machine functions. -def init(flags: Flags) -> Model[ProcessedRecordingMessage]: - """Return the initial state of the application.""" - buffer = BufferManager(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) +def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: + return ( + Model( + buffer=[], + last_flushed_at=time.time(), + max_buffer_wait=flags["max_buffer_wait"], + max_workers=flags["max_workers"], + max_buffer_length=flags["max_buffer_length"], + offsets={}, + ), + None, + ) -recording_runtime = buffering_runtime( - init_fn=init, - process_fn=process_message, + +@sentry_sdk.trace +def process(_: Model, message: bytes, offset: MutableMapping[Partition, int]) -> Msg | None: + try: + item = process_recording_message(parse_recording_message(message)) + return Append(item=item, offset=offset) + except Exception: + return AppendOffset(offset=offset) + + +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg] | None]: + match msg: + case Append(item=item, offset=offset): + model.buffer.append(item) + model.offsets.update(offset) + return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) + case AppendOffset(offset=offset): + model.offsets.update(offset) + return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) + case Committed(): + return (model, None) + case Flush(): + return (model, Effect(fun=FlushBuffer(model), msg=lambda now: Flushed(now=now))) + case Flushed(now=now): + model.buffer = [] + model.last_flushed_at = now + return (model, Commit(msg=Committed(), offsets=model.offsets)) + case TryFlush(now=now): + return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) + + +def subscription(model: Model) -> list[Sub[Msg]]: + return [ + Join(msg=Flush), + Poll(msg=lambda: TryFlush(now=time.time())), + ] + + +# Helpers. 
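The `update` function above returns `Effect(fun=time.time, msg=lambda now: TryFlush(now=now))` instead of calling `time.time()` directly. Describing the clock as data lets the runtime decide when, or whether, the effect runs, which is what makes the flush timing testable. A minimal, self-contained sketch of that idea follows; the `Effect` and `TryFlush` shapes mirror the definitions in this patch, everything else is illustrative:

import time
from dataclasses import dataclass
from typing import Any, Callable


@dataclass(frozen=True)
class Effect:
    fun: Callable[[], Any]     # the side effect to run, e.g. time.time
    msg: Callable[[Any], Any]  # wraps the effect's result in an application message


@dataclass(frozen=True)
class TryFlush:
    now: float


def interpret(cmd: Effect) -> Any:
    # A production interpreter actually runs the effect and feeds the result back in.
    return cmd.msg(cmd.fun())


cmd = Effect(fun=time.time, msg=lambda now: TryFlush(now=now))
assert isinstance(interpret(cmd), TryFlush)

# A test never has to run `fun`; it can feed any timestamp it likes through `msg`.
assert cmd.msg(123.0) == TryFlush(now=123.0)

The unit tests later in this series lean on exactly this: they send hand-picked `TryFlush(now=...)` values into the runtime instead of waiting on a real clock.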
+ + +def can_flush(model: Model, now: float) -> bool: + return ( + len(model.buffer) >= model.max_buffer_length + or (now - model.max_buffer_wait) >= model.last_flushed_at + ) + + +@dataclass(frozen=True) +class FlushBuffer: + model: Model + + def __call__(self) -> float: + @sentry_sdk.trace + def flush_message(message: ProcessedRecordingMessage) -> None: + with contextlib.suppress(DropSilently): + commit_recording_message(message) + + if len(self.model.buffer) == 0: + return time.time() + + with ThreadPoolExecutor(max_workers=self.model.max_workers) as pool: + futures = [pool.submit(flush_message, message) for message in self.model.buffer] + + # Tasks can fail. We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + # Raising preserves the existing behavior. We can address error handling in a + # follow up. + if exc is not None and not isinstance(exc, DropSilently): + raise exc + + # Recording metadata is not tracked in the threadpool. This is because this function will + # log. Logging will acquire a lock and make our threading less useful due to the speed of + # the I/O we do in this step. + for message in self.model.buffer: + track_recording_metadata(message) + + return time.time() + + +# Consumer. + + +recording_consumer = RunTime( + init=init, + process=process, + subscription=subscription, + update=update, ) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 2d90798f1ee210..56566d449d2053 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -11,7 +11,7 @@ from arroyo.types import Commit as ArroyoCommit from arroyo.types import Partition -from sentry.replays.consumers.buffered.consumer import Flags, recording_runtime +from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer from sentry.replays.consumers.buffered.platform import PlatformStrategy @@ -29,4 +29,4 @@ def create_with_partitions( commit: ArroyoCommit, partitions: Mapping[Partition, int], ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_runtime) + return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_consumer) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py deleted file mode 100644 index 9945c1253407ed..00000000000000 --- a/src/sentry/replays/consumers/buffered/lib.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Buffered RunTime implementation. - -The Buffering RunTime eagerly processes messages as they are received and waits until its buffer -is full before flushing those messages. The goal being to achieve efficiencies from batched I/O. 
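`FlushBuffer` above fans the buffered uploads out over a bounded thread pool and surfaces the first failure. A stripped-down sketch of that pattern, with a placeholder `upload` standing in for the real per-message I/O (illustrative only, not the patch's implementation):

from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait


def upload(item: bytes) -> None:
    """Placeholder for the real per-message commit I/O."""


def flush_all(items: list[bytes], max_workers: int) -> None:
    if not items:
        return

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(upload, item) for item in items]

        # Return as soon as every task finishes or any task raises, then re-raise the
        # first observed error so the caller can decide what to do with the batch.
        done, _ = wait(futures, return_when=FIRST_EXCEPTION)
        for future in done:
            exc = future.exception()
            if exc is not None:
                raise exc


flush_all([b"a", b"b"], max_workers=2)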
-""" - -from collections.abc import Callable, MutableMapping -from dataclasses import dataclass -from functools import partial -from typing import Generic, TypeVar - -from arroyo.types import Partition - -from sentry.replays.consumers.buffered.platform import ( - Cmd, - Commit, - Flags, - Join, - Poll, - RunTime, - Sub, - Task, -) - -Item = TypeVar("Item") -T = TypeVar("T") - - -@dataclass -class Model(Generic[Item]): - buffer: list[Item] - can_flush: Callable[["Model[Item]"], bool] - do_flush: Callable[["Model[Item]"], None] - offsets: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class Append(Generic[Item]): - """Append the item to the buffer and update the offsets.""" - - item: Item - offset: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class AppendOffset: - """Update the offsets; no item needs to be appended to the buffer.""" - - offset: MutableMapping[Partition, int] - - -class Committed: - """The platform committed offsets. Our buffer is now completely done.""" - - pass - - -class Flush: - """Our application hit the flush threshold and has been instructed to flush.""" - - pass - - -class Polled: - """Our application was polled by the platform.""" - - pass - - -class TryFlush: - """Instruct the application to flush the buffer if its time.""" - - pass - - -# A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append[Item] | AppendOffset | Committed | Flush | Polled | TryFlush - - -def process( - process_fn: Callable[[bytes], Item | None], - model: Model[Item], - message: bytes, - offset: MutableMapping[Partition, int], -) -> Msg[Item]: - item = process_fn(message) - if item: - return Append(item=item, offset=offset) - else: - return AppendOffset(offset=offset) - - -def init( - init_fn: Callable[[Flags], Model[Item]], - flags: Flags, -) -> tuple[Model[Item], Cmd[Msg[Item]]]: - return (init_fn(flags), None) - - -def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: - match msg: - case Append(item=item, offset=offset): - model.buffer.append(item) - model.offsets.update(offset) - return (model, Task(msg=TryFlush())) - case AppendOffset(offset=offset): - model.offsets.update(offset) - return (model, Task(msg=TryFlush())) - case Committed(): - return (model, None) - case Flush(): - model.do_flush(model) - model.buffer = [] - return (model, Commit(msg=Committed(), offsets=model.offsets)) - case Polled(): - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) - case TryFlush(): - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) - - -def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: - return [ - Join(msg=Flush()), - Poll(msg=Polled()), - ] - - -def buffering_runtime( - init_fn: Callable[[Flags], Model[Item]], - process_fn: Callable[[bytes], Item | None], -) -> RunTime[Model[Item], Msg[Item], Flags]: - return RunTime( - init=partial(init, init_fn), - process=partial(process, process_fn), - subscription=subscription, - update=update, - ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index f1819d6ee7fe7c..6e92252e5f684a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -1,7 +1,7 @@ -from collections.abc import Callable, Mapping, MutableMapping +from collections.abc import Callable, MutableMapping from dataclasses import dataclass from datetime import datetime -from 
typing import Generic, TypeVar +from typing import Any, Generic, TypeVar, cast from arroyo.backends.kafka.consumer import KafkaPayload from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy @@ -25,7 +25,7 @@ def __init__( self.__closed = False self.__runtime = runtime - def handle_commit_request(self, offsets: Mapping[Partition, int]) -> None: + def handle_commit_request(self, offsets: MutableMapping[Partition, int]) -> None: self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) def submit(self, message: Message[KafkaPayload]) -> None: @@ -51,7 +51,7 @@ def join(self, timeout: float | None = None) -> None: self.__commit_step.join(timeout) -CommitProtocol = Callable[[Mapping[Partition, int]], None] +CommitProtocol = Callable[[MutableMapping[Partition, int]], None] # A Model represents the state of your application. It is a type variable and the RunTime is # generic over it. Your state can be anything from a simple integer to a large class with many @@ -62,8 +62,12 @@ def join(self, timeout: float | None = None) -> None: # and optionally issue commands to the RunTime. Msg = TypeVar("Msg") +# A generic type representing the structure of the flags passed to the RunTime instance. Flags = TypeVar("Flags") +# A generic type of unknown origins not tied to anything in the platform. +T = TypeVar("T") + @dataclass(frozen=True) class Commit(Generic[Msg]): @@ -78,10 +82,22 @@ class Commit(Generic[Msg]): offsets: MutableMapping[Partition, int] -class Nothing: - """Instructs the RunTime to do nothing. Equivalent to a null command.""" +@dataclass(frozen=True) +class Effect(Generic[Msg]): + """Instructs the RunTime to perform a managed effect. - pass + If the RunTime performs an effect for the application it means the RunTime can dictate if the + effect blocks the application, if the effect executes at all, or perform any additional + operations before or after the effect. This has significant implications for RunTime + performance and application testability. + """ + + fun: Callable[[], Any] + msg: Callable[[Any], Msg] + + +class Nothing: + """Instructs the RunTime to do nothing.""" @dataclass(frozen=True) @@ -94,7 +110,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Nothing | Task[Msg] +Cmd = Commit[Msg] | Effect[Msg] | Task[Msg] @dataclass(frozen=True) @@ -106,7 +122,7 @@ class Join(Generic[Msg]): events and handle them in the way they see fit. """ - msg: Msg + msg: Callable[[], Msg] name = "join" @@ -118,7 +134,7 @@ class Poll(Generic[Msg]): these events and choose to act on them. 
""" - msg: Msg + msg: Callable[[], Msg] name = "poll" @@ -142,7 +158,7 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], + process: Callable[[Model, bytes, MutableMapping[Partition, int]], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: @@ -151,41 +167,47 @@ def __init__( self.subscription = subscription self.update = update - self.__commit: CommitProtocol | None = None - self.__model: Model | None = None - self.__subscriptions: dict[str, Sub[Msg]] = {} + self._commit: CommitProtocol | None = None + self._model: Model | None = None + self._subscriptions: dict[str, Sub[Msg]] = {} @property def commit(self) -> CommitProtocol: - assert self.__commit is not None - return self.__commit + assert self._commit is not None + return self._commit @property def model(self) -> Model: - assert self.__model is not None - return self.__model + assert self._model is not None + return self._model + # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need + # the assert checks on model and commit. def setup(self, flags: Flags, commit: CommitProtocol) -> None: - # NOTE: Could this be a factory function that produces RunTimes instead? That way we - # don't need the assert checks on model and commit. - self.__commit = commit + self._commit = commit model, cmd = self.init(flags) - self.__model = model - self.__handle_cmd(cmd) - self.__register_subscriptions() + self._model = model + self._handle_cmd(cmd) + self._register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: - self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) - - def publish(self, sub_name: str) -> None: + self._handle_msg( + self.process( + self.model, + message.payload.value, + cast(MutableMapping[Partition, int], message.committable), + ) + ) + + def publish(self, sub_name: str): # For each new subscription event we re-register the subscribers in case anything within # the application has changed. I.e. the model is in some new state and that means we care # about a new subscription or don't care about an old one. - self.__register_subscriptions() + self._register_subscriptions() # Using the subscription's name look for the subscription in the registry. - sub = self.__subscriptions.get(sub_name) + sub = self._subscriptions.get(sub_name) if sub is None: return None @@ -194,29 +216,31 @@ def publish(self, sub_name: str) -> None: # subscriptions might do more complex things. 
match sub: case Join(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg()) case Poll(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg()) - def __handle_msg(self, msg: Msg | None) -> None: + def _handle_msg(self, msg: Msg | None) -> None: if msg: model, cmd = self.update(self.model, msg) - self.__model = model - self.__handle_cmd(cmd) + self._model = model + self._handle_cmd(cmd) - def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: + def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: if cmd is None: return None match cmd: case Commit(msg=msg, offsets=offsets): self.commit(offsets) - return self.__handle_msg(msg) + return self._handle_msg(msg) + case Effect(msg=msg, fun=fun): + return self._handle_msg(msg(fun())) case Nothing(): return None case Task(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg) - def __register_subscriptions(self) -> None: + def _register_subscriptions(self) -> None: for sub in self.subscription(self.model): - self.__subscriptions[sub.name] = sub + self._subscriptions[sub.name] = sub diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py deleted file mode 100644 index ce65f9ed076318..00000000000000 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ /dev/null @@ -1,186 +0,0 @@ -import pytest -from arroyo.types import Partition, Topic - -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime -from tests.sentry.replays.unit.consumers.test_helpers import ( - MockCommit, - MockSink, - make_kafka_message, -) - - -def buffer_runtime(sink): - return buffering_runtime( - init_fn=lambda _: Model( - # The buffer is an arbitrary list of items but it could be anything. For example a - # string which you could perpetually append to. - buffer=[], - # Commit when the buffer has 5 messages in it. - can_flush=lambda m: len(m.buffer) == 5, - # Flush in this case is just pushing the buffer into a mock class which holds the - # messages in memory. This is easy for us to test and we don't need to mock out any - # service code which is both unknown to the RunTime and can be tested independently. - do_flush=lambda m: sink.accept(m.buffer), - # This is the offsets tracking object. The RunTime manages appending. - offsets={}, - ), - # Our process function is a simple lambda which checks that the message is a valid - # integer and is less than 5. There's no exception handling here. That could crash the - # consumer! Its up to the application developer to manage. - process_fn=lambda m: int(m) if int(m) < 5 else None, - ) - - -def test_runtime_setup(): - """Test RunTime initialization.""" - runtime = buffer_runtime(MockSink()) - runtime.setup({}, commit=MockCommit()) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - - -def test_buffering_runtime_submit(): - """Test message recived from the platform. - - Messages are buffered up until a point (see the RunTime definition at the top of the module). - When we cross the threshold the RunTime should order a buffer flush action. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert three valid messages buffered. 
- runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert two invalid messages not buffered. - runtime.submit(make_kafka_message(b"5")) - runtime.submit(make_kafka_message(b"5")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert an additional message is buffered. - runtime.submit(make_kafka_message(b"2")) - assert runtime.model.buffer == [1, 1, 1, 2] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert the buffer is flushed to the sink. - runtime.submit(make_kafka_message(b"3")) - assert runtime.model.buffer == [] - assert sink.accepted == [1, 1, 1, 2, 3] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_submit_invalid_message(): - """Test invalid message recived from the platform.""" - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert exceptional behavior is not handled. - with pytest.raises(ValueError): - runtime.submit(make_kafka_message(b"hello")) - - -def test_buffering_runtime_join(): - """Test the RunTime received a join message from the platform. - - When the consumer is ordered to shutdown a "join" message will be submitted to the RunTime. - When that happens we rush to flush the buffer and commit whatever we can. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - - # Assert three valid messages buffered. - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - - # Assert join subscriptions eagerly flush the buffer. - runtime.publish("join") - assert runtime.model.buffer == [] - assert sink.accepted == [1, 1, 1] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_poll(): - """Test the RunTime received a poll message from the platform. - - This will happen periodically. In the event the consumer does not receive messages for some - interval the poll message will be published will be called. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert poll does not automatically empty the buffer. - runtime.submit(make_kafka_message(b"1")) - runtime.publish("poll") - assert runtime.model.buffer == [1] - assert sink.accepted == [] - - # Dirty mutation warning! We're forcing ourselves to be in a flushable state by mutating the - # state outside the explicit state machine interfaces. - runtime.model.buffer = [1, 2, 3, 4, 5] - assert runtime.model.buffer == [1, 2, 3, 4, 5] - assert sink.accepted == [] - - # Assert poll flushes if the buffer is ready. 
- runtime.publish("poll") - assert runtime.model.buffer == [] - assert sink.accepted == [1, 2, 3, 4, 5] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_committed_offsets(): - """Test offset commit.""" - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert committed offsets are correct. - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"10", topic="b")) - runtime.submit(make_kafka_message(b"100", offset=2)) - assert runtime.model.buffer == [1] - assert runtime.model.offsets == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} - runtime.publish("join") - assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 3543885e55b07b..79663b114e78ce 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,9 +1,12 @@ -from collections.abc import Mapping +from collections.abc import Mapping, MutableMapping from datetime import datetime +from typing import cast from arroyo.backends.kafka import KafkaPayload from arroyo.types import BrokerValue, Message, Partition, Topic +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, RunTime + class MockCommit: def __init__(self): @@ -13,6 +16,23 @@ def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> Non self.commit.update(offsets) +class MockRunTime(RunTime[Model, Msg, Flags]): + def _handle_msg(self, msg): + while True: + model, cmd = self.update(self.model, msg) + self._model = model + msg = yield cmd + + def submit(self, message): + yield from self._handle_msg( + self.process( + self.model, + message.payload.value, + cast(MutableMapping[Partition, int], message.committable), + ) + ) + + class MockSink: def __init__(self): self.accepted = [] diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index fbbb9f50839a2c..b4ad74b8b2c062 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -2,76 +2,150 @@ import uuid import zlib -import msgpack from arroyo.types import Partition from arroyo.types import Topic as ArroyoTopic from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec -from sentry.replays.consumers.buffered.consumer import process_message, recording_runtime +from sentry.replays.consumers.buffered.consumer import ( + Committed, + Flush, + FlushBuffer, + TryFlush, + init, + process, + subscription, + update, +) +from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import ( + MockCommit, + MockRunTime, + make_kafka_message, +) RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) -def test_process_message(): - result = 
process_message( - msgpack.packb( - { - "type": "replay_recording_not_chunked", - "replay_id": uuid.uuid4().hex, - "org_id": 1, - "key_id": 3, - "project_id": 2, - "received": int(time.time()), - "retention_days": 30, - "payload": b'{"segment_id":0}\n[]', - "replay_event": None, - "replay_video": None, - } +def test_end_to_end_message_processing(): + """End to end test of the recording consumer.""" + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + gen = runtime.submit(kafka_message) + + # Assert the runtime gets the current time after appending the message and then attempts to + # flush the buffer with the current time. + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + assert runtime.model.buffer == [ + ProcessedRecordingMessage( + actions_event=ParsedEventMeta([], [], [], [], [], []), + filedata=zlib.compress(b"[]"), + filename=runtime.model.buffer[0].filename, + is_replay_video=False, + key_id=None, + org_id=1, + project_id=1, + received=message["received"], + recording_size_uncompressed=2, + recording_size=runtime.model.buffer[0].recording_size, + retention_days=30, + replay_id=message["replay_id"], + segment_id=0, + video_size=None, + replay_event=None, ) - ) + ] - expected_event_metadata = ParsedEventMeta([], [], [], [], [], []) - assert result is not None - assert result == ProcessedRecordingMessage( - actions_event=expected_event_metadata, - filedata=zlib.compress(b"[]"), - filename=result.filename, - is_replay_video=False, - key_id=3, - org_id=1, - project_id=2, - received=result.received, - recording_size_uncompressed=2, - recording_size=result.recording_size, - retention_days=30, - replay_id=result.replay_id, - segment_id=0, - video_size=None, - replay_event=None, - ) + # Give the runtime timestamps that are too early to flush (including the one the runtime wanted + # to generate). + assert isinstance(gen.send(TryFlush(now=1)), Nothing) + assert isinstance(gen.send(TryFlush(now=2)), Nothing) + assert isinstance(gen.send(TryFlush(now=3)), Nothing) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + # Give the runtime a timestamp that will trigger a flush. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) -def test_process_message_invalid(): - result = process_message(msgpack.packb(b"hello, world!")) - assert result is None + # Assert the runtime triggered a buffer flush and forward the next msg to the runtime. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + assert len(runtime.model.buffer) == 1 + # Assert the successful flush triggers a commit command. Assert the model's offsets were + # committed and the buffer was reset. 
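These tests drive the state machine through a generator protocol: `MockRunTime.submit` yields every command the application issues, and the test answers with `gen.send(...)`, so no real effect ever executes. A toy sketch of that hand-shake, not the real `MockRunTime`, which wraps the runtime's own `process` and `update` functions:

def drive(update, model, msg):
    """Yield each command `update` produces; the caller sends back the next message."""
    while True:
        model, cmd = update(model, msg)
        msg = yield cmd


def update(model, msg):
    # A toy update: count "tick" messages and ask the driver for the time each time.
    if msg == "tick":
        return model + 1, "what-time-is-it"
    return model, None


gen = drive(update, 0, "tick")
assert next(gen) == "what-time-is-it"          # run until the first command is yielded
assert gen.send("tick") == "what-time-is-it"   # reply with a message, receive the next command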
+ cmd = gen.send(cmd.msg(1)) + assert len(runtime.model.buffer) == 0 + assert runtime.model.last_flushed_at == 1 + assert isinstance(cmd, Commit) + assert isinstance(cmd.msg, Committed) + assert cmd.offsets == runtime.model.offsets + assert cmd.offsets == {Partition(topic=ArroyoTopic(name="a"), index=1): 2} -def test_commit_invalid_message(): - """Assert invalid messages have their offsets staged for commit.""" - mock_commit = MockCommit() - recording_runtime.setup( - {"max_buffer_length": 100, "max_buffer_wait": 100, "max_workers": 1}, mock_commit - ) +def test_invalid_message_format(): + """Test message with invalid message format.""" + runtime = _make_runtime() + + # We submit a message which can't be parsed and will not be buffered. Flush is not triggered. + gen = runtime.submit(make_kafka_message(b"invalid")) + cmd = next(gen) + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. + cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_invalid_recording_json(): + """Test message with invalid recording JSON.""" + runtime = _make_runtime() message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"invalid", # type: ignore[typeddict-item] + "payload": b'{"segment_id":0}\n[', "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -81,10 +155,170 @@ def test_commit_invalid_message(): "type": "replay_recording_not_chunked", "version": 1, } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will not be buffered. Flush is not triggered. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert len(runtime.model.buffer) == 1 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. 
+ cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_missing_headers(): + """Test message with missing headers.""" + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b"[]", + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } message_bytes = RECORDINGS_CODEC.encode(message) kafka_message = make_kafka_message(message_bytes) - recording_runtime.submit(kafka_message) - assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} - assert mock_commit.commit == {} + # We submit a message which will not be buffered. Flush is not triggered. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. + cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_buffer_full_semantics(): + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will be buffered but not flushed. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + + # Assert the TryFlush msg produced by the runtime had no effect because the buffer was not full + # and the wait interval was not exceeded. + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # We submit another message which will be buffered and trigger a flush. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + + # Assert a flush command is triggered from the msg produced by the runtime. 
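Both of these tests hinge on the flush predicate defined earlier in this patch: flush when the buffer reaches `max_buffer_length` or when at least `max_buffer_wait` seconds have passed since the last flush. A small worked example using the flags from `_make_runtime` below (`max_buffer_length=2`, `max_buffer_wait=1`); the standalone `can_flush` here merely restates that predicate for illustration:

def can_flush(
    buffer_len: int,
    now: float,
    last_flushed_at: float,
    max_buffer_length: int = 2,
    max_buffer_wait: int = 1,
) -> bool:
    return buffer_len >= max_buffer_length or (now - max_buffer_wait) >= last_flushed_at


# One buffered message checked immediately: neither condition holds, so no flush.
assert can_flush(buffer_len=1, now=100.0, last_flushed_at=100.0) is False

# Two buffered messages: the length threshold triggers a flush (test_buffer_full_semantics).
assert can_flush(buffer_len=2, now=100.0, last_flushed_at=100.0) is True

# One message but over a second since the last flush: the timeout triggers it (test_buffer_timeout).
assert can_flush(buffer_len=1, now=101.5, last_flushed_at=100.0) is True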
+ cmd = gen.send(cmd.msg(cmd.fun())) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + +def test_buffer_timeout(): + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will be buffered but not flushed. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + + # Assert the TryFlush msg produced by the runtime had no effect because the wait interval was + # not exceeded. + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Now we emit a new TryFlush message with a timestamp in the future. This triggers the runtime + # to flush because its flush interval has been exceeded. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + +def _make_runtime(): + runtime = MockRunTime(init, process, subscription, update) + runtime.setup( + { + "max_buffer_length": 2, + "max_buffer_wait": 1, + "max_workers": 1, + }, + MockCommit(), + ) + return runtime From fa4200412df0995ba90cb85428c12bf0b11c5851 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:34:41 -0600 Subject: [PATCH 43/49] Move offset management into the runtime --- .../replays/consumers/buffered/consumer.py | 42 +++++++------------ .../replays/consumers/buffered/platform.py | 24 ++++------- .../replays/unit/consumers/test_helpers.py | 11 +---- .../replays/unit/consumers/test_recording.py | 16 +------ 4 files changed, 27 insertions(+), 66 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index fe5448563e6155..ed5484c408b9cf 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -2,13 +2,11 @@ import contextlib import time -from collections.abc import MutableMapping from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait from dataclasses import dataclass from typing import TypedDict import sentry_sdk -from arroyo.types import Partition from sentry.replays.consumers.buffered.platform import ( Cmd, @@ -46,22 +44,13 @@ class Model: max_buffer_length: int max_buffer_wait: int max_workers: int - offsets: MutableMapping[Partition, int] @dataclass(frozen=True) class Append: - """Append the item to the buffer and update the offsets.""" + """Append the item to the buffer.""" item: ProcessedRecordingMessage - offset: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class AppendOffset: - """Update the offsets; no item needs to be appended to the buffer.""" - - offset: MutableMapping[Partition, int] class Committed: @@ -79,6 +68,10 @@ class Flushed: now: float +class Skip: + """Skip the message.""" + + @dataclass(frozen=True) class TryFlush: """Instruct the application to flush the buffer if its time.""" @@ -87,13 +80,13 @@ class TryFlush: # A "Msg" is the union of all application messages our RunTime will accept. 
-Msg = Append | AppendOffset | Committed | Flush | Flushed | TryFlush +Msg = Append | Committed | Flush | Flushed | TryFlush # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg]]: return ( Model( buffer=[], @@ -101,38 +94,35 @@ def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: max_buffer_wait=flags["max_buffer_wait"], max_workers=flags["max_workers"], max_buffer_length=flags["max_buffer_length"], - offsets={}, ), - None, + Nothing(), ) @sentry_sdk.trace -def process(_: Model, message: bytes, offset: MutableMapping[Partition, int]) -> Msg | None: +def process(_: Model, message: bytes) -> Msg: try: item = process_recording_message(parse_recording_message(message)) - return Append(item=item, offset=offset) + return Append(item=item) except Exception: - return AppendOffset(offset=offset) + return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg] | None]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: match msg: - case Append(item=item, offset=offset): + case Append(item=item): model.buffer.append(item) - model.offsets.update(offset) return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) - case AppendOffset(offset=offset): - model.offsets.update(offset) + case Skip(): return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) case Committed(): - return (model, None) + return (model, Nothing()) case Flush(): return (model, Effect(fun=FlushBuffer(model), msg=lambda now: Flushed(now=now))) case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, Commit(msg=Committed(), offsets=model.offsets)) + return (model, Commit(msg=Committed())) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 6e92252e5f684a..c6b8ddd35de51a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -71,15 +71,9 @@ def join(self, timeout: float | None = None) -> None: @dataclass(frozen=True) class Commit(Generic[Msg]): - """Instructs the RunTime to commit the message offsets. - - Because the RunTime is based on a Kafka platform there is an expectation that the application - should trigger commits when necessary. While the application can't be sure its running Kafka - it can be sure its running on a Platform that requires offsets be committed. 
- """ + """Instructs the RunTime to commit the message offsets.""" msg: Msg - offsets: MutableMapping[Partition, int] @dataclass(frozen=True) @@ -158,7 +152,7 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes, MutableMapping[Partition, int]], Msg | None], + process: Callable[[Model, bytes], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: @@ -169,6 +163,7 @@ def __init__( self._commit: CommitProtocol | None = None self._model: Model | None = None + self._offsets: MutableMapping[Partition, int] = {} self._subscriptions: dict[str, Sub[Msg]] = {} @property @@ -192,13 +187,8 @@ def setup(self, flags: Flags, commit: CommitProtocol) -> None: self._register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: - self._handle_msg( - self.process( - self.model, - message.payload.value, - cast(MutableMapping[Partition, int], message.committable), - ) - ) + self._handle_msg(self.process(self.model, message.payload.value)) + self._offsets = cast(MutableMapping[Partition, int], message.committable) def publish(self, sub_name: str): # For each new subscription event we re-register the subscribers in case anything within @@ -231,8 +221,8 @@ def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: return None match cmd: - case Commit(msg=msg, offsets=offsets): - self.commit(offsets) + case Commit(msg=msg): + self.commit(self._offsets) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): return self._handle_msg(msg(fun())) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 79663b114e78ce..c0201feebc81da 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,6 +1,5 @@ -from collections.abc import Mapping, MutableMapping +from collections.abc import Mapping from datetime import datetime -from typing import cast from arroyo.backends.kafka import KafkaPayload from arroyo.types import BrokerValue, Message, Partition, Topic @@ -24,13 +23,7 @@ def _handle_msg(self, msg): msg = yield cmd def submit(self, message): - yield from self._handle_msg( - self.process( - self.model, - message.payload.value, - cast(MutableMapping[Partition, int], message.committable), - ) - ) + yield from self._handle_msg(self.process(self.model, message.payload.value)) class MockSink: diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index b4ad74b8b2c062..54f3468b3280b0 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -2,8 +2,6 @@ import uuid import zlib -from arroyo.types import Partition -from arroyo.types import Topic as ArroyoTopic from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec @@ -102,8 +100,6 @@ def test_end_to_end_message_processing(): assert runtime.model.last_flushed_at == 1 assert isinstance(cmd, Commit) assert isinstance(cmd.msg, Committed) - assert cmd.offsets == runtime.model.offsets - assert cmd.offsets == {Partition(topic=ArroyoTopic(name="a"), index=1): 2} def test_invalid_message_format(): @@ -114,7 +110,6 @@ def test_invalid_message_format(): gen = runtime.submit(make_kafka_message(b"invalid")) cmd = 
next(gen) assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert cmd.fun == time.time assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -135,7 +130,6 @@ def test_invalid_message_format(): cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_invalid_recording_json(): @@ -162,7 +156,6 @@ def test_invalid_recording_json(): gen = runtime.submit(kafka_message) cmd = next(gen) assert len(runtime.model.buffer) == 1 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -177,12 +170,10 @@ def test_invalid_recording_json(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_missing_headers(): @@ -209,7 +200,6 @@ def test_missing_headers(): gen = runtime.submit(kafka_message) cmd = next(gen) assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -224,12 +214,10 @@ def test_missing_headers(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_buffer_full_semantics(): From 2215007a153265b15ec3d7b0da2d6d02c196745f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:40:01 -0600 Subject: [PATCH 44/49] Fix typing --- src/sentry/replays/consumers/buffered/consumer.py | 2 +- src/sentry/replays/consumers/buffered/platform.py | 4 ++-- tests/sentry/replays/unit/consumers/test_recording.py | 10 +++++----- tests/sentry/replays/unit/consumers/test_runtime.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ed5484c408b9cf..5beb1ccba52310 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -80,7 +80,7 @@ class TryFlush: # A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append | Committed | Flush | Flushed | TryFlush +Msg = Append | Committed | Flush | Flushed | Skip | TryFlush # State machine functions. diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index c6b8ddd35de51a..2f0c12990966da 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -104,7 +104,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. 
Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Effect[Msg] | Task[Msg] +Cmd = Commit[Msg] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -190,7 +190,7 @@ def submit(self, message: Message[KafkaPayload]) -> None: self._handle_msg(self.process(self.model, message.payload.value)) self._offsets = cast(MutableMapping[Partition, int], message.committable) - def publish(self, sub_name: str): + def publish(self, sub_name: str) -> None: # For each new subscription event we re-register the subscribers in case anything within # the application has changed. I.e. the model is in some new state and that means we care # about a new subscription or don't care about an old one. diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 54f3468b3280b0..d69ea1eb8e22aa 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -34,7 +34,7 @@ def test_end_to_end_message_processing(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -139,7 +139,7 @@ def test_invalid_recording_json(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[', + "payload": b'{"segment_id":0}\n[', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -183,7 +183,7 @@ def test_missing_headers(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"[]", + "payload": b"[]", # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -226,7 +226,7 @@ def test_buffer_full_semantics(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -268,7 +268,7 @@ def test_buffer_timeout(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index c8ecfdbefabb64..f66e4b1fa70303 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -6,7 +6,7 @@ def counter_runtime() -> RunTime[int, str, None]: def init(_): return (22, None) - def process(_model, message, _offsets): + def process(_model, message): if message == b"incr": return "incr" elif message == b"decr": @@ -28,8 +28,8 @@ def update(model, msg): def subscription(_): return [ - Join(msg="join"), - Poll(msg="poll"), + Join(msg=lambda: "join"), + Poll(msg=lambda: "poll"), ] return RunTime( From 4cc15d78687ff05b34330ba715e3b4a9803bc720 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:51:54 -0600 Subject: [PATCH 45/49] Remove comments on offsets --- tests/sentry/replays/unit/consumers/test_recording.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 
d69ea1eb8e22aa..a66b3c252f5515 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -93,8 +93,7 @@ def test_end_to_end_message_processing(): assert cmd.fun == FlushBuffer(runtime.model) assert len(runtime.model.buffer) == 1 - # Assert the successful flush triggers a commit command. Assert the model's offsets were - # committed and the buffer was reset. + # Assert the successful flush triggers a commit command. cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 @@ -125,8 +124,7 @@ def test_invalid_message_format(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 From 4063f5d67d491f5390bd9567bf926e7a668dabef Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:58:51 -0600 Subject: [PATCH 46/49] Remove none-type messages --- .../replays/consumers/buffered/platform.py | 20 ++++++++----------- .../replays/unit/consumers/test_runtime.py | 14 +++++++------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 2f0c12990966da..300f6ce1a29002 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -151,10 +151,10 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes], Msg | None], + init: Callable[[Flags], tuple[Model, Cmd[Msg]]], + process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg]]], ) -> None: self.init = init self.process = process @@ -210,16 +210,12 @@ def publish(self, sub_name: str) -> None: case Poll(msg=msg): return self._handle_msg(msg()) - def _handle_msg(self, msg: Msg | None) -> None: - if msg: - model, cmd = self.update(self.model, msg) - self._model = model - self._handle_cmd(cmd) - - def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: - if cmd is None: - return None + def _handle_msg(self, msg: Msg) -> None: + model, cmd = self.update(self.model, msg) + self._model = model + self._handle_cmd(cmd) + def _handle_cmd(self, cmd: Cmd[Msg]) -> None: match cmd: case Commit(msg=msg): self.commit(self._offsets) diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index f66e4b1fa70303..ed7933fc116ec9 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,4 +1,4 @@ -from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime +from sentry.replays.consumers.buffered.platform import Join, Nothing, Poll, RunTime from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message @@ -12,17 +12,19 @@ def process(_model, message): elif message == b"decr": return "decr" else: - return None + return "nothing" def update(model, msg): if msg == "incr": - return (model + 1, None) + return (model + 1, Nothing()) elif msg == 
"decr": - return (model - 1, None) + return (model - 1, Nothing()) elif msg == "join": - return (-10, None) + return (-10, Nothing()) elif msg == "poll": - return (99, None) + return (99, Nothing()) + elif msg == "nothing": + return (model, Nothing()) else: raise ValueError("Unknown msg") From ea64a5083fbff0a4c210fd4b6eb5fe84659dbf7b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 15:53:26 -0600 Subject: [PATCH 47/49] Move offsets out of runtime and into platform strategy. Add support for arbitrary next step --- .../replays/consumers/buffered/consumer.py | 9 +- .../replays/consumers/buffered/platform.py | 106 ++++++++++-------- .../replays/unit/consumers/test_helpers.py | 36 ++---- .../replays/unit/consumers/test_recording.py | 31 ++--- .../replays/unit/consumers/test_runtime.py | 24 ++-- 5 files changed, 99 insertions(+), 107 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 5beb1ccba52310..ecb90fdd9eb4bb 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -10,10 +10,11 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, - Commit, Effect, Join, + NextStep, Nothing, + Out, Poll, RunTime, Sub, @@ -86,7 +87,7 @@ class TryFlush: # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg]]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg, Out]]: return ( Model( buffer=[], @@ -108,7 +109,7 @@ def process(_: Model, message: bytes) -> Msg: return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, Out]]: match msg: case Append(item=item): model.buffer.append(item) @@ -122,7 +123,7 @@ def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, Commit(msg=Committed())) + return (model, NextStep(msg=Committed(), value=None)) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 300f6ce1a29002..3d7939710e879a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -1,57 +1,75 @@ from collections.abc import Callable, MutableMapping from dataclasses import dataclass from datetime import datetime -from typing import Any, Generic, TypeVar, cast +from typing import Any, Generic, TypeVar from arroyo.backends.kafka.consumer import KafkaPayload -from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy -from arroyo.types import Commit as ArroyoCommit -from arroyo.types import Message, Partition, Value +from arroyo.processing.strategies import MessageRejected, ProcessingStrategy +from arroyo.types import FilteredPayload, Message, Partition, Value +Out = TypeVar("Out") +Input = FilteredPayload | KafkaPayload -class PlatformStrategy(ProcessingStrategy[KafkaPayload]): + +class PlatformStrategy(ProcessingStrategy[Input], Generic[Out]): def __init__( self, - commit: ArroyoCommit, flags: "Flags", - runtime: "RunTime[Model, Msg, Flags]", + runtime: "RunTime[Model, Msg, Flags, Out]", + next_step: "ProcessingStrategy[Out]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. 
- runtime.setup(flags, self.handle_commit_request) + runtime.setup(flags, self._handle_next_step) - self.__commit_step = CommitOffsets(commit) self.__closed = False + self.__next_step = next_step + self.__offsets: MutableMapping[Partition, int] = {} self.__runtime = runtime - def handle_commit_request(self, offsets: MutableMapping[Partition, int]) -> None: - self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) - - def submit(self, message: Message[KafkaPayload]) -> None: + # NOTE: Filtered payloads update the offsets but are not forwarded to the next step. My + # concern is that the filtered payload could have its offsets committed before the + # preceeding messages have their offsets committed. This is against what the Arroyo library + # does which is to forward everything. + def submit(self, message: Message[FilteredPayload | KafkaPayload]) -> None: assert not self.__closed - self.__runtime.submit(message) + + if isinstance(message.payload, KafkaPayload): + self.__runtime.submit(message.payload.value) + self.__offsets.update(message.committable) + else: + self.__offsets.update(message.committable) def poll(self) -> None: assert not self.__closed - self.__runtime.publish("poll") - self.__commit_step.poll() + + try: + self.__runtime.publish("poll") + except MessageRejected: + pass + + self.__next_step.poll() def close(self) -> None: self.__closed = True - self.__commit_step.close() def terminate(self) -> None: self.__closed = True - self.__commit_step.terminate() + self.__next_step.terminate() def join(self, timeout: float | None = None) -> None: - self.__runtime.publish("join") - self.__commit_step.close() - self.__commit_step.join(timeout) + try: + self.__runtime.publish("join") + except MessageRejected: + pass + + self.__next_step.close() + self.__next_step.join(timeout) + def _handle_next_step(self, value: Out) -> None: + self.__next_step.submit(Message(Value(value, self.__offsets, datetime.now()))) -CommitProtocol = Callable[[MutableMapping[Partition, int]], None] # A Model represents the state of your application. It is a type variable and the RunTime is # generic over it. Your state can be anything from a simple integer to a large class with many @@ -65,15 +83,13 @@ def join(self, timeout: float | None = None) -> None: # A generic type representing the structure of the flags passed to the RunTime instance. Flags = TypeVar("Flags") -# A generic type of unknown origins not tied to anything in the platform. -T = TypeVar("T") - @dataclass(frozen=True) -class Commit(Generic[Msg]): - """Instructs the RunTime to commit the message offsets.""" +class NextStep(Generic[Msg, Out]): + """Instructs the RunTime to produce to the next step.""" msg: Msg + value: Out @dataclass(frozen=True) @@ -104,7 +120,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Effect[Msg] | Nothing | Task[Msg] +Cmd = NextStep[Msg, Out] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -139,7 +155,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg, Flags]): +class RunTime(Generic[Model, Msg, Flags, Out]): """RunTime object. 
The RunTime is an intermediate data structure which manages communication between the platform @@ -151,44 +167,42 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg]]], + init: Callable[[Flags], tuple[Model, Cmd[Msg, Out]]], process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg]]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Out]]], ) -> None: self.init = init self.process = process self.subscription = subscription self.update = update - self._commit: CommitProtocol | None = None + self._next_step: Callable[[Out], None] | None = None self._model: Model | None = None - self._offsets: MutableMapping[Partition, int] = {} self._subscriptions: dict[str, Sub[Msg]] = {} - @property - def commit(self) -> CommitProtocol: - assert self._commit is not None - return self._commit - @property def model(self) -> Model: assert self._model is not None return self._model + @property + def next_step(self) -> Callable[[Out], None]: + assert self._next_step is not None + return self._next_step + # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need # the assert checks on model and commit. - def setup(self, flags: Flags, commit: CommitProtocol) -> None: - self._commit = commit + def setup(self, flags: Flags, next_step: Callable[[Out], None]) -> None: + self._next_step = next_step model, cmd = self.init(flags) self._model = model self._handle_cmd(cmd) self._register_subscriptions() - def submit(self, message: Message[KafkaPayload]) -> None: - self._handle_msg(self.process(self.model, message.payload.value)) - self._offsets = cast(MutableMapping[Partition, int], message.committable) + def submit(self, message: bytes) -> None: + self._handle_msg(self.process(self.model, message)) def publish(self, sub_name: str) -> None: # For each new subscription event we re-register the subscribers in case anything within @@ -215,10 +229,10 @@ def _handle_msg(self, msg: Msg) -> None: self._model = model self._handle_cmd(cmd) - def _handle_cmd(self, cmd: Cmd[Msg]) -> None: + def _handle_cmd(self, cmd: Cmd[Msg, Out]) -> None: match cmd: - case Commit(msg=msg): - self.commit(self._offsets) + case NextStep(msg=msg, value=value): + self.next_step(value) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): return self._handle_msg(msg(fun())) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index c0201feebc81da..9d7743f9ab50a2 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,21 +1,7 @@ -from collections.abc import Mapping -from datetime import datetime +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Out, RunTime -from arroyo.backends.kafka import KafkaPayload -from arroyo.types import BrokerValue, Message, Partition, Topic -from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, RunTime - - -class MockCommit: - def __init__(self): - self.commit = {} - - def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> None: - self.commit.update(offsets) - - -class MockRunTime(RunTime[Model, Msg, Flags]): +class MockRunTime(RunTime[Model, Msg, Flags, Out]): def _handle_msg(self, msg): while True: model, cmd = self.update(self.model, msg) @@ -23,7 +9,15 @@ def _handle_msg(self, msg): msg = yield cmd def submit(self, message): - 
yield from self._handle_msg(self.process(self.model, message.payload.value)) + yield from self._handle_msg(self.process(self.model, message)) + + +class MockNextStep: + def __init__(self): + self.values = [] + + def __call__(self, value): + self.values.append(value) class MockSink: @@ -32,11 +26,3 @@ def __init__(self): def accept(self, buffer): self.accepted.extend(buffer) - - -def make_kafka_message(message: bytes, topic: str = "a", index: int = 1, offset: int = 1): - return Message( - BrokerValue( - KafkaPayload(b"k", message, []), Partition(Topic(topic), index), offset, datetime.now() - ) - ) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index a66b3c252f5515..32d5b847d6a3e8 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -15,14 +15,10 @@ subscription, update, ) -from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task +from sentry.replays.consumers.buffered.platform import Effect, NextStep, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -from tests.sentry.replays.unit.consumers.test_helpers import ( - MockCommit, - MockRunTime, - make_kafka_message, -) +from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep, MockRunTime RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) @@ -45,9 +41,8 @@ def test_end_to_end_message_processing(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) # Assert the runtime gets the current time after appending the message and then attempts to # flush the buffer with the current time. @@ -97,7 +92,7 @@ def test_end_to_end_message_processing(): cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 - assert isinstance(cmd, Commit) + assert isinstance(cmd, NextStep) assert isinstance(cmd.msg, Committed) @@ -106,7 +101,7 @@ def test_invalid_message_format(): runtime = _make_runtime() # We submit a message which can't be parsed and will not be buffered. Flush is not triggered. - gen = runtime.submit(make_kafka_message(b"invalid")) + gen = runtime.submit(b"invalid") cmd = next(gen) assert len(runtime.model.buffer) == 0 assert isinstance(cmd, Effect) @@ -148,10 +143,9 @@ def test_invalid_recording_json(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will not be buffered. Flush is not triggered. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert len(runtime.model.buffer) == 1 assert isinstance(cmd, Effect) @@ -192,10 +186,9 @@ def test_missing_headers(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will not be buffered. Flush is not triggered. 
- gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert len(runtime.model.buffer) == 0 assert isinstance(cmd, Effect) @@ -235,10 +228,9 @@ def test_buffer_full_semantics(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will be buffered but not flushed. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -248,7 +240,7 @@ def test_buffer_full_semantics(): assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) # We submit another message which will be buffered and trigger a flush. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -277,10 +269,9 @@ def test_buffer_timeout(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will be buffered but not flushed. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -305,6 +296,6 @@ def _make_runtime(): "max_buffer_wait": 1, "max_workers": 1, }, - MockCommit(), + MockNextStep(), ) return runtime diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index ed7933fc116ec9..151141c387ed96 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,8 +1,8 @@ from sentry.replays.consumers.buffered.platform import Join, Nothing, Poll, RunTime -from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep -def counter_runtime() -> RunTime[int, str, None]: +def counter_runtime() -> RunTime[int, str, None, None]: def init(_): return (22, None) @@ -44,37 +44,37 @@ def subscription(_): def test_runtime_setup(): runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 def test_runtime_submit(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 # Two incr commands increase the count by 2. - runtime.submit(make_kafka_message(b"incr")) - runtime.submit(make_kafka_message(b"incr")) + runtime.submit(b"incr") + runtime.submit(b"incr") assert runtime.model == 24 # Four decr commands decrease the count by 4. - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) + runtime.submit(b"decr") + runtime.submit(b"decr") + runtime.submit(b"decr") + runtime.submit(b"decr") assert runtime.model == 20 # Messages which the application does not understand do nothing to the model. - runtime.submit(make_kafka_message(b"other")) + runtime.submit(b"other") assert runtime.model == 20 def test_runtime_publish(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 # A join event updates the model and sets it to -10. 
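At this point in the series the application surface is small enough to sketch end to end. Below is a minimal, hypothetical application built on the RunTime introduced by this patch: `RunTime`, `NextStep`, and `Nothing` are the types from the diff above, while the counting behaviour, the message strings, and the `counting_runtime` name are illustrative only (`NextStep` is renamed to `Commit` later in the series).

# A hypothetical application on top of the RunTime API as of PATCH 47/49. Only RunTime,
# NextStep, and Nothing are taken from the diff above; the counting logic is invented
# for illustration.
from sentry.replays.consumers.buffered.platform import NextStep, Nothing, RunTime


def init(_flags: None) -> tuple[int, Nothing]:
    # Start with a count of zero and issue no command.
    return (0, Nothing())


def process(_model: int, message: bytes) -> str:
    # Translate the raw Kafka payload into an application message.
    return "seen" if message else "empty"


def update(model: int, msg: str) -> tuple[int, NextStep[str, int] | Nothing]:
    if msg == "seen":
        # Forward the running total to whatever next step the platform was given
        # (for example CommitOffsets). Offsets are tracked by the PlatformStrategy
        # now, so the model no longer needs to carry them.
        return (model + 1, NextStep(msg="forwarded", value=model + 1))
    # "forwarded", "empty", and anything unrecognized leave the model unchanged.
    return (model, Nothing())


def subscription(_model: int) -> list:
    return []


counting_runtime: RunTime[int, str, None, int] = RunTime(
    init=init,
    process=process,
    subscription=subscription,
    update=update,
)
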
From 62d74337ab21a327c1dd59078bddd8caeec76c3c Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 16:07:28 -0600 Subject: [PATCH 48/49] Fix typing --- .../replays/consumers/buffered/consumer.py | 5 ++- .../replays/consumers/buffered/factory.py | 10 +++--- .../replays/consumers/buffered/platform.py | 32 +++++++++---------- .../replays/unit/consumers/test_helpers.py | 4 +-- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ecb90fdd9eb4bb..ed93363489b403 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -14,7 +14,6 @@ Join, NextStep, Nothing, - Out, Poll, RunTime, Sub, @@ -87,7 +86,7 @@ class TryFlush: # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg, Out]]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg, None]]: return ( Model( buffer=[], @@ -109,7 +108,7 @@ def process(_: Model, message: bytes) -> Msg: return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, Out]]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, None]]: match msg: case Append(item=item): model.buffer.append(item) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 56566d449d2053..5cc3ba59bbecea 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -8,8 +8,8 @@ from arroyo.backends.kafka.consumer import KafkaPayload from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory -from arroyo.types import Commit as ArroyoCommit -from arroyo.types import Partition +from arroyo.processing.strategies.commit import CommitOffsets +from arroyo.types import Commit, Partition from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer from sentry.replays.consumers.buffered.platform import PlatformStrategy @@ -26,7 +26,9 @@ def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: in def create_with_partitions( self, - commit: ArroyoCommit, + commit: Commit, partitions: Mapping[Partition, int], ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_consumer) + return PlatformStrategy( + next_step=CommitOffsets(commit), flags=self.flags, runtime=recording_consumer + ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 3d7939710e879a..6639cade87da25 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -7,17 +7,17 @@ from arroyo.processing.strategies import MessageRejected, ProcessingStrategy from arroyo.types import FilteredPayload, Message, Partition, Value -Out = TypeVar("Out") -Input = FilteredPayload | KafkaPayload +Output = TypeVar("Output") +Input = KafkaPayload -class PlatformStrategy(ProcessingStrategy[Input], Generic[Out]): +class PlatformStrategy(ProcessingStrategy[FilteredPayload | Input], Generic[Output]): def __init__( self, flags: "Flags", - runtime: "RunTime[Model, Msg, Flags, Out]", - next_step: "ProcessingStrategy[Out]", + runtime: "RunTime[Model, Msg, Flags, Output]", + next_step: "ProcessingStrategy[Output]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. 
@@ -67,7 +67,7 @@ def join(self, timeout: float | None = None) -> None: self.__next_step.close() self.__next_step.join(timeout) - def _handle_next_step(self, value: Out) -> None: + def _handle_next_step(self, value: Output) -> None: self.__next_step.submit(Message(Value(value, self.__offsets, datetime.now()))) @@ -85,11 +85,11 @@ def _handle_next_step(self, value: Out) -> None: @dataclass(frozen=True) -class NextStep(Generic[Msg, Out]): +class NextStep(Generic[Msg, Output]): """Instructs the RunTime to produce to the next step.""" msg: Msg - value: Out + value: Output @dataclass(frozen=True) @@ -120,7 +120,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = NextStep[Msg, Out] | Effect[Msg] | Nothing | Task[Msg] +Cmd = NextStep[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -155,7 +155,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg, Flags, Out]): +class RunTime(Generic[Model, Msg, Flags, Output]): """RunTime object. The RunTime is an intermediate data structure which manages communication between the platform @@ -167,17 +167,17 @@ class RunTime(Generic[Model, Msg, Flags, Out]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg, Out]]], + init: Callable[[Flags], tuple[Model, Cmd[Msg, Output]]], process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Out]]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Output]]], ) -> None: self.init = init self.process = process self.subscription = subscription self.update = update - self._next_step: Callable[[Out], None] | None = None + self._next_step: Callable[[Output], None] | None = None self._model: Model | None = None self._subscriptions: dict[str, Sub[Msg]] = {} @@ -187,13 +187,13 @@ def model(self) -> Model: return self._model @property - def next_step(self) -> Callable[[Out], None]: + def next_step(self) -> Callable[[Output], None]: assert self._next_step is not None return self._next_step # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need # the assert checks on model and commit. 
- def setup(self, flags: Flags, next_step: Callable[[Out], None]) -> None: + def setup(self, flags: Flags, next_step: Callable[[Output], None]) -> None: self._next_step = next_step model, cmd = self.init(flags) @@ -229,7 +229,7 @@ def _handle_msg(self, msg: Msg) -> None: self._model = model self._handle_cmd(cmd) - def _handle_cmd(self, cmd: Cmd[Msg, Out]) -> None: + def _handle_cmd(self, cmd: Cmd[Msg, Output]) -> None: match cmd: case NextStep(msg=msg, value=value): self.next_step(value) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 9d7743f9ab50a2..4ec8fc240f064f 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,7 +1,7 @@ -from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Out, RunTime +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Output, RunTime -class MockRunTime(RunTime[Model, Msg, Flags, Out]): +class MockRunTime(RunTime[Model, Msg, Flags, Output]): def _handle_msg(self, msg): while True: model, cmd = self.update(self.model, msg) From 895185b9913cde30ab449403003843d2fe5269cd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 16:26:36 -0600 Subject: [PATCH 49/49] Docstrings and renames --- src/sentry/replays/consumers/buffered/consumer.py | 11 ++++++++--- src/sentry/replays/consumers/buffered/platform.py | 10 +++------- tests/sentry/replays/unit/consumers/test_recording.py | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ed93363489b403..5ad55b55631ccc 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,4 +1,9 @@ -"""Session Replay recording consumer implementation.""" +"""Session Replay recording consumer implementation. + +The consumer implementation follows a batching flush strategy. We accept messages, process them, +buffer them, and when some threshold is reached we flush the buffer. The batch has finished work +after the buffer is flushed so we commit with a None value. +""" import contextlib import time @@ -10,9 +15,9 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, + Commit, Effect, Join, - NextStep, Nothing, Poll, RunTime, @@ -122,7 +127,7 @@ def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, None]]: case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, NextStep(msg=Committed(), value=None)) + return (model, Commit(msg=Committed(), value=None)) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 6639cade87da25..bf07962b3532d9 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -28,10 +28,6 @@ def __init__( self.__offsets: MutableMapping[Partition, int] = {} self.__runtime = runtime - # NOTE: Filtered payloads update the offsets but are not forwarded to the next step. My - # concern is that the filtered payload could have its offsets committed before the - # preceeding messages have their offsets committed. This is against what the Arroyo library - # does which is to forward everything. 
def submit(self, message: Message[FilteredPayload | KafkaPayload]) -> None: assert not self.__closed @@ -85,7 +81,7 @@ def _handle_next_step(self, value: Output) -> None: @dataclass(frozen=True) -class NextStep(Generic[Msg, Output]): +class Commit(Generic[Msg, Output]): """Instructs the RunTime to produce to the next step.""" msg: Msg @@ -120,7 +116,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = NextStep[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] +Cmd = Commit[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -231,7 +227,7 @@ def _handle_msg(self, msg: Msg) -> None: def _handle_cmd(self, cmd: Cmd[Msg, Output]) -> None: match cmd: - case NextStep(msg=msg, value=value): + case Commit(msg=msg, value=value): self.next_step(value) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 32d5b847d6a3e8..c58f3627044e41 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -15,7 +15,7 @@ subscription, update, ) -from sentry.replays.consumers.buffered.platform import Effect, NextStep, Nothing, Task +from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep, MockRunTime @@ -92,7 +92,7 @@ def test_end_to_end_message_processing(): cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 - assert isinstance(cmd, NextStep) + assert isinstance(cmd, Commit) assert isinstance(cmd.msg, Committed)
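
Assembled, the series wires together as a small strategy factory. The sketch below follows the `factory.py` shape visible in PATCH 48/49 above: `PlatformStrategy`, `recording_consumer`, `Flags`, and `CommitOffsets` come from the diffs, while the factory class name and the exact flag plumbing are assumptions, since only part of `factory.py` appears in the series.

# A sketch of the final wiring, under the assumptions stated above. Committing offsets
# is just another "next step": the RunTime emits a Commit command, the PlatformStrategy
# attaches the offsets it has been tracking, and CommitOffsets performs the commit.
from collections.abc import Mapping

from arroyo.backends.kafka.consumer import KafkaPayload
from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory
from arroyo.processing.strategies.commit import CommitOffsets
from arroyo.types import Commit, Partition

from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer
from sentry.replays.consumers.buffered.platform import PlatformStrategy


class ReplayRecordingStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
    # The class name here is illustrative; the constructor signature mirrors the hunk
    # context shown in the factory.py diff.
    def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: int) -> None:
        self.flags: Flags = {
            "max_buffer_length": max_buffer_length,
            "max_buffer_wait": max_buffer_wait,
            "max_workers": max_workers,
        }

    def create_with_partitions(
        self,
        commit: Commit,
        partitions: Mapping[Partition, int],
    ) -> ProcessingStrategy[KafkaPayload]:
        return PlatformStrategy(
            next_step=CommitOffsets(commit),
            flags=self.flags,
            runtime=recording_consumer,
        )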