From b8cb41384281acb278841331735ff4a4dd9d7ffd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 18 Feb 2025 08:53:01 -0600 Subject: [PATCH 01/49] Add buffered consumer runtime implementation --- .../replays/consumers/buffered/__init__.py | 0 .../replays/consumers/buffered/consumer.py | 45 +++++ src/sentry/replays/consumers/buffered/lib.py | 100 +++++++++++ .../replays/consumers/buffered/platform.py | 158 ++++++++++++++++++ 4 files changed, 303 insertions(+) create mode 100644 src/sentry/replays/consumers/buffered/__init__.py create mode 100644 src/sentry/replays/consumers/buffered/consumer.py create mode 100644 src/sentry/replays/consumers/buffered/lib.py create mode 100644 src/sentry/replays/consumers/buffered/platform.py diff --git a/src/sentry/replays/consumers/buffered/__init__.py b/src/sentry/replays/consumers/buffered/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py new file mode 100644 index 00000000000000..761873edc1318a --- /dev/null +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -0,0 +1,45 @@ +import dataclasses +import time + +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime + + +class Flusher: + + def __init__(self, flags: dict[str, str]) -> None: + self.max_buffer_length = int(flags["max_buffer_length"]) + self.max_buffer_wait = int(flags["max_buffer_wait"]) + self.__last_flushed_at = time.time() + + def can_flush(self) -> bool: + return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at + + def do_flush(self) -> None: + self.__last_flushed_at = time.time() + # TODO: upload the bytes or something. + + +@dataclasses.dataclass +class Item: + recording: bytes + + +def init(flags: dict[str, str]) -> Model[Item]: + flusher = Flusher(flags) + return Model( + buffer=[], + can_flush=flusher.can_flush, + do_flush=flusher.do_flush, + offsets={}, + retries=0, + ) + + +def process_message(message: bytes) -> Item: + return Item(recording=message) + + +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py new file mode 100644 index 00000000000000..7aece6f5012d8e --- /dev/null +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -0,0 +1,100 @@ +"""Buffered RunTime implementation.""" + +from collections.abc import Callable, MutableMapping +from dataclasses import dataclass +from functools import partial +from typing import Generic, TypeVar + +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.platform import Cmd, Commit, Nothing, RunTime, Task + +Item = TypeVar("Item") +T = TypeVar("T") + + +@dataclass +class Model(Generic[Item]): + buffer: list[Item] + can_flush: Callable[[], bool] + do_flush: Callable[[], None] + offsets: MutableMapping[Partition, int] + retries: int + + +class Msg(Generic[Item]): + pass + + +@dataclass(frozen=True) +class Append(Msg[Item]): + item: Item + offset: MutableMapping[Partition, int] + + +class Committed(Msg[Item]): + pass + + +class Flush(Msg[Item]): + pass + + +def process( + process_fn: Callable[[bytes], Item], + model: Model[Item], + message: bytes, + offset: MutableMapping[Partition, int], +) -> Msg[Item] | None: + return Append(item=process_fn(message), offset=offset) + + +def init( + init_fn: Callable[[dict[str, str]], Model[Item]], + flags: dict[str, str], +) -> tuple[Model[Item], Cmd[Msg[Item]]]: + 
return (init_fn(flags), Nothing()) + + +def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: + match msg: + case Append(item=item, offset=offset): + model.buffer.append(item) + model.offsets.update(offset) + if model.can_flush(): + return (model, Task(msg=Flush())) + else: + return (model, None) + case Flush(): + # What should happen if we fail? If you raise an exception the platform will restart + # from the last checkpoint -- which is standard behavior. We could be more clever here + # and provide error handling facilities or we could accept that this problem gets too + # complicated to reasonably abstract and have developers implement their own buffering + # consumer. + model.do_flush() + model.buffer = [] + return (model, Commit(msg=Committed(), offsets=model.offsets)) + case Committed(): + return (model, None) + + # Satisfy mypy. Apparently we don't do exhaustiveness checks in our configuration. + return (model, None) + + +def subscription(model: Model[Item]) -> Msg[Item] | None: + if model.can_flush(): + return Flush() + else: + return None + + +def buffering_runtime( + init_fn: Callable[[dict[str, str]], Model[Item]], + process_fn: Callable[[bytes], Item], +) -> RunTime[Model[Item], Msg[Item]]: + return RunTime( + init=partial(init, init_fn), + process=partial(process, process_fn), + subscription=subscription, + update=update, + ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py new file mode 100644 index 00000000000000..a1c93dd9156199 --- /dev/null +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -0,0 +1,158 @@ +from collections.abc import Callable, Mapping, MutableMapping +from dataclasses import dataclass +from datetime import datetime +from typing import Generic, TypeVar + +from arroyo.backends.kafka.consumer import KafkaPayload +from arroyo.processing.strategies import ( + CommitOffsets, + ProcessingStrategy, + ProcessingStrategyFactory, +) +from arroyo.types import Commit as ArroyoCommit +from arroyo.types import Message, Partition, Value + + +class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): + + def __init__(self, flags: dict[str, str], runtime: "RunTime[Model, Msg]") -> None: + self.flags = flags + self.runtime = runtime + + def create_with_partitions( + self, + commit: ArroyoCommit, + partitions: Mapping[Partition, int], + ) -> ProcessingStrategy[KafkaPayload]: + return PlatformStrategy(commit=commit, flags=self.flags, runtime=self.runtime) + + +class PlatformStrategy(ProcessingStrategy[KafkaPayload]): + + def __init__( + self, + commit: ArroyoCommit, + flags: dict[str, str], + runtime: "RunTime[Model, Msg]", + ) -> None: + # The RunTime is made aware of the commit strategy. It will + # submit the partition, offset mapping it wants committed. 
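+        # Offsets arrive back through `handle_commit_request` (defined below), which wraps them in
+        # an empty-payload message and forwards them to the `CommitOffsets` step for the actual
+        # Kafka commit.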
+ runtime.setup(flags, self.handle_commit_request) + + self.__commit_step = CommitOffsets(commit) + self.__closed = False + self.__runtime = runtime + + def handle_commit_request(self, offsets: Mapping[Partition, int]) -> None: + self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) + + def submit(self, message: Message[KafkaPayload]) -> None: + assert not self.__closed + self.__runtime.submit(message) + + def poll(self) -> None: + assert not self.__closed + self.__commit_step.poll() + + def close(self) -> None: + self.__closed = True + self.__commit_step.close() + + def terminate(self) -> None: + self.__closed = True + self.__commit_step.terminate() + + def join(self, timeout: float | None = None) -> None: + self.__commit_step.join(timeout) + + +CommitProtocol = Callable[[Mapping[Partition, int]], None] +Model = TypeVar("Model") +Msg = TypeVar("Msg") + + +class Cmd(Generic[Msg]): + pass + + +@dataclass(frozen=True) +class BackPressure(Cmd[Msg]): + pass + + +@dataclass(frozen=True) +class Commit(Cmd[Msg]): + msg: Msg + offsets: MutableMapping[Partition, int] + + +class Nothing(Cmd[Msg]): + pass + + +@dataclass(frozen=True) +class Task(Cmd[Msg]): + msg: Msg + + +class RunTime(Generic[Model, Msg]): + + def __init__( + self, + init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], + process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], + subscription: Callable[[Model], Msg | None], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], + ) -> None: + self.init = init + self.process = process + self.subscription = subscription + self.update = update + + self.__commit: CommitProtocol | None = None + self.__model: Model | None = None + + @property + def commit(self) -> CommitProtocol: + assert self.__commit is not None + return self.__commit + + @property + def model(self) -> Model: + assert self.__model is not None + return self.__model + + def poll(self) -> None: + self.__handle_msg(self.subscription(self.model)) + + def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: + self.__commit = commit + + model, cmd = self.init(flags) + self.__model = model + self.__handle_cmd(cmd) + + def submit(self, message: Message[KafkaPayload]) -> None: + self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) + + def __handle_msg(self, msg: Msg | None) -> None: + if msg: + model, cmd = self.update(self.model, msg) + self.__model = model + self.__handle_cmd(cmd) + + def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: + if cmd is None: + return None + + match cmd: + case BackPressure(): + # TODO + ... 
+ case Commit(msg=msg, offsets=offsets): + self.commit(offsets) + return self.__handle_msg(msg) + case Nothing(): + return None + case Task(msg=msg): + return self.__handle_msg(msg) From d82c589b8d961508fb7aea6fef41ebcfcb108bbd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 18 Feb 2025 20:27:34 -0600 Subject: [PATCH 02/49] Begin adding process logic --- .../replays/consumers/buffered/consumer.py | 259 ++++++++++++++++-- src/sentry/replays/consumers/buffered/lib.py | 9 +- .../replays/usecases/ingest/dom_index.py | 5 +- 3 files changed, 246 insertions(+), 27 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 761873edc1318a..103864ef99089c 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,10 +1,96 @@ +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime + +# RunTime implementation. + + +def init(flags: dict[str, str]) -> Model["Item"]: + buffer = Buffer(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) + + +def process_message(message: bytes) -> "Item" | None: + return _process_message(message) + + +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) + + +# Business-logic implementation. import dataclasses +import logging import time +import uuid +import zlib +from typing import cast + +import sentry_sdk +from django.conf import settings +from sentry_kafka_schemas.codecs import Codec, ValidationError +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording + +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.replays.lib.storage import ( + RecordingSegmentStorageMeta, + make_recording_filename, + make_video_filename, + storage_kv, +) +from sentry.replays.usecases.ingest import ( + RecordingIngestMessage, + _report_size_metrics, + process_headers, + recording_post_processor, + track_initial_segment_event, +) +from sentry.replays.usecases.ingest.dom_index import ReplayActionsEvent, _initialize_publisher +from sentry.utils import json + +logger = logging.getLogger(__name__) + +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) + + +@dataclasses.dataclass(frozen=True) +class EventMetadata: + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + replay_id: str + retention_days: int + segment_id: int + + +@dataclasses.dataclass(frozen=True) +class Item: + action_events: list[ReplayActionsEvent] + key_name: str + metadata: EventMetadata + recording: bytes + video: bytes | None + + +@dataclasses.dataclass(frozen=True) +class FilePartRow: + key: str + range_start: int + range_stop: int -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +@dataclasses.dataclass(frozen=True) +class MergedBuffer: + action_events: list[ReplayActionsEvent] + billed_events: list[EventMetadata] + buffer: bytes + buffer_rows: list[FilePartRow] + remote_key: str -class Flusher: + +class Buffer: def __init__(self, flags: dict[str, str]) -> None: self.max_buffer_length = int(flags["max_buffer_length"]) @@ -14,32 +100,163 @@ def __init__(self, flags: dict[str, str]) -> None: def can_flush(self) -> bool: return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at - def do_flush(self) -> None: + def do_flush(self, model: Model[Item]) -> None: + merged_buffer = 
_merge_buffer(model.buffer) + _commit_merged_buffer(merged_buffer) self.__last_flushed_at = time.time() - # TODO: upload the bytes or something. -@dataclasses.dataclass -class Item: - recording: bytes +def _merge_buffer(buffer: list[Item]) -> MergedBuffer: + action_events = [] + billed_events: list[EventMetadata] = [] + parts = b"" + parts_rows = [] + remote_key = uuid.uuid4().hex + + for item in buffer: + # Extend the action events with whatever exists on the buffer item. + action_events.extend(item.action_events) + # Segment 0 events which are not replay-video are billed. + if item.metadata.segment_id == 0 and not item.metadata.is_replay_video: + billed_events.append(item.metadata) -def init(flags: dict[str, str]) -> Model[Item]: - flusher = Flusher(flags) - return Model( - buffer=[], - can_flush=flusher.can_flush, - do_flush=flusher.do_flush, - offsets={}, - retries=0, + # Append the recording and video if applicable to the zipped file. + meta = RecordingSegmentStorageMeta( + project_id=item.metadata.project_id, + replay_id=item.metadata.replay_id, + segment_id=item.metadata.segment_id, + retention_days=item.metadata.retention_days, + ) + + parts, part_row = _append_part(parts, item.recording, key=make_recording_filename(meta)) + parts_rows.append(part_row) + + if item.video: + parts, part_row = _append_part(parts, item.video, key=make_video_filename(meta)) + parts_rows.append(part_row) + + return MergedBuffer( + action_events=action_events, + billed_events=billed_events, + buffer=parts, + buffer_rows=parts_rows, + remote_key=remote_key, ) -def process_message(message: bytes) -> Item: - return Item(recording=message) +def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: + range_start = len(parts) + range_stop = range_start + len(part) - 1 + return (parts + part, FilePartRow(key, range_start, range_stop)) -recording_consumer = buffering_runtime( - init_fn=init, - process_fn=process_message, -) +def _commit_merged_buffer(buffer: MergedBuffer) -> None: + # Empty buffer's are not committed. + if buffer.buffer == b"": + return None + + # Upload recording. + storage_kv.set(buffer.remote_key, buffer.buffer) + + # Write rows. + # TODO: bulk insert rows + + # Emit billing. + for event in buffer.billed_events: + track_initial_segment_event( + event.org_id, + event.project_id, + event.replay_id, + event.key_id, + event.received, + event.is_replay_video, + ) + + # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The + # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed + # before we can consider this buffer fully flushed and commit our local offsets. 
+ publisher = _initialize_publisher(asynchronous=True) + for event in buffer.action_events: + publisher.publish("ingest-replay-events", json.dumps(event)) + publisher.flush() + + return None + + +def _process_message(message: bytes) -> Item | None: + transaction = sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) + }, + ) + + with transaction.start_child( + op="replays.consumers.buffered.process_message", name="ingest_recording" + ): + # set_tag("org_id", message.org_id) + # set_tag("project_id", message.project_id) + + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) + except ValidationError: + logger.exception("Could not decode recording message.") + return None + + message = RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) + + try: + headers, compressed_segment = process_headers(message.payload_with_headers) + except Exception: + logger.exception("Recording headers could not be extracted %s", message.replay_id) + return None + + # Segment is decompressed for further analysis. Packed format expects + # concatenated, uncompressed bytes. + try: + recording_segment = zlib.decompress(compressed_segment) + _report_size_metrics(len(compressed_segment), len(recording_segment)) + except zlib.error: + if compressed_segment[0] == ord("["): + recording_segment = compressed_segment + compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ + else: + logger.exception("Invalid recording body.") + return None + + recording_post_processor( + message, headers, recording_segment, message.replay_event, transaction + ) + + +@sentry_sdk.trace +def _decode_recording_message(message: bytes) -> RecordingIngestMessage | None: + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) + except ValidationError: + logger.exception("Could not decode recording message.") + return None + + return RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 7aece6f5012d8e..c6e52a7b29979b 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -19,7 +19,6 @@ class Model(Generic[Item]): can_flush: Callable[[], bool] do_flush: Callable[[], None] offsets: MutableMapping[Partition, int] - retries: int class Msg(Generic[Item]): @@ -41,12 +40,14 @@ class Flush(Msg[Item]): def process( - process_fn: Callable[[bytes], Item], + process_fn: Callable[[bytes], Item | None], model: Model[Item], message: bytes, offset: 
MutableMapping[Partition, int], ) -> Msg[Item] | None: - return Append(item=process_fn(message), offset=offset) + item = process_fn(message) + if item: + return Append(item=item, offset=offset) def init( @@ -90,7 +91,7 @@ def subscription(model: Model[Item]) -> Msg[Item] | None: def buffering_runtime( init_fn: Callable[[dict[str, str]], Model[Item]], - process_fn: Callable[[bytes], Item], + process_fn: Callable[[bytes], Item | None], ) -> RunTime[Model[Item], Msg[Item]]: return RunTime( init=partial(init, init_fn), diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index f80eb37abac26b..88ec9bb5f9635e 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -233,13 +233,14 @@ def _get_testid(container: dict[str, str]) -> str: ) -def _initialize_publisher() -> KafkaPublisher: +def _initialize_publisher(asynchronous: bool = True) -> KafkaPublisher: global replay_publisher if replay_publisher is None: config = kafka_config.get_topic_definition(Topic.INGEST_REPLAY_EVENTS) replay_publisher = KafkaPublisher( - kafka_config.get_kafka_producer_cluster_options(config["cluster"]) + kafka_config.get_kafka_producer_cluster_options(config["cluster"]), + asynchronous=asynchronous, ) return replay_publisher From e82ce0f84bba0393f1cc4a00cc730934a7469cab Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:10:52 -0600 Subject: [PATCH 03/49] Add tracing to each component of recording processing --- src/sentry/replays/consumers/recording.py | 69 +----- .../replays/consumers/recording_buffered.py | 21 +- .../replays/usecases/ingest/__init__.py | 219 ++++++++++-------- .../replays/usecases/ingest/dom_index.py | 4 + 4 files changed, 141 insertions(+), 172 deletions(-) diff --git a/src/sentry/replays/consumers/recording.py b/src/sentry/replays/consumers/recording.py index 5951dacacefb46..f384f99edc860a 100644 --- a/src/sentry/replays/consumers/recording.py +++ b/src/sentry/replays/consumers/recording.py @@ -1,5 +1,4 @@ import dataclasses -import logging from collections.abc import Mapping from typing import Any @@ -9,19 +8,11 @@ from arroyo.processing.strategies.abstract import ProcessingStrategy, ProcessingStrategyFactory from arroyo.processing.strategies.commit import CommitOffsets from arroyo.types import Commit, Message, Partition -from django.conf import settings -from sentry_kafka_schemas.codecs import Codec, ValidationError -from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry_sdk.tracing import Span -from sentry.conf.types.kafka_definition import Topic, get_topic_codec from sentry.replays.usecases.ingest import ingest_recording from sentry.utils.arroyo import MultiprocessingPool, run_task_with_multiprocessing -logger = logging.getLogger(__name__) - -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) - @dataclasses.dataclass class MessageContext: @@ -86,14 +77,11 @@ def create_with_partitions( ) else: # By default we preserve the previous behavior. 
- return RunTask( - function=initialize_threaded_context, - next_step=RunTaskInThreads( - processing_function=process_message_threaded, - concurrency=self.num_threads, - max_pending_futures=50, - next_step=CommitOffsets(commit), - ), + return RunTaskInThreads( + processing_function=process_message, + concurrency=self.num_threads, + max_pending_futures=50, + next_step=CommitOffsets(commit), ) def shutdown(self) -> None: @@ -101,51 +89,6 @@ def shutdown(self) -> None: self.pool.close() -def initialize_threaded_context(message: Message[KafkaPayload]) -> MessageContext: - """Initialize a Sentry transaction and unpack the message.""" - # TODO-anton: remove sampled here and let traces_sampler decide - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - return MessageContext(message.payload.value, transaction, isolation_scope) - - -def process_message_threaded(message: Message[MessageContext]) -> Any: - """Move the replay payload to permanent storage.""" - context: MessageContext = message.payload - - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(context.message) - except ValidationError: - # TODO: DLQ - logger.exception("Could not decode recording message.") - return None - - ingest_recording(message_dict, context.transaction, context.isolation_scope) - - def process_message(message: Message[KafkaPayload]) -> Any: """Move the replay payload to permanent storage.""" - # TODO-anton: remove sampled here and let traces_sampler decide - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) - except ValidationError: - # TODO: DLQ - logger.exception("Could not decode recording message.") - return None - - ingest_recording(message_dict, transaction, isolation_scope) + ingest_recording(message.payload.value) diff --git a/src/sentry/replays/consumers/recording_buffered.py b/src/sentry/replays/consumers/recording_buffered.py index 40c104414883a1..a4f04d49706078 100644 --- a/src/sentry/replays/consumers/recording_buffered.py +++ b/src/sentry/replays/consumers/recording_buffered.py @@ -65,7 +65,8 @@ ) from sentry.replays.usecases.ingest import ( LOG_SAMPLE_RATE, - process_headers, + DropSilently, + parse_headers, track_initial_segment_event, ) from sentry.replays.usecases.ingest.dom_index import ( @@ -211,7 +212,10 @@ def has_exceeded_last_buffer_commit_time(self) -> bool: return time.time() >= self._buffer_next_commit_time def append(self, message: BaseValue[KafkaPayload]) -> None: - process_message(self, message.payload.value) + try: + process_message(self, message.payload.value) + except DropSilently: + pass def new(self) -> RecordingBuffer: return RecordingBuffer( @@ -233,16 +237,9 @@ def process_message(buffer: RecordingBuffer, message: bytes) -> None: logger.exception("Could not decode recording message.") return None - try: - headers, compressed_segment = process_headers( - cast_payload_bytes(decoded_message["payload"]) - ) - except Exception: - # TODO: DLQ - logger.exception( - "Recording headers 
could not be extracted %s", decoded_message["replay_id"] - ) - return None + headers, compressed_segment = parse_headers( + cast_payload_bytes(decoded_message["payload"]), decoded_message["replay_id"] + ) # Segment is decompressed for further analysis. Packed format expects # concatenated, uncompressed bytes. diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 04bc1dbba4b1eb..af07134025858e 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -4,14 +4,17 @@ import logging import zlib from datetime import datetime, timezone -from typing import TypedDict, cast +from typing import Any, TypedDict, cast +import sentry_sdk import sentry_sdk.scope +from django.conf import settings +from sentry_kafka_schemas.codecs import Codec, ValidationError from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry_sdk import Scope, set_tag -from sentry_sdk.tracing import Span +from sentry_sdk import set_tag from sentry import options +from sentry.conf.types.kafka_definition import Topic, get_topic_codec from sentry.constants import DataCategory from sentry.logging.handlers import SamplingFilter from sentry.models.project import Project @@ -20,7 +23,11 @@ make_recording_filename, storage_kv, ) -from sentry.replays.usecases.ingest.dom_index import log_canvas_size, parse_and_emit_replay_actions +from sentry.replays.usecases.ingest.dom_index import ( + ReplayActionsEvent, + emit_replay_actions, + parse_replay_actions, +) from sentry.replays.usecases.pack import pack from sentry.signals import first_replay_received from sentry.utils import json, metrics @@ -29,11 +36,16 @@ CACHE_TIMEOUT = 3600 COMMIT_FREQUENCY_SEC = 1 LOG_SAMPLE_RATE = 0.01 +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) logger = logging.getLogger("sentry.replays") logger.addFilter(SamplingFilter(LOG_SAMPLE_RATE)) +class DropSilently(Exception): + pass + + class ReplayRecordingSegment(TypedDict): id: str # a uuid that individualy identifies a recording segment chunks: int # the number of chunks for this segment @@ -70,41 +82,39 @@ class RecordingIngestMessage: replay_video: bytes | None -@metrics.wraps("replays.usecases.ingest.ingest_recording") -def ingest_recording( - message_dict: ReplayRecording, transaction: Span, isolation_scope: Scope -) -> None: +def ingest_recording(message: bytes) -> None: """Ingest non-chunked recording messages.""" + isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() + with sentry_sdk.scope.use_isolation_scope(isolation_scope): - with transaction.start_child( - op="replays.usecases.ingest.ingest_recording", - name="ingest_recording", - ): - message = RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), - ) - _ingest_recording(message, transaction) - - -def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> None: + sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr( + 
settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 + ) + }, + ) + + try: + _ingest_recording(message) + except DropSilently: + # The message couldn't be parsed for whatever reason. We shouldn't block the consumer + # so we ignore it. + pass + + +@sentry_sdk.trace +def _ingest_recording(message_bytes: bytes) -> None: """Ingest recording messages.""" + message = parse_recording_message(message_bytes) + set_tag("org_id", message.org_id) set_tag("project_id", message.project_id) - try: - headers, compressed_segment = process_headers(message.payload_with_headers) - except Exception: - # TODO: DLQ - logger.exception("Recording headers could not be extracted %s", message.replay_id) - return None + headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) + compressed_segment, recording_segment = decompress_segment(compressed_segment) # Normalize ingest data into a standardized ingest format. segment_data = RecordingSegmentStorageMeta( @@ -114,19 +124,6 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non retention_days=message.retention_days, ) - # Segment is decompressed for further analysis. Packed format expects - # concatenated, uncompressed bytes. - try: - recording_segment = zlib.decompress(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - except zlib.error: - if compressed_segment[0] == ord("["): - recording_segment = compressed_segment - compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ - else: - logger.exception("Invalid recording body.") - return None - if message.replay_video: # Logging org info for bigquery logger.info( @@ -157,7 +154,7 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non else: storage_kv.set(make_recording_filename(segment_data), compressed_segment) - recording_post_processor(message, headers, recording_segment, message.replay_event, transaction) + recording_post_processor(message, headers, recording_segment, message.replay_event) # The first segment records an accepted outcome. This is for billing purposes. Subsequent # segments are not billed. 
@@ -171,9 +168,8 @@ def _ingest_recording(message: RecordingIngestMessage, transaction: Span) -> Non is_replay_video=message.replay_video is not None, ) - transaction.finish() - +@sentry_sdk.trace def track_initial_segment_event( org_id: int, project_id: int, @@ -229,14 +225,6 @@ def should_skip_billing(org_id: int, is_replay_video: bool) -> bool: return is_replay_video and org_id in options.get("replay.replay-video.billing-skip-org-ids") -@metrics.wraps("replays.usecases.ingest.process_headers") -def process_headers(bytes_with_headers: bytes) -> tuple[RecordingSegmentHeaders, bytes]: - recording_headers_json, recording_segment = bytes_with_headers.split(b"\n", 1) - recording_headers = json.loads(recording_headers_json) - assert isinstance(recording_headers.get("segment_id"), int) - return recording_headers, recording_segment - - def replay_recording_segment_cache_id(project_id: int, replay_id: str, segment_id: str) -> str: return f"{project_id}:{replay_id}:{segment_id}" @@ -254,53 +242,18 @@ def _report_size_metrics( ) +@sentry_sdk.trace def recording_post_processor( message: RecordingIngestMessage, headers: RecordingSegmentHeaders, segment_bytes: bytes, replay_event_bytes: bytes | None, - transaction: Span, ) -> None: try: - with metrics.timer("replays.usecases.ingest.decompress_and_parse"): - parsed_segment_data = json.loads(segment_bytes) - parsed_replay_event = json.loads(replay_event_bytes) if replay_event_bytes else None - - # Emit DOM search metadata to Clickhouse. - with transaction.start_child( - op="replays.usecases.ingest.parse_and_emit_replay_actions", - name="parse_and_emit_replay_actions", - ): - project = Project.objects.get_from_cache(id=message.project_id) - parse_and_emit_replay_actions( - retention_days=message.retention_days, - project=project, - replay_id=message.replay_id, - segment_data=parsed_segment_data, - replay_event=parsed_replay_event, - org_id=message.org_id, - ) - - # Log canvas mutations to bigquery. - log_canvas_size( - message.org_id, - message.project_id, - message.replay_id, - parsed_segment_data, - ) - - # Log # of rrweb events to bigquery. 
- logger.info( - "sentry.replays.slow_click", - extra={ - "event_type": "rrweb_event_count", - "org_id": message.org_id, - "project_id": message.project_id, - "replay_id": message.replay_id, - "size": len(parsed_segment_data), - }, - ) - + segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) + actions_event = try_get_replay_actions(message, segment, replay_event) + if actions_event: + emit_replay_actions(actions_event) except Exception: logging.exception( "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", @@ -309,3 +262,75 @@ def recording_post_processor( message.replay_id, headers["segment_id"], ) + + +@sentry_sdk.trace +def parse_recording_message(message: bytes) -> RecordingIngestMessage: + try: + message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) + except ValidationError: + logger.exception("Could not decode recording message.") + raise DropSilently() + + return RecordingIngestMessage( + replay_id=message_dict["replay_id"], + key_id=message_dict.get("key_id"), + org_id=message_dict["org_id"], + project_id=message_dict["project_id"], + received=message_dict["received"], + retention_days=message_dict["retention_days"], + payload_with_headers=cast(bytes, message_dict["payload"]), + replay_event=cast(bytes | None, message_dict.get("replay_event")), + replay_video=cast(bytes | None, message_dict.get("replay_video")), + ) + + +@sentry_sdk.trace +def parse_headers(recording: bytes, replay_id: str) -> tuple[RecordingSegmentHeaders, bytes]: + try: + recording_headers_json, recording_segment = recording.split(b"\n", 1) + recording_headers = json.loads(recording_headers_json) + assert isinstance(recording_headers.get("segment_id"), int) + return recording_headers, recording_segment + except Exception: + logger.exception("Recording headers could not be extracted %s", replay_id) + raise DropSilently() + + +@sentry_sdk.trace +def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: + try: + decompressed_segment = zlib.decompress(segment) + return segment, decompressed_segment + except zlib.error: + if segment[0] == ord("["): + compressed_segment = zlib.compress(segment) + return compressed_segment, segment + else: + logger.exception("Invalid recording body.") + raise DropSilently() + + +@sentry_sdk.trace +def parse_segment_and_replay_data(segment: bytes, replay_event: bytes | None) -> tuple[Any, Any]: + parsed_segment_data = json.loads(segment) + parsed_replay_event = json.loads(replay_event) if replay_event else None + return parsed_segment_data, parsed_replay_event + + +@sentry_sdk.trace +def try_get_replay_actions( + message: RecordingIngestMessage, + parsed_segment_data: Any, + parsed_replay_event: Any | None, +) -> ReplayActionsEvent | None: + project = Project.objects.get_from_cache(id=message.project_id) + + return parse_replay_actions( + project=project, + replay_id=message.replay_id, + retention_days=message.retention_days, + segment_data=parsed_segment_data, + replay_event=parsed_replay_event, + org_id=message.org_id, + ) diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index f80eb37abac26b..27a1114fd98a9a 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -8,6 +8,8 @@ from hashlib import md5 from typing import Any, Literal, TypedDict +import sentry_sdk + from sentry import options from sentry.conf.types.kafka_definition import Topic from sentry.models.project import Project @@ -77,11 
+79,13 @@ def parse_and_emit_replay_actions( emit_replay_actions(message) +@sentry_sdk.trace def emit_replay_actions(action: ReplayActionsEvent) -> None: publisher = _initialize_publisher() publisher.publish("ingest-replay-events", json.dumps(action)) +@sentry_sdk.trace def parse_replay_actions( project: Project, replay_id: str, From 55cc3afae5878c1e5621ce074daad4cb6be7de47 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:15:06 -0600 Subject: [PATCH 04/49] Delete unused function --- src/sentry/replays/usecases/ingest/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index af07134025858e..699ac534bc79be 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -225,10 +225,6 @@ def should_skip_billing(org_id: int, is_replay_video: bool) -> bool: return is_replay_video and org_id in options.get("replay.replay-video.billing-skip-org-ids") -def replay_recording_segment_cache_id(project_id: int, replay_id: str, segment_id: str) -> str: - return f"{project_id}:{replay_id}:{segment_id}" - - def _report_size_metrics( size_compressed: int | None = None, size_uncompressed: int | None = None ) -> None: From a243bdbd4760c2939f08d5c15d322764288b7e9f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 08:16:37 -0600 Subject: [PATCH 05/49] Report size metrics --- src/sentry/replays/usecases/ingest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index af07134025858e..8d15bfecb56b30 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -115,6 +115,7 @@ def _ingest_recording(message_bytes: bytes) -> None: headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) compressed_segment, recording_segment = decompress_segment(compressed_segment) + _report_size_metrics(len(compressed_segment), len(recording_segment)) # Normalize ingest data into a standardized ingest format. 
segment_data = RecordingSegmentStorageMeta( From 890f17027047ea6243cda9a058f8c636a946906b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 10:13:36 -0600 Subject: [PATCH 06/49] Separate IO from processing --- .../replays/usecases/ingest/__init__.py | 173 ++++++++++++------ 1 file changed, 120 insertions(+), 53 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index e72b1a81e784ae..444f590cfb1412 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -108,6 +108,30 @@ def ingest_recording(message: bytes) -> None: @sentry_sdk.trace def _ingest_recording(message_bytes: bytes) -> None: """Ingest recording messages.""" + processed_recording = process_recording_message(message_bytes) + commit_recording_message(processed_recording) + track_recording_metadata(processed_recording) + + +@dataclasses.dataclass +class ProcessedRecordingMessage: + actions_event: ReplayActionsEvent | None + filedata: bytes + filename: str + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + recording_size_uncompressed: int + recording_size: int + replay_id: str + segment_id: int + video_size: int | None + + +@sentry_sdk.trace +def process_recording_message(message_bytes: bytes) -> ProcessedRecordingMessage: message = parse_recording_message(message_bytes) set_tag("org_id", message.org_id) @@ -115,58 +139,103 @@ def _ingest_recording(message_bytes: bytes) -> None: headers, compressed_segment = parse_headers(message.payload_with_headers, message.replay_id) compressed_segment, recording_segment = decompress_segment(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - # Normalize ingest data into a standardized ingest format. - segment_data = RecordingSegmentStorageMeta( + actions_event = parse_recording_actions( + message, + headers, + recording_segment, + message.replay_event, + ) + + filename = make_recording_filename( + RecordingSegmentStorageMeta( + project_id=message.project_id, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + retention_days=message.retention_days, + ) + ) + + if message.replay_video: + filedata = zlib.compress(pack(rrweb=recording_segment, video=message.replay_video)) + video_size = len(message.replay_video) + else: + filedata = compressed_segment + video_size = None + + return ProcessedRecordingMessage( + actions_event, + filedata, + filename, + is_replay_video=message.replay_video is not None, + key_id=message.key_id, + org_id=message.org_id, project_id=message.project_id, + received=message.received, + recording_size_uncompressed=len(recording_segment), + recording_size=len(compressed_segment), replay_id=message.replay_id, segment_id=headers["segment_id"], - retention_days=message.retention_days, + video_size=video_size, ) - if message.replay_video: + +@sentry_sdk.trace +def commit_recording_message(recording: ProcessedRecordingMessage) -> None: + # Write to GCS. + storage_kv.set(recording.filename, recording.filedata) + + # Write to billing consumer if its a billable event. + if recording.segment_id == 0: + track_initial_segment_event( + recording.org_id, + recording.project_id, + recording.replay_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) + + # Write to replay-event consumer. 
+ if recording.actions_event: + emit_replay_actions(recording.actions_event) + + +@sentry_sdk.trace +def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: + # Report size metrics to determine usage patterns. + _report_size_metrics( + size_compressed=recording.recording_size, + size_uncompressed=recording.recording_size_uncompressed, + ) + + if recording.video_size: # Logging org info for bigquery logger.info( "sentry.replays.slow_click", extra={ "event_type": "mobile_event", - "org_id": message.org_id, - "project_id": message.project_id, - "size": len(message.replay_video), + "org_id": recording.org_id, + "project_id": recording.project_id, + "size": len(recording.filedata), }, ) - # Record video size for COGS analysis. + # Track the number of replay-video events we receive. metrics.incr("replays.recording_consumer.replay_video_count") + + # Record video size for COGS analysis. metrics.distribution( "replays.recording_consumer.replay_video_size", - len(message.replay_video), + recording.video_size, unit="byte", ) - dat = zlib.compress(pack(rrweb=recording_segment, video=message.replay_video)) - storage_kv.set(make_recording_filename(segment_data), dat) - - # Track combined payload size. + # Track combined payload size for COGs analysis. metrics.distribution( - "replays.recording_consumer.replay_video_event_size", len(dat), unit="byte" - ) - else: - storage_kv.set(make_recording_filename(segment_data), compressed_segment) - - recording_post_processor(message, headers, recording_segment, message.replay_event) - - # The first segment records an accepted outcome. This is for billing purposes. Subsequent - # segments are not billed. - if headers["segment_id"] == 0: - track_initial_segment_event( - message.org_id, - message.project_id, - message.replay_id, - message.key_id, - message.received, - is_replay_video=message.replay_video is not None, + "replays.recording_consumer.replay_video_event_size", + len(recording.filedata), + unit="byte", ) @@ -239,28 +308,6 @@ def _report_size_metrics( ) -@sentry_sdk.trace -def recording_post_processor( - message: RecordingIngestMessage, - headers: RecordingSegmentHeaders, - segment_bytes: bytes, - replay_event_bytes: bytes | None, -) -> None: - try: - segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) - actions_event = try_get_replay_actions(message, segment, replay_event) - if actions_event: - emit_replay_actions(actions_event) - except Exception: - logging.exception( - "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", - message.org_id, - message.project_id, - message.replay_id, - headers["segment_id"], - ) - - @sentry_sdk.trace def parse_recording_message(message: bytes) -> RecordingIngestMessage: try: @@ -308,6 +355,26 @@ def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: raise DropSilently() +@sentry_sdk.trace +def parse_recording_actions( + message: RecordingIngestMessage, + headers: RecordingSegmentHeaders, + segment_bytes: bytes, + replay_event_bytes: bytes | None, +) -> ReplayActionsEvent | None: + try: + segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) + return try_get_replay_actions(message, segment, replay_event) + except Exception: + logging.exception( + "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", + message.org_id, + message.project_id, + message.replay_id, + headers["segment_id"], + ) + + @sentry_sdk.trace def parse_segment_and_replay_data(segment: bytes, replay_event: bytes | None) -> 
tuple[Any, Any]: parsed_segment_data = json.loads(segment) From 2dcfaecfe254716edc196c42005734ca934966ca Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 10:53:20 -0600 Subject: [PATCH 07/49] Add explicit return --- src/sentry/replays/usecases/ingest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 444f590cfb1412..ab3ab8901c18b5 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -373,6 +373,7 @@ def parse_recording_actions( message.replay_id, headers["segment_id"], ) + return None @sentry_sdk.trace From db5fa11ba714e6b64d23c603a3127779102d7264 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Wed, 19 Feb 2025 15:42:56 -0600 Subject: [PATCH 08/49] Add buffer managers --- .../consumers/buffered/buffer_managers.py | 144 +++++++++ .../replays/consumers/buffered/consumer.py | 293 ++++-------------- src/sentry/replays/consumers/buffered/lib.py | 4 +- .../replays/usecases/ingest/__init__.py | 44 ++- 4 files changed, 241 insertions(+), 244 deletions(-) create mode 100644 src/sentry/replays/consumers/buffered/buffer_managers.py diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py new file mode 100644 index 00000000000000..c3504a278107a8 --- /dev/null +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -0,0 +1,144 @@ +import dataclasses +import uuid +from concurrent.futures import ThreadPoolExecutor + +import sentry_sdk + +from sentry.replays.lib.storage import storage_kv +from sentry.replays.usecases.ingest import ( + ProcessedRecordingMessage, + commit_recording_message, + emit_replay_actions, + track_initial_segment_event, + track_recording_metadata, +) +from sentry.replays.usecases.ingest.dom_index import _initialize_publisher + + +@dataclasses.dataclass(frozen=True) +class FilePartRow: + key: str + range_start: int + range_stop: int + + +@dataclasses.dataclass(frozen=True) +class MergedBuffer: + buffer: bytes + buffer_rows: list[FilePartRow] + remote_key: str + + +class BatchedBufferManager: + """Batched buffer manager. + + The batched buffer manager takes a list of messages and merges them into a single file before + committing them to permanent storage. We store filename and byte locations in a PostgreSQL + table. + """ + + def commit(self, messages: list[ProcessedRecordingMessage]) -> None: + merged_buffer = self.merge_buffer(messages) + self.commit_merged_buffer(merged_buffer) + self.bulk_track_initial_segment_events(messages) + self.bulk_emit_action_events(messages) + self.bulk_track_recording_metadata(messages) + + @sentry_sdk.trace + def merge_buffer(self, buffer: list[ProcessedRecordingMessage]) -> MergedBuffer: + def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: + range_start = len(parts) + range_stop = range_start + len(part) - 1 + return (parts + part, FilePartRow(key, range_start, range_stop)) + + parts = b"" + parts_rows = [] + remote_key = uuid.uuid4().hex + + for item in buffer: + # The recording data is appended to the buffer and a row for tracking is + # returned. 
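+            # For example, a 10-byte recording appended to an empty buffer is tracked as the
+            # inclusive byte range [0, 9]; the next part then begins at offset 10.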
+            parts, part_row = _append_part(parts, item.filedata, key=item.filename)
+            parts_rows.append(part_row)
+
+        return MergedBuffer(
+            buffer=parts,
+            buffer_rows=parts_rows,
+            remote_key=remote_key,
+        )
+
+    @sentry_sdk.trace
+    def commit_merged_buffer(self, buffer: MergedBuffer) -> None:
+        if buffer.buffer != b"":
+            storage_kv.set(buffer.remote_key, buffer.buffer)
+            # TODO: bulk insert rows
+
+    @sentry_sdk.trace
+    def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None:
+        # Each item in the buffer needs to record a billing outcome. Not every segment is billed,
+        # but we defer that decision to the `track_initial_segment_event` function.
+        for item in items:
+            track_initial_segment_event(
+                item.org_id,
+                item.project_id,
+                item.replay_id,
+                item.segment_id,
+                item.key_id,
+                item.received,
+                item.is_replay_video,
+            )
+
+    @sentry_sdk.trace
+    def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> None:
+        # The action events need to be emitted to Snuba. We do it asynchronously so it's fast. The
+        # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed
+        # before we can consider this buffer fully flushed and commit our local offsets.
+        for item in items:
+            if item.actions_event:
+                emit_replay_actions(item.actions_event)
+
+        # Ensure the replay-actions were committed to the broker before we commit this batch.
+        publisher = _initialize_publisher()
+        publisher.flush()
+
+    @sentry_sdk.trace
+    def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None:
+        # Each item in the buffer needs to record metrics about itself.
+        for item in items:
+            track_recording_metadata(item)
+
+
+class ThreadedBufferManager:
+    """Threaded buffer manager.
+
+    The threaded buffer manager uploads files the same way we originally did, except in an
+    application-managed thread pool. We iterate over each processed recording, commit it in a
+    thread, and finally return None when all the tasks complete. It requires no changes elsewhere
+    in the code to accommodate.
+
+    The goal of this class is to _not_ be clever. We want to imitate current production behavior
+    as closely as possible. The reason is that a consumer refactor is a difficult task to feature
+    flag and we want to gradually roll out the behavior over time after assuring ourselves that
+    there are no defects in the `BatchedBufferManager`.
+    """
+
+    # Wrapped in staticmethod so `self.fn` resolves to the plain function rather than a bound
+    # method (which would otherwise receive the instance as its first argument).
+    fn = staticmethod(commit_recording_message)
+    max_workers = 100
+
+    def commit(self, messages: list[ProcessedRecordingMessage]) -> None:
+        # Use as many workers as necessary up to a limit. We don't want to start thousands of
+        # worker threads.
+        max_workers = min(len(messages), self.max_workers)
+
+        # We apply whatever function is defined on the class to each message in the list. This is
+        # useful for testing reasons (dependency injection).
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            pool.map(self.fn, messages)
+
+        # Recording metadata is not tracked in the threadpool. This is because this function will
+        # log. Logging will acquire a lock and make our threading less useful due to the speed of
+        # the I/O we do in this step.
+ for message in messages: + track_recording_metadata(message) + + return None diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 103864ef99089c..e692156af5ac6e 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,262 +1,93 @@ -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime - -# RunTime implementation. - - -def init(flags: dict[str, str]) -> Model["Item"]: - buffer = Buffer(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - - -def process_message(message: bytes) -> "Item" | None: - return _process_message(message) - - -recording_consumer = buffering_runtime( - init_fn=init, - process_fn=process_message, -) - - -# Business-logic implementation. -import dataclasses -import logging import time -import uuid -import zlib -from typing import cast import sentry_sdk from django.conf import settings -from sentry_kafka_schemas.codecs import Codec, ValidationError -from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry.conf.types.kafka_definition import Topic, get_topic_codec -from sentry.replays.lib.storage import ( - RecordingSegmentStorageMeta, - make_recording_filename, - make_video_filename, - storage_kv, +from sentry.replays.consumers.buffered.buffer_managers import ( + BatchedBufferManager, + ThreadedBufferManager, ) +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( - RecordingIngestMessage, - _report_size_metrics, - process_headers, - recording_post_processor, - track_initial_segment_event, + DropSilently, + ProcessedRecordingMessage, + process_recording_message, ) -from sentry.replays.usecases.ingest.dom_index import ReplayActionsEvent, _initialize_publisher -from sentry.utils import json - -logger = logging.getLogger(__name__) - -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) - - -@dataclasses.dataclass(frozen=True) -class EventMetadata: - is_replay_video: bool - key_id: int | None - org_id: int - project_id: int - received: int - replay_id: str - retention_days: int - segment_id: int - -@dataclasses.dataclass(frozen=True) -class Item: - action_events: list[ReplayActionsEvent] - key_name: str - metadata: EventMetadata - recording: bytes - video: bytes | None +class BufferManager: + """Buffer manager. -@dataclasses.dataclass(frozen=True) -class FilePartRow: - key: str - range_start: int - range_stop: int + Determines if and when a buffer should flush. The buffer accepts only one argument (`flags`) + which can contain any number of configuration options. Currently we only care about the time + the buffer has been active and the length of the buffer. But we could update this function to + extract an option thats concerned with the bytesize of the buffer if we thought that was a + useful metric for committing. - -@dataclasses.dataclass(frozen=True) -class MergedBuffer: - action_events: list[ReplayActionsEvent] - billed_events: list[EventMetadata] - buffer: bytes - buffer_rows: list[FilePartRow] - remote_key: str - - -class Buffer: + The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its + methods as callbacks to the Model. The state contained within the method's instance is implicit + and unknown to the RunTime. 
We could model this state inside the RunTime but the state is + simple enough that I don't feel the need to over-engineer the buffering RunTime. For more + complex use-cases you would want to formalize state transformations in the RunTime. Especially + if you wanted to expose the state across more locations in the application. + """ def __init__(self, flags: dict[str, str]) -> None: - self.max_buffer_length = int(flags["max_buffer_length"]) - self.max_buffer_wait = int(flags["max_buffer_wait"]) - self.__last_flushed_at = time.time() - - def can_flush(self) -> bool: - return (time.time() - self.max_buffer_wait) >= self.__last_flushed_at - - def do_flush(self, model: Model[Item]) -> None: - merged_buffer = _merge_buffer(model.buffer) - _commit_merged_buffer(merged_buffer) + self.__max_buffer_length = int(flags["max_buffer_length"]) + self.__max_buffer_wait = int(flags["max_buffer_wait"]) self.__last_flushed_at = time.time() - -def _merge_buffer(buffer: list[Item]) -> MergedBuffer: - action_events = [] - billed_events: list[EventMetadata] = [] - parts = b"" - parts_rows = [] - remote_key = uuid.uuid4().hex - - for item in buffer: - # Extend the action events with whatever exists on the buffer item. - action_events.extend(item.action_events) - - # Segment 0 events which are not replay-video are billed. - if item.metadata.segment_id == 0 and not item.metadata.is_replay_video: - billed_events.append(item.metadata) - - # Append the recording and video if applicable to the zipped file. - meta = RecordingSegmentStorageMeta( - project_id=item.metadata.project_id, - replay_id=item.metadata.replay_id, - segment_id=item.metadata.segment_id, - retention_days=item.metadata.retention_days, + def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: + return ( + len(model.buffer) >= self.__max_buffer_length + or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) - parts, part_row = _append_part(parts, item.recording, key=make_recording_filename(meta)) - parts_rows.append(part_row) - - if item.video: - parts, part_row = _append_part(parts, item.video, key=make_video_filename(meta)) - parts_rows.append(part_row) - - return MergedBuffer( - action_events=action_events, - billed_events=billed_events, - buffer=parts, - buffer_rows=parts_rows, - remote_key=remote_key, - ) - - -def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: - range_start = len(parts) - range_stop = range_start + len(part) - 1 - return (parts + part, FilePartRow(key, range_start, range_stop)) - - -def _commit_merged_buffer(buffer: MergedBuffer) -> None: - # Empty buffer's are not committed. - if buffer.buffer == b"": - return None - - # Upload recording. - storage_kv.set(buffer.remote_key, buffer.buffer) - - # Write rows. - # TODO: bulk insert rows - - # Emit billing. - for event in buffer.billed_events: - track_initial_segment_event( - event.org_id, - event.project_id, - event.replay_id, - event.key_id, - event.received, - event.is_replay_video, - ) + def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: + # During the transitionary period most organizations will commit following a traditional + # commit pattern... + threaded = list(filter(lambda i: i.org_id != 1, model.buffer)) + threaded_buffer = ThreadedBufferManager() + threaded_buffer.commit(threaded) - # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The - # Kafka publisher is asynchronous. 
We need to flush to make sure the data is fully committed - # before we can consider this buffer fully flushed and commit our local offsets. - publisher = _initialize_publisher(asynchronous=True) - for event in buffer.action_events: - publisher.publish("ingest-replay-events", json.dumps(event)) - publisher.flush() + # ...But others will be opted in to the batched version of uploading. This will give us a + # chance to test the feature with as minimal risk as possible. + batched = list(filter(lambda i: i.org_id == 1, model.buffer)) + batched_buffer = BatchedBufferManager() + batched_buffer.commit(batched) - return None + # Update the buffer manager with the new time so we don't continuously commit in a loop! + self.__last_flushed_at = time.time() -def _process_message(message: bytes) -> Item | None: - transaction = sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) - }, - ) +def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: + buffer = BufferManager(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - with transaction.start_child( - op="replays.consumers.buffered.process_message", name="ingest_recording" - ): - # set_tag("org_id", message.org_id) - # set_tag("project_id", message.project_id) - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message.payload.value) - except ValidationError: - logger.exception("Could not decode recording message.") - return None +def process_message(message: bytes) -> ProcessedRecordingMessage | None: + isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - message = RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), + with sentry_sdk.scope.use_isolation_scope(isolation_scope): + transaction = sentry_sdk.start_transaction( + name="replays.consumer.process_recording", + op="replays.consumer", + custom_sampling_context={ + "sample_rate": getattr( + settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 + ) + }, ) try: - headers, compressed_segment = process_headers(message.payload_with_headers) - except Exception: - logger.exception("Recording headers could not be extracted %s", message.replay_id) + return process_recording_message(message) + except DropSilently: return None + finally: + transaction.finish() - # Segment is decompressed for further analysis. Packed format expects - # concatenated, uncompressed bytes. 
- try: - recording_segment = zlib.decompress(compressed_segment) - _report_size_metrics(len(compressed_segment), len(recording_segment)) - except zlib.error: - if compressed_segment[0] == ord("["): - recording_segment = compressed_segment - compressed_segment = zlib.compress(compressed_segment) # Save storage $$$ - else: - logger.exception("Invalid recording body.") - return None - - recording_post_processor( - message, headers, recording_segment, message.replay_event, transaction - ) - -@sentry_sdk.trace -def _decode_recording_message(message: bytes) -> RecordingIngestMessage | None: - try: - message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) - except ValidationError: - logger.exception("Could not decode recording message.") - return None - - return RecordingIngestMessage( - replay_id=message_dict["replay_id"], - key_id=message_dict.get("key_id"), - org_id=message_dict["org_id"], - project_id=message_dict["project_id"], - received=message_dict["received"], - retention_days=message_dict["retention_days"], - payload_with_headers=cast(bytes, message_dict["payload"]), - replay_event=cast(bytes | None, message_dict.get("replay_event")), - replay_video=cast(bytes | None, message_dict.get("replay_video")), - ) +recording_consumer = buffering_runtime( + init_fn=init, + process_fn=process_message, +) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index c6e52a7b29979b..da21bbe041ebd4 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -16,8 +16,8 @@ @dataclass class Model(Generic[Item]): buffer: list[Item] - can_flush: Callable[[], bool] - do_flush: Callable[[], None] + can_flush: Callable[["Model[Item]"], bool] + do_flush: Callable[["Model[Item]"], None] offsets: MutableMapping[Partition, int] diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index ab3ab8901c18b5..250fbbee3860be 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -7,6 +7,7 @@ from typing import Any, TypedDict, cast import sentry_sdk +import sentry_sdk.consts import sentry_sdk.scope from django.conf import settings from sentry_kafka_schemas.codecs import Codec, ValidationError @@ -186,15 +187,15 @@ def commit_recording_message(recording: ProcessedRecordingMessage) -> None: storage_kv.set(recording.filename, recording.filedata) # Write to billing consumer if its a billable event. - if recording.segment_id == 0: - track_initial_segment_event( - recording.org_id, - recording.project_id, - recording.replay_id, - recording.key_id, - recording.received, - is_replay_video=recording.is_replay_video, - ) + track_initial_segment_event( + recording.org_id, + recording.project_id, + recording.replay_id, + recording.segment_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) # Write to replay-event consumer. if recording.actions_event: @@ -239,11 +240,32 @@ def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: ) -@sentry_sdk.trace def track_initial_segment_event( org_id: int, project_id: int, - replay_id, + replay_id: str, + segment_id: int, + key_id: int | None, + received: int, + is_replay_video: bool, +) -> None: + # Only segment 0 is billed. 
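+    # Segments after the first are skipped entirely here; no outcome is emitted for them.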
+ if segment_id == 0: + _track_initial_segment_event( + org_id, + project_id, + replay_id, + key_id, + received, + is_replay_video, + ) + + +@sentry_sdk.trace +def _track_initial_segment_event( + org_id: int, + project_id: int, + replay_id: str, key_id: int | None, received: int, is_replay_video: bool, From 7a5a98c6d455da761f7c0a4a854fa70864954bcd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 12:58:49 -0600 Subject: [PATCH 09/49] Write FilePart rows and adopt a new subscription model --- src/sentry/options/defaults.py | 6 ++ .../consumers/buffered/buffer_managers.py | 55 ++++++++--- .../replays/consumers/buffered/consumer.py | 26 +++-- src/sentry/replays/consumers/buffered/lib.py | 61 ++++++++---- .../replays/consumers/buffered/platform.py | 94 ++++++++++++++++--- src/sentry/replays/models.py | 15 +++ 6 files changed, 205 insertions(+), 52 deletions(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 4796fa16a0f7ad..ba016021ccf80a 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -456,6 +456,12 @@ default=None, flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, ) +register( + "replay.consumer.use-file-batching", + type=Sequence, + default=[], + flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, +) # Globally disables replay-video. register( "replay.replay-video.disabled", diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py index c3504a278107a8..c017b7ebb40628 100644 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -1,18 +1,18 @@ import dataclasses import uuid -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk from sentry.replays.lib.storage import storage_kv +from sentry.replays.models import FilePart from sentry.replays.usecases.ingest import ( ProcessedRecordingMessage, commit_recording_message, - emit_replay_actions, track_initial_segment_event, track_recording_metadata, ) -from sentry.replays.usecases.ingest.dom_index import _initialize_publisher +from sentry.replays.usecases.ingest.dom_index import _initialize_publisher, emit_replay_actions @dataclasses.dataclass(frozen=True) @@ -26,7 +26,7 @@ class FilePartRow: class MergedBuffer: buffer: bytes buffer_rows: list[FilePartRow] - remote_key: str + filename: str class BatchedBufferManager: @@ -37,6 +37,7 @@ class BatchedBufferManager: table. """ + @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: merged_buffer = self.merge_buffer(messages) self.commit_merged_buffer(merged_buffer) @@ -53,7 +54,6 @@ def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRo parts = b"" parts_rows = [] - remote_key = uuid.uuid4().hex for item in buffer: # The recording data is appended to the buffer and a row for tracking is @@ -64,14 +64,28 @@ def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRo return MergedBuffer( buffer=parts, buffer_rows=parts_rows, - remote_key=remote_key, + filename=f"90/{uuid.uuid4().hex}", ) @sentry_sdk.trace def commit_merged_buffer(self, buffer: MergedBuffer) -> None: - if buffer.buffer != b"": - storage_kv.set(buffer.remote_key, buffer.buffer) - # TODO: bulk insert rows + if buffer.buffer == b"": + return None + + # Storage goes first. Headers go second. 
If we were to store the headers first we could + # expose a record that does not exist to the user which would ultimately 404. + storage_kv.set(buffer.filename, buffer.buffer) + + # Bulk insert the filepart. + FilePart.objects.bulk_create( + FilePart( + filename=buffer.filename, + key=row.key, + range_start=row.range_start, + range_stop=row.range_stop, + ) + for row in buffer.buffer_rows + ) @sentry_sdk.trace def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None: @@ -102,7 +116,7 @@ def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> Non publisher.flush() @sentry_sdk.trace - def bulk_track_recording_metadata(items: list[ProcessedRecordingMessage]) -> None: + def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None: # Each item in the buffer needs to record metrics about itself. for item in items: track_recording_metadata(item) @@ -122,18 +136,29 @@ class ThreadedBufferManager: ourselves that there are no defects in the `BatchedBufferManager`. """ - fn = commit_recording_message - max_workers = 100 + def __init__(self) -> None: + self.fn = commit_recording_message + self.max_workers = 100 + @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: # Use as many workers as necessary up to a limit. We don't want to start thousands of # worker threads. max_workers = min(len(messages), self.max_workers) - # We apply whatever function is defined on the class to each message in the list. This is - # useful for testing reasons (dependency injection). with ThreadPoolExecutor(max_workers=max_workers) as pool: - pool.map(self.fn, messages) + # We apply whatever function is defined on the class to each message in the list. This + # is useful for testing reasons (dependency injection). + futures = [pool.submit(self.fn, message) for message in messages] + + # Tasks can fail. We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + if exc is not None: + raise exc # Recording metadata is not tracked in the threadpool. This is because this function will # log. Logging will acquire a lock and make our threading less useful due to the speed of diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index e692156af5ac6e..7b2c738fae5fe4 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -3,6 +3,7 @@ import sentry_sdk from django.conf import settings +from sentry import options from sentry.replays.consumers.buffered.buffer_managers import ( BatchedBufferManager, ThreadedBufferManager, @@ -43,16 +44,21 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) + @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: # During the transitionary period most organizations will commit following a traditional # commit pattern... - threaded = list(filter(lambda i: i.org_id != 1, model.buffer)) + # + # TODO: Would be nice to cache this value. 
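+        # The option holds a sequence of org ids; it is re-read on every flush (hence the TODO
+        # above) and cast to a set for fast membership checks.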
+ org_ids = set(options.get("replay.consumer.use-file-batching")) + + threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) threaded_buffer = ThreadedBufferManager() threaded_buffer.commit(threaded) # ...But others will be opted in to the batched version of uploading. This will give us a # chance to test the feature with as minimal risk as possible. - batched = list(filter(lambda i: i.org_id == 1, model.buffer)) + batched = list(filter(lambda i: i.org_id in org_ids, model.buffer)) batched_buffer = BatchedBufferManager() batched_buffer.commit(batched) @@ -60,12 +66,12 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: self.__last_flushed_at = time.time() -def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: - buffer = BufferManager(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) - - def process_message(message: bytes) -> ProcessedRecordingMessage | None: + """Message processing function. + + Accepts an unstructured type and returns a structured one. Other than tracing the goal is to + have no I/O here. We'll commit the I/O on flush. + """ isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() with sentry_sdk.scope.use_isolation_scope(isolation_scope): @@ -87,6 +93,12 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: transaction.finish() +def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: + """Return the initial state of the application.""" + buffer = BufferManager(flags) + return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) + + recording_consumer = buffering_runtime( init_fn=init, process_fn=process_message, diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index da21bbe041ebd4..ff89587a54ce7b 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -7,7 +7,16 @@ from arroyo.types import Partition -from sentry.replays.consumers.buffered.platform import Cmd, Commit, Nothing, RunTime, Task +from sentry.replays.consumers.buffered.platform import ( + Cmd, + Commit, + Join, + Nothing, + Poll, + RunTime, + Sub, + Task, +) Item = TypeVar("Item") T = TypeVar("T") @@ -21,24 +30,36 @@ class Model(Generic[Item]): offsets: MutableMapping[Partition, int] -class Msg(Generic[Item]): - pass - - @dataclass(frozen=True) -class Append(Msg[Item]): +class Append(Generic[Item]): + """Append the item to the buffer and update the offsets.""" + item: Item offset: MutableMapping[Partition, int] -class Committed(Msg[Item]): +class Committed: + """The platform committed offsets. Our buffer is now completely done.""" + pass -class Flush(Msg[Item]): +class Flush: + """Our application hit the flush threshold and has been instructed to flush.""" + + pass + + +class Polled: + """Our application was polled by the platform.""" + pass +# A "Msg" is the union of all application messages our RunTime will accept. 
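+# The update function pattern-matches over these variants to decide the next command.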
+Msg = Append[Item] | Committed | Flush | Polled + + def process( process_fn: Callable[[bytes], Item | None], model: Model[Item], @@ -48,6 +69,8 @@ def process( item = process_fn(message) if item: return Append(item=item, offset=offset) + else: + return None def init( @@ -62,7 +85,7 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite case Append(item=item, offset=offset): model.buffer.append(item) model.offsets.update(offset) - if model.can_flush(): + if model.can_flush(model): return (model, Task(msg=Flush())) else: return (model, None) @@ -72,21 +95,23 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite # and provide error handling facilities or we could accept that this problem gets too # complicated to reasonably abstract and have developers implement their own buffering # consumer. - model.do_flush() + model.do_flush(model) model.buffer = [] return (model, Commit(msg=Committed(), offsets=model.offsets)) case Committed(): return (model, None) - - # Satisfy mypy. Apparently we don't do exhaustiveness checks in our configuration. - return (model, None) + case Polled(): + if model.can_flush(model): + return (model, Task(msg=Flush())) + else: + return (model, None) -def subscription(model: Model[Item]) -> Msg[Item] | None: - if model.can_flush(): - return Flush() - else: - return None +def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: + return [ + Join(msg=Flush()), + Poll(msg=Polled()), + ] def buffering_runtime( diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index a1c93dd9156199..252377c3f3b2a2 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -52,6 +52,7 @@ def submit(self, message: Message[KafkaPayload]) -> None: def poll(self) -> None: assert not self.__closed + self.__runtime.publish("poll") self.__commit_step.poll() def close(self) -> None: @@ -63,6 +64,8 @@ def terminate(self) -> None: self.__commit_step.terminate() def join(self, timeout: float | None = None) -> None: + self.__runtime.publish("join") + self.__commit_step.close() self.__commit_step.join(timeout) @@ -71,37 +74,81 @@ def join(self, timeout: float | None = None) -> None: Msg = TypeVar("Msg") -class Cmd(Generic[Msg]): - pass +class BackPressure: + """Instructs the RunTime to back-pressure the platform. + Does not accept a `msg` argument as we do not expect the platform to fail. + """ -@dataclass(frozen=True) -class BackPressure(Cmd[Msg]): pass @dataclass(frozen=True) -class Commit(Cmd[Msg]): +class Commit(Generic[Msg]): + """Instructs the RunTime to commit the platform. + + Because the RunTime is based on a Kafka platform there is an expectation that the application + should trigger commits when necessary. While the application can't be sure its running Kafka + it can be sure its running on a Platform that requires offsets be committed. + """ + msg: Msg offsets: MutableMapping[Partition, int] -class Nothing(Cmd[Msg]): +class Nothing: + """Instructs the RunTime to do nothing. Equivalent to a null command.""" + pass @dataclass(frozen=True) -class Task(Cmd[Msg]): +class Task(Generic[Msg]): + """Instructs the RunTime to emit an application message back to the application.""" + msg: Msg +# A "Cmd" is the union of all the commands an application can issue to the RunTime. +Cmd = BackPressure | Commit[Msg] | Nothing | Task[Msg] + + +@dataclass(frozen=True) +class Join(Generic[Msg]): + """Join subscription class. 
+ + The platform may need to quit. When this happens the RunTime needs to know. The application + may or may not need to know. The Join subscription allows aplications to subscribe to join + events and handle them in the way they see fit. + """ + + msg: Msg + name = "join" + + +@dataclass(frozen=True) +class Poll(Generic[Msg]): + """Poll subscription class. + + The platform will periodically poll the RunTime. The application may or may not subscribe to + these events and choose to act on them. + """ + + msg: Msg + name = "poll" + + +# A "Sub" is the union of all the events an application can subscribe to. +Sub = Join[Msg] | Poll[Msg] + + class RunTime(Generic[Model, Msg]): def __init__( self, init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], - subscription: Callable[[Model], Msg | None], + subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: self.init = init @@ -111,6 +158,7 @@ def __init__( self.__commit: CommitProtocol | None = None self.__model: Model | None = None + self.__subscriptions: dict[str, Sub[Msg]] = {} @property def commit(self) -> CommitProtocol: @@ -122,19 +170,37 @@ def model(self) -> Model: assert self.__model is not None return self.__model - def poll(self) -> None: - self.__handle_msg(self.subscription(self.model)) - def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: self.__commit = commit model, cmd = self.init(flags) self.__model = model self.__handle_cmd(cmd) + self.__register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) + def publish(self, sub_name: str) -> None: + # For each new subscription event we re-register the subscribers in case anything within + # the application has changed. I.e. the model is in some new state and that means we care + # about a new subscription or don't care about an old one. + self.__register_subscriptions() + + # Using the subscription's name look for the subscription in the registry. + sub = self.__subscriptions.get(sub_name) + if sub is None: + return None + + # Match of the various subscription types and emit a mesasge to the RunTime. Right now + # there's no need to match. The name disambiguates enough already but in the future more + # subscriptions might do more complex things. + match sub: + case Join(msg=msg): + return self.__handle_msg(msg) + case Poll(msg=msg): + return self.__handle_msg(msg) + def __handle_msg(self, msg: Msg | None) -> None: if msg: model, cmd = self.update(self.model, msg) @@ -148,7 +214,7 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: match cmd: case BackPressure(): # TODO - ... 
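+                # Back-pressure is not implemented yet; the command is accepted and ignored.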
+ return None case Commit(msg=msg, offsets=offsets): self.commit(offsets) return self.__handle_msg(msg) @@ -156,3 +222,7 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: return None case Task(msg=msg): return self.__handle_msg(msg) + + def __register_subscriptions(self) -> None: + for sub in self.subscription(self.model): + self.__subscriptions[sub.name] = sub diff --git a/src/sentry/replays/models.py b/src/sentry/replays/models.py index 1d4099d0155728..b20cb45efe928e 100644 --- a/src/sentry/replays/models.py +++ b/src/sentry/replays/models.py @@ -46,3 +46,18 @@ def delete(self, *args, **kwargs): rv = super().delete(*args, **kwargs) return rv + + +@region_silo_model +class FilePart(Model): + __relocation_scope__ = RelocationScope.Excluded + + date_added = models.DateTimeField(default=timezone.now, null=False) + filename = models.CharField(null=False) + key = models.CharField(db_index=True, null=False) + range_start = models.IntegerField(null=False) + range_stop = models.IntegerField(null=False) + + class Meta: + app_label = "replays" + db_table = "replays_filepart" From 44c899bb70218382a85837002685c7f4bf4b8f12 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 14:52:15 -0600 Subject: [PATCH 10/49] Add unit tests --- .../sentry/replays/unit/consumers/__init__.py | 0 .../unit/consumers/test_buffering_runtime.py | 108 ++++++++++++++++++ .../replays/unit/consumers/test_helpers.py | 27 +++++ .../replays/unit/consumers/test_recording.py | 33 ++++++ .../replays/unit/consumers/test_runtime.py | 84 ++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 tests/sentry/replays/unit/consumers/__init__.py create mode 100644 tests/sentry/replays/unit/consumers/test_buffering_runtime.py create mode 100644 tests/sentry/replays/unit/consumers/test_helpers.py create mode 100644 tests/sentry/replays/unit/consumers/test_recording.py create mode 100644 tests/sentry/replays/unit/consumers/test_runtime.py diff --git a/tests/sentry/replays/unit/consumers/__init__.py b/tests/sentry/replays/unit/consumers/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py new file mode 100644 index 00000000000000..a0c5f267991604 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -0,0 +1,108 @@ +from arroyo.types import Partition, Topic + +from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +from tests.sentry.replays.unit.consumers.helpers import MockCommit, MockSink, make_kafka_message + + +def buffer_runtime(sink): + return buffering_runtime( + init_fn=lambda _: Model( + buffer=[], + can_flush=lambda m: len(m.buffer) == 5, + do_flush=lambda m: sink.accept(m.buffer), + offsets={}, + ), + process_fn=lambda m: int(m) if int(m) < 5 else None, + ) + + +def test_runtime_setup(): + runtime = buffer_runtime(MockSink()) + runtime.setup({}, commit=MockCommit()) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + + +def test_buffering_runtime_submit(): + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert three valid messages buffered. 
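+    # The process_fn parses values below five, so these grow the buffer without triggering a flush.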
+ runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert two invalid messages not buffered. + runtime.submit(make_kafka_message(b"5")) + runtime.submit(make_kafka_message(b"5")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert an additional message is buffered. + runtime.submit(make_kafka_message(b"2")) + assert runtime.model.buffer == [1, 1, 1, 2] + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert the buffer is flushed to the sink. + runtime.submit(make_kafka_message(b"3")) + assert runtime.model.buffer == [] + assert sink.accepted == [1, 1, 1, 2, 3] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + +def test_buffering_runtime_publish(): + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + + # Assert three valid messages buffered. + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1")) + assert runtime.model.buffer == [1, 1, 1] + assert sink.accepted == [] + + # Assert join subscriptions eagerly flush the buffer. + runtime.publish("join") + assert runtime.model.buffer == [] + assert sink.accepted == [1, 1, 1] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + # Reset the mocks. + mock_commit.commit = {} + sink.accepted = [] + + # Assert poll does not automatically empty the buffer. + runtime.submit(make_kafka_message(b"1")) + runtime.publish("poll") + assert runtime.model.buffer == [1] + assert sink.accepted == [] + + # Dirty mutation warning! We're forcing ourselves to be in a flushable state by mutating the + # state outside the explicit state machine interfaces. + runtime.model.buffer = [1, 2, 3, 4, 5] + assert runtime.model.buffer == [1, 2, 3, 4, 5] + assert sink.accepted == [] + + # Assert poll flushes if the buffer is ready. 
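+    # The forced five-item buffer satisfies can_flush, so the poll event triggers a flush.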
+ runtime.publish("poll") + assert runtime.model.buffer == [] + assert sink.accepted == [1, 2, 3, 4, 5] + assert mock_commit.commit == {Partition(Topic("a"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py new file mode 100644 index 00000000000000..1f4eecbc060674 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -0,0 +1,27 @@ +from collections.abc import Mapping +from datetime import datetime + +from arroyo.backends.kafka import KafkaPayload +from arroyo.types import BrokerValue, Message, Partition, Topic + + +class MockCommit: + def __init__(self): + self.commit = {} + + def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> None: + self.commit.update(offsets) + + +class MockSink: + def __init__(self): + self.accepted = [] + + def accept(self, buffer): + self.accepted.extend(buffer) + + +def make_kafka_message(message: bytes): + return Message( + BrokerValue(KafkaPayload(b"k", message, []), Partition(Topic("a"), 1), 1, datetime.now()) + ) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py new file mode 100644 index 00000000000000..68f39fa70bf917 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -0,0 +1,33 @@ +import time +import uuid + +import msgpack + +from sentry.replays.consumers.buffered.consumer import process_message + +# from sentry.replays.usecases.ingest import ProcessedRecordingMessage + + +def test_process_message_invalid(): + result = process_message(msgpack.packb(b"hello, world!")) + assert result is None + + +def test_process_message(): + result = process_message( + msgpack.packb( + { + "type": "replay_recording_not_chunked", + "replay_id": uuid.uuid4().hex, + "org_id": 1, + "key_id": 3, + "project_id": 2, + "received": int(time.time()), + "retention_days": 30, + "payload": b'{"segment_id":0}\n[]', + "replay_event": None, + "replay_video": None, + } + ) + ) + assert result is None diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py new file mode 100644 index 00000000000000..07c26ae0e4e7f7 --- /dev/null +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -0,0 +1,84 @@ +from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime +from tests.sentry.replays.unit.consumers.helpers import MockCommit, make_kafka_message + + +def counter_runtime() -> RunTime[int, str]: + def init(_): + return (22, None) + + def process(_model, message, _offsets): + if message == b"incr": + return "incr" + elif message == b"decr": + return "decr" + else: + return None + + def update(model, msg): + if msg == "incr": + return (model + 1, None) + elif msg == "decr": + return (model - 1, None) + elif msg == "join": + return (-10, None) + elif msg == "poll": + return (99, None) + else: + raise ValueError("Unknown msg") + + def subscription(_): + return [ + Join(msg="join"), + Poll(msg="poll"), + ] + + return RunTime( + init=init, + process=process, + update=update, + subscription=subscription, + ) + + +def test_runtime_setup(): + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + +def test_runtime_submit(): + # RunTime defaults to a start point of 22. + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + # Two incr commands increase the count by 2. 
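+    # Each submitted message is decoded by process() into a msg and applied by update().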
+ runtime.submit(make_kafka_message(b"incr")) + runtime.submit(make_kafka_message(b"incr")) + assert runtime.model == 24 + + # Four decr commands decrease the count by 4. + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + runtime.submit(make_kafka_message(b"decr")) + assert runtime.model == 20 + + # Messages which the application does not understand do nothing to the model. + runtime.submit(make_kafka_message(b"other")) + assert runtime.model == 20 + + +def test_runtime_publish(): + # RunTime defaults to a start point of 22. + runtime = counter_runtime() + runtime.setup({}, commit=MockCommit()) + assert runtime.model == 22 + + # A join event updates the model and sets it to -10. + runtime.publish("join") + assert runtime.model == -10 + + # A poll event updates the model and sets it to 99. + runtime.publish("poll") + assert runtime.model == 99 From de76ad0c585518df38599bd6b583e94c628f920b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 14:52:49 -0600 Subject: [PATCH 11/49] Add contextual errors --- .../replays/consumers/buffered/consumer.py | 2 +- src/sentry/replays/usecases/ingest/__init__.py | 18 +++++++++++++++--- .../unit/consumers/test_buffering_runtime.py | 6 +++++- .../replays/unit/consumers/test_runtime.py | 2 +- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 7b2c738fae5fe4..d8709b4ef354a5 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -99,7 +99,7 @@ def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) -recording_consumer = buffering_runtime( +recording_runtime = buffering_runtime( init_fn=init, process_fn=process_message, ) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 250fbbee3860be..03dc3b565aa617 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -47,6 +47,18 @@ class DropSilently(Exception): pass +class CouldNotDecode(DropSilently): + pass + + +class CouldNotParseHeaders(DropSilently): + pass + + +class CouldNotDecompress(DropSilently): + pass + + class ReplayRecordingSegment(TypedDict): id: str # a uuid that individualy identifies a recording segment chunks: int # the number of chunks for this segment @@ -336,7 +348,7 @@ def parse_recording_message(message: bytes) -> RecordingIngestMessage: message_dict: ReplayRecording = RECORDINGS_CODEC.decode(message) except ValidationError: logger.exception("Could not decode recording message.") - raise DropSilently() + raise CouldNotDecode() return RecordingIngestMessage( replay_id=message_dict["replay_id"], @@ -360,7 +372,7 @@ def parse_headers(recording: bytes, replay_id: str) -> tuple[RecordingSegmentHea return recording_headers, recording_segment except Exception: logger.exception("Recording headers could not be extracted %s", replay_id) - raise DropSilently() + raise CouldNotParseHeaders() @sentry_sdk.trace @@ -374,7 +386,7 @@ def decompress_segment(segment: bytes) -> tuple[bytes, bytes]: return compressed_segment, segment else: logger.exception("Invalid recording body.") - raise DropSilently() + raise CouldNotDecompress() @sentry_sdk.trace diff --git 
a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index a0c5f267991604..3ee061e764f3bb 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -1,7 +1,11 @@ from arroyo.types import Partition, Topic from sentry.replays.consumers.buffered.lib import Model, buffering_runtime -from tests.sentry.replays.unit.consumers.helpers import MockCommit, MockSink, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import ( + MockCommit, + MockSink, + make_kafka_message, +) def buffer_runtime(sink): diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index 07c26ae0e4e7f7..179d5cf5760471 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,5 +1,5 @@ from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime -from tests.sentry.replays.unit.consumers.helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message def counter_runtime() -> RunTime[int, str]: From abf22cf92f91b1dba1578e872239284e9491dbc2 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 20 Feb 2025 19:29:31 -0600 Subject: [PATCH 12/49] Misc test updates --- .../replays/unit/consumers/test_recording.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 68f39fa70bf917..22bb3701c74f7a 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -4,8 +4,7 @@ import msgpack from sentry.replays.consumers.buffered.consumer import process_message - -# from sentry.replays.usecases.ingest import ProcessedRecordingMessage +from sentry.replays.usecases.ingest import ProcessedRecordingMessage def test_process_message_invalid(): @@ -30,4 +29,19 @@ def test_process_message(): } ) ) - assert result is None + assert result is not None + assert result == ProcessedRecordingMessage( + actions_event=[], + filedata=b"[]", + filename=result.filename, + is_replay_video=False, + key_id=3, + org_id=1, + project_id=2, + received=result.received, + recording_size_uncompressed=2, + recording_size=result.recording_size, + replay_id=result.replay_id, + segment_id=0, + video_size=None, + ) From 2e16a0111b451e0439b8b4e73bdf847cd4504909 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 13:51:39 -0600 Subject: [PATCH 13/49] Fully separate compute and io within the recording consumer --- .../replays/usecases/ingest/__init__.py | 203 +++++++++++++++++- 1 file changed, 196 insertions(+), 7 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 28180ce7118b75..d975120052f187 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -89,7 +89,7 @@ class RecordingIngestMessage: replay_video: bytes | None -def ingest_recording(message: bytes) -> None: +def ingest_recording(message_bytes: bytes) -> None: """Ingest non-chunked recording messages.""" isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() @@ -105,7 +105,11 @@ def ingest_recording(message: bytes) -> None: ) try: - 
_ingest_recording(message) + message = parse_recording_message(message_bytes) + if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): + _ingest_recording_separated_io_compute(message) + else: + _ingest_recording(message) except DropSilently: # The message couldn't be parsed for whatever reason. We shouldn't block the consumer # so we ignore it. @@ -115,10 +119,8 @@ def ingest_recording(message: bytes) -> None: @sentry_sdk.trace -def _ingest_recording(message_bytes: bytes) -> None: +def _ingest_recording(message: RecordingIngestMessage) -> None: """Ingest recording messages.""" - message = parse_recording_message(message_bytes) - set_tag("org_id", message.org_id) set_tag("project_id", message.project_id) @@ -190,6 +192,9 @@ def track_initial_segment_event( ) -> None: try: project = Project.objects.get_from_cache(id=project_id) + return _track_initial_segment_event( + org_id, project, replay_id, key_id, received, is_replay_video + ) except Project.DoesNotExist: logger.warning( "Recording segment was received for a project that does not exist.", @@ -200,6 +205,15 @@ def track_initial_segment_event( ) return None + +def _track_initial_segment_event( + org_id: int, + project: Project, + replay_id, + key_id: int | None, + received: int, + is_replay_video: bool, +) -> None: if not project.flags.has_replays: first_replay_received.send_robust(project=project, sender=Project) @@ -208,7 +222,7 @@ def track_initial_segment_event( metrics.incr("replays.billing-outcome-skipped") track_outcome( org_id=org_id, - project_id=project_id, + project_id=project.id, key_id=key_id, outcome=Outcome.ACCEPTED, reason=None, @@ -220,7 +234,7 @@ def track_initial_segment_event( else: track_outcome( org_id=org_id, - project_id=project_id, + project_id=project.id, key_id=key_id, outcome=Outcome.ACCEPTED, reason=None, @@ -400,3 +414,178 @@ def emit_replay_events( log_option_events(event_meta, project.id, replay_id) report_hydration_error(event_meta, project, replay_id, replay_event) report_rage_click(event_meta, project, replay_id, replay_event) + + +# Separated I/O and compute branch. 
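+#
+# process_recording_message performs the compute-only transformation, commit_recording_message
+# handles the I/O, and track_recording_metadata emits metrics and logs after the fact.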
+ + +class CouldNotFindProject(DropSilently): + pass + + +@sentry_sdk.trace +def _ingest_recording_separated_io_compute(message: RecordingIngestMessage) -> None: + """Ingest recording messages.""" + processed_recording = process_recording_message(message) + commit_recording_message(processed_recording) + track_recording_metadata(processed_recording) + + +@dataclasses.dataclass +class ProcessedRecordingMessage: + actions_event: ParsedEventMeta | None + filedata: bytes + filename: str + is_replay_video: bool + key_id: int | None + org_id: int + project_id: int + received: int + recording_size_uncompressed: int + recording_size: int + replay_event: dict[str, Any] | None + replay_id: str + segment_id: int + video_size: int | None + + +@sentry_sdk.trace +def process_recording_message(message: RecordingIngestMessage) -> ProcessedRecordingMessage: + set_tag("org_id", message.org_id) + set_tag("project_id", message.project_id) + + headers, segment_bytes = parse_headers(message.payload_with_headers, message.replay_id) + segment = decompress_segment(segment_bytes) + + replay_events = parse_replay_events(message, headers, segment.decompressed) + + with sentry_sdk.start_span(name="Parse replay event"): + replay_event = json.loads(message.replay_event) if message.replay_event else None + + filename = make_recording_filename( + RecordingSegmentStorageMeta( + project_id=message.project_id, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + retention_days=message.retention_days, + ) + ) + + if message.replay_video: + with sentry_sdk.start_span(name="Compress video event"): + filedata = zlib.compress(pack(rrweb=segment.decompressed, video=message.replay_video)) + video_size = len(message.replay_video) + else: + filedata = segment.compressed + video_size = None + + return ProcessedRecordingMessage( + replay_events, + filedata, + filename, + is_replay_video=message.replay_video is not None, + key_id=message.key_id, + org_id=message.org_id, + project_id=message.project_id, + received=message.received, + recording_size_uncompressed=len(segment.decompressed), + recording_size=len(segment.compressed), + replay_event=replay_event, + replay_id=message.replay_id, + segment_id=headers["segment_id"], + video_size=video_size, + ) + + +@sentry_sdk.trace +def commit_recording_message(recording: ProcessedRecordingMessage) -> None: + # Write to GCS. + storage_kv.set(recording.filename, recording.filedata) + + try: + project = Project.objects.get_from_cache(id=recording.project_id) + assert isinstance(project, Project) + except Project.DoesNotExist: + logger.warning( + "Recording segment was received for a project that does not exist.", + extra={ + "project_id": recording.project_id, + "replay_id": recording.replay_id, + }, + ) + raise CouldNotFindProject() + + # Write to billing consumer if its a billable event. + if recording.segment_id == 0: + _track_initial_segment_event( + recording.org_id, + project, + recording.replay_id, + recording.key_id, + recording.received, + is_replay_video=recording.is_replay_video, + ) + + # Write to replay-event consumer. + if recording.actions_event: + emit_replay_events( + recording.actions_event, + recording.org_id, + project, + recording.replay_id, + recording.replay_event, + ) + + +@sentry_sdk.trace +def track_recording_metadata(recording: ProcessedRecordingMessage) -> None: + # Report size metrics to determine usage patterns. 
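+    # Both the compressed and uncompressed sizes are emitted.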
+ _report_size_metrics( + size_compressed=recording.recording_size, + size_uncompressed=recording.recording_size_uncompressed, + ) + + if recording.video_size: + # Logging org info for bigquery + logger.info( + "sentry.replays.slow_click", + extra={ + "event_type": "mobile_event", + "org_id": recording.org_id, + "project_id": recording.project_id, + "size": len(recording.filedata), + }, + ) + + # Track the number of replay-video events we receive. + metrics.incr("replays.recording_consumer.replay_video_count") + + # Record video size for COGS analysis. + metrics.distribution( + "replays.recording_consumer.replay_video_size", + recording.video_size, + unit="byte", + ) + + # Track combined payload size for COGs analysis. + metrics.distribution( + "replays.recording_consumer.replay_video_event_size", + len(recording.filedata), + unit="byte", + ) + + +def parse_replay_events( + message: RecordingIngestMessage, headers: RecordingSegmentHeaders, segment_bytes: bytes +) -> ParsedEventMeta | None: + try: + return parse_events(json.loads(segment_bytes)) + except Exception: + logging.exception( + "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", + message.org_id, + message.project_id, + message.replay_id, + headers["segment_id"], + ) + return None From 208360fc0696951eed2f26388e4a9240e518ba17 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 19:23:33 -0600 Subject: [PATCH 14/49] Configure max workers --- src/sentry/replays/consumers/buffered/buffer_managers.py | 4 ++-- src/sentry/replays/consumers/buffered/consumer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py index c017b7ebb40628..4cc4ecf8037040 100644 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ b/src/sentry/replays/consumers/buffered/buffer_managers.py @@ -136,9 +136,9 @@ class ThreadedBufferManager: ourselves that there are no defects in the `BatchedBufferManager`. """ - def __init__(self) -> None: + def __init__(self, max_workers: int) -> None: self.fn = commit_recording_message - self.max_workers = 100 + self.max_workers = max_workers @sentry_sdk.trace def commit(self, messages: list[ProcessedRecordingMessage]) -> None: diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index d8709b4ef354a5..89f2a15408aa9c 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -53,7 +53,7 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: org_ids = set(options.get("replay.consumer.use-file-batching")) threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) - threaded_buffer = ThreadedBufferManager() + threaded_buffer = ThreadedBufferManager(max_workers=32) threaded_buffer.commit(threaded) # ...But others will be opted in to the batched version of uploading. 
This will give us a From fc8df9cd1858751e156c6ee15f499f286b6bb31d Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 19:49:04 -0600 Subject: [PATCH 15/49] Remove conditional branch as its moved further up the hierarchy --- .../replays/usecases/ingest/__init__.py | 59 ++++++++----------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 0b9ae88ba6c9bb..8d362e9a20b509 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -273,42 +273,29 @@ def recording_post_processor( try: segment, replay_event = parse_segment_and_replay_data(segment_bytes, replay_event_bytes) - # Conditionally use the new separated event parsing and logging logic. This way we can - # feature flag access and fix any issues we find. - if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): - event_meta = parse_events(segment) - project = Project.objects.get_from_cache(id=message.project_id) - emit_replay_events( - event_meta, - message.org_id, - project, - message.replay_id, - replay_event, - ) - else: - actions_event = try_get_replay_actions(message, segment, replay_event) - if actions_event: - emit_replay_actions(actions_event) - - # Log canvas mutations to bigquery. - log_canvas_size_old( - message.org_id, - message.project_id, - message.replay_id, - segment, - ) - - # Log # of rrweb events to bigquery. - logger.info( - "sentry.replays.slow_click", - extra={ - "event_type": "rrweb_event_count", - "org_id": message.org_id, - "project_id": message.project_id, - "replay_id": message.replay_id, - "size": len(segment), - }, - ) + actions_event = try_get_replay_actions(message, segment, replay_event) + if actions_event: + emit_replay_actions(actions_event) + + # Log canvas mutations to bigquery. + log_canvas_size_old( + message.org_id, + message.project_id, + message.replay_id, + segment, + ) + + # Log # of rrweb events to bigquery. + logger.info( + "sentry.replays.slow_click", + extra={ + "event_type": "rrweb_event_count", + "org_id": message.org_id, + "project_id": message.project_id, + "replay_id": message.replay_id, + "size": len(segment), + }, + ) except Exception: logging.exception( "Failed to parse recording org=%s, project=%s, replay=%s, segment=%s", From 536c250e6f15856b1a8e8a9f51141655d1a3df92 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 20:00:11 -0600 Subject: [PATCH 16/49] Use context manager --- .../replays/consumers/buffered/consumer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 89f2a15408aa9c..a4ea7138505236 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -73,9 +73,8 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: have no I/O here. We'll commit the I/O on flush. 
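+
+    Exceptions other than DropSilently are allowed to propagate so the platform can restart from
+    the last committed offset.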
""" isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): - transaction = sentry_sdk.start_transaction( + with sentry_sdk.start_transaction( name="replays.consumer.process_recording", op="replays.consumer", custom_sampling_context={ @@ -83,14 +82,11 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 ) }, - ) - - try: - return process_recording_message(message) - except DropSilently: - return None - finally: - transaction.finish() + ): + try: + return process_recording_message(message) + except DropSilently: + return None def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: From ea571fa3f682803338205fea052f83fcab248c1c Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 24 Feb 2025 20:26:32 -0600 Subject: [PATCH 17/49] Simplify buffer flushing (for now) --- .../replays/consumers/buffered/consumer.py | 53 +++++++++++-------- .../replays/usecases/ingest/__init__.py | 1 - 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index a4ea7138505236..b69e83893f5c57 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,18 +1,16 @@ import time +from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk from django.conf import settings -from sentry import options -from sentry.replays.consumers.buffered.buffer_managers import ( - BatchedBufferManager, - ThreadedBufferManager, -) from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( DropSilently, ProcessedRecordingMessage, + commit_recording_message, process_recording_message, + track_recording_metadata, ) @@ -46,26 +44,39 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - # During the transitionary period most organizations will commit following a traditional - # commit pattern... - # - # TODO: Would be nice to cache this value. - org_ids = set(options.get("replay.consumer.use-file-batching")) - - threaded = list(filter(lambda i: i.org_id not in org_ids, model.buffer)) - threaded_buffer = ThreadedBufferManager(max_workers=32) - threaded_buffer.commit(threaded) - - # ...But others will be opted in to the batched version of uploading. This will give us a - # chance to test the feature with as minimal risk as possible. - batched = list(filter(lambda i: i.org_id in org_ids, model.buffer)) - batched_buffer = BatchedBufferManager() - batched_buffer.commit(batched) - + flush_buffer(model, max_workers=32) # Update the buffer manager with the new time so we don't continuously commit in a loop! self.__last_flushed_at = time.time() +def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) -> None: + # Use as many workers as necessary up to a limit. We don't want to start thousands of + # worker threads. + max_workers = min(len(model.buffer), max_workers) + + with ThreadPoolExecutor(max_workers=max_workers) as pool: + # We apply whatever function is defined on the class to each message in the list. This + # is useful for testing reasons (dependency injection). + futures = [pool.submit(commit_recording_message, message) for message in model.buffer] + + # Tasks can fail. 
We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + if exc is not None: + raise exc + + # Recording metadata is not tracked in the threadpool. This is because this function will + # log. Logging will acquire a lock and make our threading less useful due to the speed of + # the I/O we do in this step. + for message in model.buffer: + track_recording_metadata(message) + + return None + + def process_message(message: bytes) -> ProcessedRecordingMessage | None: """Message processing function. diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index 8d362e9a20b509..05a4ddec7a267b 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -483,7 +483,6 @@ def process_recording_message(message: RecordingIngestMessage) -> ProcessedRecor @sentry_sdk.trace def commit_recording_message(recording: ProcessedRecordingMessage) -> None: - # Write to GCS. storage_kv.set(recording.filename, recording.filedata) try: From 373388636b6359ee3249d40981d4136744a4333e Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 08:31:02 -0600 Subject: [PATCH 18/49] Update tracing logic --- .../replays/consumers/buffered/consumer.py | 42 +++++++++---------- .../replays/usecases/ingest/__init__.py | 41 +++++++++--------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index b69e83893f5c57..f6df66284447f8 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,8 +1,8 @@ +import contextlib import time from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait import sentry_sdk -from django.conf import settings from sentry.replays.consumers.buffered.lib import Model, buffering_runtime from sentry.replays.usecases.ingest import ( @@ -10,6 +10,7 @@ ProcessedRecordingMessage, commit_recording_message, process_recording_message, + sentry_tracing, track_recording_metadata, ) @@ -42,14 +43,16 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at ) - @sentry_sdk.trace def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - flush_buffer(model, max_workers=32) - # Update the buffer manager with the new time so we don't continuously commit in a loop! - self.__last_flushed_at = time.time() + with sentry_tracing("replays.consumers.buffered.flush_buffer"): + flush_buffer(model, max_workers=8) + # Update the buffer manager with the new time so we don't continuously commit in a + # loop! + self.__last_flushed_at = time.time() -def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) -> None: +@sentry_sdk.trace +def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: # Use as many workers as necessary up to a limit. We don't want to start thousands of # worker threads. max_workers = min(len(model.buffer), max_workers) @@ -57,7 +60,7 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) with ThreadPoolExecutor(max_workers=max_workers) as pool: # We apply whatever function is defined on the class to each message in the list. 
This # is useful for testing reasons (dependency injection). - futures = [pool.submit(commit_recording_message, message) for message in model.buffer] + futures = [pool.submit(flush_message, message) for message in model.buffer] # Tasks can fail. We check the done set for any failures. We will wait for all the # futures to complete before running this step or eagerly run this step if any task @@ -77,27 +80,22 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int = 32) return None +@sentry_sdk.trace +def flush_message(message: ProcessedRecordingMessage) -> None: + """Message flushing function.""" + with contextlib.suppress(DropSilently): + commit_recording_message(message) + + def process_message(message: bytes) -> ProcessedRecordingMessage | None: """Message processing function. Accepts an unstructured type and returns a structured one. Other than tracing the goal is to have no I/O here. We'll commit the I/O on flush. """ - isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): - with sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr( - settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 - ) - }, - ): - try: - return process_recording_message(message) - except DropSilently: - return None + with sentry_tracing("replays.consumers.buffered.process_message"): + with contextlib.suppress(DropSilently): + return process_recording_message(message) def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: diff --git a/src/sentry/replays/usecases/ingest/__init__.py b/src/sentry/replays/usecases/ingest/__init__.py index ea5c479318dde3..104e9b5a44cc5c 100644 --- a/src/sentry/replays/usecases/ingest/__init__.py +++ b/src/sentry/replays/usecases/ingest/__init__.py @@ -4,6 +4,7 @@ import logging import time import zlib +from contextlib import contextmanager from datetime import datetime, timezone from typing import Any, TypedDict, cast @@ -91,30 +92,30 @@ class RecordingIngestMessage: replay_video: bytes | None -def ingest_recording(message_bytes: bytes) -> None: - """Ingest non-chunked recording messages.""" +@contextmanager +def sentry_tracing(name: str): + sample_rate = getattr(settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0) isolation_scope = sentry_sdk.Scope.get_isolation_scope().fork() - with sentry_sdk.scope.use_isolation_scope(isolation_scope): with sentry_sdk.start_transaction( - name="replays.consumer.process_recording", - op="replays.consumer", - custom_sampling_context={ - "sample_rate": getattr( - settings, "SENTRY_REPLAY_RECORDINGS_CONSUMER_APM_SAMPLING", 0 - ) - }, + name=name, op="replays.consumer", custom_sampling_context={"sample_rate": sample_rate} ): - try: - message = parse_recording_message(message_bytes) - if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): - _ingest_recording_separated_io_compute(message) - else: - _ingest_recording(message) - except DropSilently: - # The message couldn't be parsed for whatever reason. We shouldn't block the consumer - # so we ignore it. 
- pass + yield + + +def ingest_recording(message_bytes: bytes) -> None: + """Ingest non-chunked recording messages.""" + with sentry_tracing("replays.consumer.process_recording"): + try: + message = parse_recording_message(message_bytes) + if message.org_id in options.get("replay.consumer.separate-compute-and-io-org-ids"): + _ingest_recording_separated_io_compute(message) + else: + _ingest_recording(message) + except DropSilently: + # The message couldn't be parsed for whatever reason. We shouldn't block the consumer + # so we ignore it. + pass @sentry_sdk.trace From 88747c123d9129bd174ca025e864879c648f23a6 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:06:15 -0600 Subject: [PATCH 19/49] Soften flag requirements and minor fixes --- .../replays/consumers/buffered/consumer.py | 18 +++++++++++------- .../sentry/replays/consumers/test_recording.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index f6df66284447f8..9a4fa0339de0ed 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -9,6 +9,7 @@ DropSilently, ProcessedRecordingMessage, commit_recording_message, + parse_recording_message, process_recording_message, sentry_tracing, track_recording_metadata, @@ -33,8 +34,11 @@ class BufferManager: """ def __init__(self, flags: dict[str, str]) -> None: - self.__max_buffer_length = int(flags["max_buffer_length"]) - self.__max_buffer_wait = int(flags["max_buffer_wait"]) + # Flags are safely extracted and default arguments are used. + self.__max_buffer_length = int(flags.get("max_buffer_length", 8)) + self.__max_buffer_wait = int(flags.get("max_buffer_wait", 1)) + self.__max_workers = int(flags.get("max_workers", 8)) + self.__last_flushed_at = time.time() def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: @@ -45,7 +49,7 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: with sentry_tracing("replays.consumers.buffered.flush_buffer"): - flush_buffer(model, max_workers=8) + flush_buffer(model, max_workers=self.__max_workers) # Update the buffer manager with the new time so we don't continuously commit in a # loop! self.__last_flushed_at = time.time() @@ -53,9 +57,8 @@ def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: @sentry_sdk.trace def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: - # Use as many workers as necessary up to a limit. We don't want to start thousands of - # worker threads. - max_workers = min(len(model.buffer), max_workers) + if len(model.buffer) == 0: + return None with ThreadPoolExecutor(max_workers=max_workers) as pool: # We apply whatever function is defined on the class to each message in the list. This @@ -87,7 +90,7 @@ def flush_message(message: ProcessedRecordingMessage) -> None: commit_recording_message(message) -def process_message(message: bytes) -> ProcessedRecordingMessage | None: +def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: """Message processing function. Accepts an unstructured type and returns a structured one. 
Other than tracing the goal is to @@ -95,6 +98,7 @@ def process_message(message: bytes) -> ProcessedRecordingMessage | None: """ with sentry_tracing("replays.consumers.buffered.process_message"): with contextlib.suppress(DropSilently): + message = parse_recording_message(message_bytes) return process_recording_message(message) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index e2958e54dd7a88..90f551113adedf 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -581,3 +581,15 @@ def test_invalid_message(self): {"replay.consumer.separate-compute-and-io-org-ids": [self.organization.id]} ): super().test_invalid_message() + + +from sentry.replays.consumers.buffered.consumer import recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory + + +class RunTimeConsumerTestCase(RecordingTestCase): + def processing_factory(self): + return PlatformStrategyFactory( + flags={"max_buffer_length": 8, "max_buffer_wait": 1}, + runtime=recording_runtime, + ) From 74e811f939510a89b24eeea400eabff5b2ed45b0 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:27:55 -0600 Subject: [PATCH 20/49] Remove buffer managers module --- .../consumers/buffered/buffer_managers.py | 169 ------------------ src/sentry/replays/models.py | 15 -- 2 files changed, 184 deletions(-) delete mode 100644 src/sentry/replays/consumers/buffered/buffer_managers.py diff --git a/src/sentry/replays/consumers/buffered/buffer_managers.py b/src/sentry/replays/consumers/buffered/buffer_managers.py deleted file mode 100644 index 4cc4ecf8037040..00000000000000 --- a/src/sentry/replays/consumers/buffered/buffer_managers.py +++ /dev/null @@ -1,169 +0,0 @@ -import dataclasses -import uuid -from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait - -import sentry_sdk - -from sentry.replays.lib.storage import storage_kv -from sentry.replays.models import FilePart -from sentry.replays.usecases.ingest import ( - ProcessedRecordingMessage, - commit_recording_message, - track_initial_segment_event, - track_recording_metadata, -) -from sentry.replays.usecases.ingest.dom_index import _initialize_publisher, emit_replay_actions - - -@dataclasses.dataclass(frozen=True) -class FilePartRow: - key: str - range_start: int - range_stop: int - - -@dataclasses.dataclass(frozen=True) -class MergedBuffer: - buffer: bytes - buffer_rows: list[FilePartRow] - filename: str - - -class BatchedBufferManager: - """Batched buffer manager. - - The batched buffer manager takes a list of messages and merges them into a single file before - committing them to permanent storage. We store filename and byte locations in a PostgreSQL - table. 
- """ - - @sentry_sdk.trace - def commit(self, messages: list[ProcessedRecordingMessage]) -> None: - merged_buffer = self.merge_buffer(messages) - self.commit_merged_buffer(merged_buffer) - self.bulk_track_initial_segment_events(messages) - self.bulk_emit_action_events(messages) - self.bulk_track_recording_metadata(messages) - - @sentry_sdk.trace - def merge_buffer(self, buffer: list[ProcessedRecordingMessage]) -> MergedBuffer: - def _append_part(parts: bytes, part: bytes, key: str) -> tuple[bytes, FilePartRow]: - range_start = len(parts) - range_stop = range_start + len(part) - 1 - return (parts + part, FilePartRow(key, range_start, range_stop)) - - parts = b"" - parts_rows = [] - - for item in buffer: - # The recording data is appended to the buffer and a row for tracking is - # returned. - parts, part_row = _append_part(parts, item.filedata, key=item.filename) - parts_rows.append(part_row) - - return MergedBuffer( - buffer=parts, - buffer_rows=parts_rows, - filename=f"90/{uuid.uuid4().hex}", - ) - - @sentry_sdk.trace - def commit_merged_buffer(self, buffer: MergedBuffer) -> None: - if buffer.buffer == b"": - return None - - # Storage goes first. Headers go second. If we were to store the headers first we could - # expose a record that does not exist to the user which would ultimately 404. - storage_kv.set(buffer.filename, buffer.buffer) - - # Bulk insert the filepart. - FilePart.objects.bulk_create( - FilePart( - filename=buffer.filename, - key=row.key, - range_start=row.range_start, - range_stop=row.range_stop, - ) - for row in buffer.buffer_rows - ) - - @sentry_sdk.trace - def bulk_track_initial_segment_events(self, items: list[ProcessedRecordingMessage]) -> None: - # Each item in the buffer needs to record a billing outcome. Not every segment will bill - # but we defer that behavior to the `track_initial_segment_event` function. - for item in items: - track_initial_segment_event( - item.org_id, - item.project_id, - item.replay_id, - item.segment_id, - item.key_id, - item.received, - item.is_replay_video, - ) - - @sentry_sdk.trace - def bulk_emit_action_events(self, items: list[ProcessedRecordingMessage]) -> None: - # The action events need to be emitted to Snuba. We do it asynchronously so its fast. The - # Kafka publisher is asynchronous. We need to flush to make sure the data is fully committed - # before we can consider this buffer fully flushed and commit our local offsets. - for item in items: - if item.actions_event: - emit_replay_actions(item.actions_event) - - # Ensure the replay-actions were committed to the broker before we commit this batch. - publisher = _initialize_publisher() - publisher.flush() - - @sentry_sdk.trace - def bulk_track_recording_metadata(self, items: list[ProcessedRecordingMessage]) -> None: - # Each item in the buffer needs to record metrics about itself. - for item in items: - track_recording_metadata(item) - - -class ThreadedBufferManager: - """Threaded buffer manager. - - The threaded buffer manager is the original way we uploaded files except in an application - managed thread-pool. We iterate over each processed recording, commit them in a thread, and - finally return null when all the tasks complete. It requires no changes elsewhere in the code - to accomodate. - - The goal of this class is to _not_ be clever. We want to as closely as possible immitate - current production behavior. 
The reason being is that a consumer refactor is a difficult task - to feature flag and we want to gradually rollout the behavior over time after assuring - ourselves that there are no defects in the `BatchedBufferManager`. - """ - - def __init__(self, max_workers: int) -> None: - self.fn = commit_recording_message - self.max_workers = max_workers - - @sentry_sdk.trace - def commit(self, messages: list[ProcessedRecordingMessage]) -> None: - # Use as many workers as necessary up to a limit. We don't want to start thousands of - # worker threads. - max_workers = min(len(messages), self.max_workers) - - with ThreadPoolExecutor(max_workers=max_workers) as pool: - # We apply whatever function is defined on the class to each message in the list. This - # is useful for testing reasons (dependency injection). - futures = [pool.submit(self.fn, message) for message in messages] - - # Tasks can fail. We check the done set for any failures. We will wait for all the - # futures to complete before running this step or eagerly run this step if any task - # errors. - done, _ = wait(futures, return_when=FIRST_EXCEPTION) - for future in done: - exc = future.exception() - if exc is not None: - raise exc - - # Recording metadata is not tracked in the threadpool. This is because this function will - # log. Logging will acquire a lock and make our threading less useful due to the speed of - # the I/O we do in this step. - for message in messages: - track_recording_metadata(message) - - return None diff --git a/src/sentry/replays/models.py b/src/sentry/replays/models.py index b20cb45efe928e..1d4099d0155728 100644 --- a/src/sentry/replays/models.py +++ b/src/sentry/replays/models.py @@ -46,18 +46,3 @@ def delete(self, *args, **kwargs): rv = super().delete(*args, **kwargs) return rv - - -@region_silo_model -class FilePart(Model): - __relocation_scope__ = RelocationScope.Excluded - - date_added = models.DateTimeField(default=timezone.now, null=False) - filename = models.CharField(null=False) - key = models.CharField(db_index=True, null=False) - range_start = models.IntegerField(null=False) - range_stop = models.IntegerField(null=False) - - class Meta: - app_label = "replays" - db_table = "replays_filepart" From 8f8e5ce2cd5a89ccd156c8399ff0006ba4ba2540 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:30:51 -0600 Subject: [PATCH 21/49] Test clean up --- tests/sentry/replays/consumers/test_recording.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 90f551113adedf..8f5c6073b774a4 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -13,6 +13,8 @@ from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.models.organizationonboardingtask import OnboardingTask, OnboardingTaskStatus +from sentry.replays.consumers.buffered.consumer import recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory from sentry.replays.consumers.recording import ProcessReplayRecordingStrategyFactory from sentry.replays.consumers.recording_buffered import ( RecordingBufferedStrategyFactory, @@ -583,13 +585,9 @@ def test_invalid_message(self): super().test_invalid_message() -from sentry.replays.consumers.buffered.consumer import recording_runtime -from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory - - -class 
RunTimeConsumerTestCase(RecordingTestCase): +class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): return PlatformStrategyFactory( - flags={"max_buffer_length": 8, "max_buffer_wait": 1}, + flags={"max_buffer_length": 8, "max_buffer_wait": 1, "max_workers": 8}, runtime=recording_runtime, ) From fac3204d830a4a615bf5b3a6557c97ad8e28f817 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:40:39 -0600 Subject: [PATCH 22/49] Fix unit test --- tests/sentry/replays/unit/consumers/test_recording.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 22bb3701c74f7a..0872ee2184c2c2 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -1,10 +1,12 @@ import time import uuid +import zlib import msgpack from sentry.replays.consumers.buffered.consumer import process_message from sentry.replays.usecases.ingest import ProcessedRecordingMessage +from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta def test_process_message_invalid(): @@ -29,10 +31,12 @@ def test_process_message(): } ) ) + + expected_event_metadata = ParsedEventMeta([], [], [], [], [], []) assert result is not None assert result == ProcessedRecordingMessage( - actions_event=[], - filedata=b"[]", + actions_event=expected_event_metadata, + filedata=zlib.compress(b"[]"), filename=result.filename, is_replay_video=False, key_id=3, @@ -41,7 +45,9 @@ def test_process_message(): received=result.received, recording_size_uncompressed=2, recording_size=result.recording_size, + retention_days=30, replay_id=result.replay_id, segment_id=0, video_size=None, + replay_event=None, ) From ae59eb9439f8cc9d6881edeb2285d75dcb753e02 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:41:39 -0600 Subject: [PATCH 23/49] Add explicit return --- src/sentry/replays/consumers/buffered/consumer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 9a4fa0339de0ed..2220960d4813d2 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -100,6 +100,7 @@ def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: with contextlib.suppress(DropSilently): message = parse_recording_message(message_bytes) return process_recording_message(message) + return None def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: From a6a32e52b3531eba361b7d5e57ef7531a387dce9 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:42:57 -0600 Subject: [PATCH 24/49] Fix typing --- tests/sentry/replays/consumers/test_recording.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 8f5c6073b774a4..55ddb7d1cc79c1 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -588,6 +588,6 @@ def test_invalid_message(self): class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): return PlatformStrategyFactory( - flags={"max_buffer_length": 8, "max_buffer_wait": 1, "max_workers": 8}, + flags={"max_buffer_length": "8", "max_buffer_wait": "1", "max_workers": "8"}, 
runtime=recording_runtime, ) From f1026ca2574f8fc79a312581ff67e1ffc4419b9b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:54:32 -0600 Subject: [PATCH 25/49] Remove unused option --- src/sentry/options/defaults.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 210801aa64d3dd..85cd3215d0792a 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -456,12 +456,6 @@ default=None, flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, ) -register( - "replay.consumer.use-file-batching", - type=Sequence, - default=[], - flags=FLAG_ALLOW_EMPTY | FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE, -) # Globally disables replay-video. register( "replay.replay-video.disabled", From b363596c99423c60eec401cee343e8f701846cff Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 09:56:15 -0600 Subject: [PATCH 26/49] Reset dom_index module to align with master --- .../replays/usecases/ingest/dom_index.py | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/src/sentry/replays/usecases/ingest/dom_index.py b/src/sentry/replays/usecases/ingest/dom_index.py index ecf0f5eb0ae340..b54ba8535ddf63 100644 --- a/src/sentry/replays/usecases/ingest/dom_index.py +++ b/src/sentry/replays/usecases/ingest/dom_index.py @@ -63,22 +63,6 @@ class ReplayActionsEvent(TypedDict): type: Literal["replay_event"] -def parse_and_emit_replay_actions( - project: Project, - replay_id: str, - retention_days: int, - segment_data: list[dict[str, Any]], - replay_event: dict[str, Any] | None, - org_id: int | None = None, -) -> None: - with metrics.timer("replays.usecases.ingest.dom_index.parse_and_emit_replay_actions"): - message = parse_replay_actions( - project, replay_id, retention_days, segment_data, replay_event, org_id=org_id - ) - if message is not None: - emit_replay_actions(message) - - @sentry_sdk.trace def emit_replay_actions(action: ReplayActionsEvent) -> None: publisher = _initialize_publisher() @@ -237,14 +221,13 @@ def _get_testid(container: dict[str, str]) -> str: ) -def _initialize_publisher(asynchronous: bool = True) -> KafkaPublisher: +def _initialize_publisher() -> KafkaPublisher: global replay_publisher if replay_publisher is None: config = kafka_config.get_topic_definition(Topic.INGEST_REPLAY_EVENTS) replay_publisher = KafkaPublisher( - kafka_config.get_kafka_producer_cluster_options(config["cluster"]), - asynchronous=asynchronous, + kafka_config.get_kafka_producer_cluster_options(config["cluster"]) ) return replay_publisher From c656420452f32a4e0fe7a02ed2e8aa100b34260f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 10:17:02 -0600 Subject: [PATCH 27/49] Update buffering run-time coverage --- .../unit/consumers/test_buffering_runtime.py | 60 +++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 3ee061e764f3bb..6802ef3f9c7ab7 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -1,3 +1,4 @@ +import pytest from arroyo.types import Partition, Topic from sentry.replays.consumers.buffered.lib import Model, buffering_runtime @@ -11,16 +12,27 @@ def buffer_runtime(sink): return buffering_runtime( init_fn=lambda _: Model( + # The buffer is an arbitrary list of items but 
it could be anything. For example a
+            # string which you could perpetually append to.
             buffer=[],
+            # Commit when the buffer has 5 messages in it.
             can_flush=lambda m: len(m.buffer) == 5,
+            # Flush in this case is just pushing the buffer into a mock class which holds the
+            # messages in memory. This is easy for us to test and we don't need to mock out any
+            # service code which is both unknown to the RunTime and can be tested independently.
             do_flush=lambda m: sink.accept(m.buffer),
+            # This is the offsets tracking object. The RunTime manages appending.
             offsets={},
         ),
+        # Our process function is a simple lambda which checks that the message is a valid
+        # integer and is less than 5. There's no exception handling here. That could crash the
+        # consumer! It's up to the application developer to manage.
         process_fn=lambda m: int(m) if int(m) < 5 else None,
     )


 def test_runtime_setup():
+    """Test RunTime initialization."""
     runtime = buffer_runtime(MockSink())
     runtime.setup({}, commit=MockCommit())
     assert runtime.model.buffer == []
@@ -28,6 +40,11 @@ def test_runtime_setup():


 def test_buffering_runtime_submit():
+    """Test a message received from the platform.
+
+    Messages are buffered up until a point (see the RunTime definition at the top of the module).
+    When we cross the threshold, the RunTime should order a buffer flush action.
+    """
     mock_commit = MockCommit()
     sink = MockSink()

@@ -66,7 +83,29 @@ def test_buffering_runtime_submit():
     assert mock_commit.commit == {Partition(Topic("a"), 1): 2}


-def test_buffering_runtime_publish():
+def test_buffering_runtime_submit_invalid_message():
+    """Test an invalid message received from the platform."""
+    mock_commit = MockCommit()
+    sink = MockSink()
+
+    runtime = buffer_runtime(sink)
+    runtime.setup({}, mock_commit)
+    assert runtime.model.buffer == []
+    assert runtime.model.offsets == {}
+    assert sink.accepted == []
+    assert mock_commit.commit == {}
+
+    # Assert exceptional behavior is not handled.
+    with pytest.raises(ValueError):
+        runtime.submit(make_kafka_message(b"hello"))
+
+
+def test_buffering_runtime_join():
+    """Test the RunTime receiving a join message from the platform.
+
+    When the consumer is ordered to shut down, a "join" message will be submitted to the RunTime.
+    When that happens we rush to flush the buffer and commit whatever we can.
+    """
     mock_commit = MockCommit()
     sink = MockSink()

@@ -89,9 +128,22 @@ def test_buffering_runtime_publish():
     assert sink.accepted == [1, 1, 1]
     assert mock_commit.commit == {Partition(Topic("a"), 1): 2}

-    # Reset the mocks.
-    mock_commit.commit = {}
-    sink.accepted = []
+
+def test_buffering_runtime_poll():
+    """Test the RunTime receiving a poll message from the platform.
+
+    This will happen periodically. If the consumer does not receive messages for some interval,
+    a poll message will be published.
+    """
+    mock_commit = MockCommit()
+    sink = MockSink()
+
+    runtime = buffer_runtime(sink)
+    runtime.setup({}, mock_commit)
+    assert runtime.model.buffer == []
+    assert runtime.model.offsets == {}
+    assert sink.accepted == []
+    assert mock_commit.commit == {}

     # Assert poll does not automatically empty the buffer.
runtime.submit(make_kafka_message(b"1")) From 29bb826ddfbfc322f65d473a631e38bc2e6aea88 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 10:52:15 -0600 Subject: [PATCH 28/49] Update test ordering --- tests/sentry/replays/unit/consumers/test_recording.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 0872ee2184c2c2..162dfdf065d633 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -9,11 +9,6 @@ from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -def test_process_message_invalid(): - result = process_message(msgpack.packb(b"hello, world!")) - assert result is None - - def test_process_message(): result = process_message( msgpack.packb( @@ -51,3 +46,8 @@ def test_process_message(): video_size=None, replay_event=None, ) + + +def test_process_message_invalid(): + result = process_message(msgpack.packb(b"hello, world!")) + assert result is None From b4c64775df4ba1d4c9b69848b7f155834e7a0909 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 11:47:49 -0600 Subject: [PATCH 29/49] Add offset committing test --- .../unit/consumers/test_buffering_runtime.py | 20 +++++++++++++++++++ .../replays/unit/consumers/test_helpers.py | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 6802ef3f9c7ab7..4f68108fc9e332 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -162,3 +162,23 @@ def test_buffering_runtime_poll(): assert runtime.model.buffer == [] assert sink.accepted == [1, 2, 3, 4, 5] assert mock_commit.commit == {Partition(Topic("a"), 1): 2} + + +def test_buffering_runtime_committed_offsets(): + """Test offset commit.""" + mock_commit = MockCommit() + sink = MockSink() + + runtime = buffer_runtime(sink) + runtime.setup({}, mock_commit) + assert runtime.model.buffer == [] + assert runtime.model.offsets == {} + assert sink.accepted == [] + assert mock_commit.commit == {} + + # Assert committed offsets are correct. 
+ runtime.submit(make_kafka_message(b"1")) + runtime.submit(make_kafka_message(b"1", topic="b")) + runtime.submit(make_kafka_message(b"1", offset=2)) + runtime.publish("join") + assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 1f4eecbc060674..3543885e55b07b 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -21,7 +21,9 @@ def accept(self, buffer): self.accepted.extend(buffer) -def make_kafka_message(message: bytes): +def make_kafka_message(message: bytes, topic: str = "a", index: int = 1, offset: int = 1): return Message( - BrokerValue(KafkaPayload(b"k", message, []), Partition(Topic("a"), 1), 1, datetime.now()) + BrokerValue( + KafkaPayload(b"k", message, []), Partition(Topic(topic), index), offset, datetime.now() + ) ) From 2572955d773d2d71f7b32c3719ed8285e19a611d Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 14:37:19 -0600 Subject: [PATCH 30/49] Add docs --- src/sentry/replays/consumers/buffered/lib.py | 6 ++- .../replays/consumers/buffered/platform.py | 42 ++++++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index ff89587a54ce7b..2b69094f109aa9 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -1,4 +1,8 @@ -"""Buffered RunTime implementation.""" +"""Buffered RunTime implementation. + +The Buffering RunTime eagerly processes messages as they are received and waits until its buffer +is full before flushing those messages. The goal being to achieve efficiencies from batched I/O. +""" from collections.abc import Callable, MutableMapping from dataclasses import dataclass diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 252377c3f3b2a2..fda10dfa8579f9 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -70,22 +70,20 @@ def join(self, timeout: float | None = None) -> None: CommitProtocol = Callable[[Mapping[Partition, int]], None] -Model = TypeVar("Model") -Msg = TypeVar("Msg") - -class BackPressure: - """Instructs the RunTime to back-pressure the platform. - - Does not accept a `msg` argument as we do not expect the platform to fail. - """ +# A Model represents the state of your application. It is a type variable and the RunTime is +# generic over it. Your state can be anything from a simple integer to a large class with many +# fields. +Model = TypeVar("Model") - pass +# A Msg represents the commands an application can issue to itself. These commands update the state +# and optionally issue commands to the RunTime. +Msg = TypeVar("Msg") @dataclass(frozen=True) class Commit(Generic[Msg]): - """Instructs the RunTime to commit the platform. + """Instructs the RunTime to commit the message offsets. Because the RunTime is based on a Kafka platform there is an expectation that the application should trigger commits when necessary. While the application can't be sure its running Kafka @@ -109,8 +107,10 @@ class Task(Generic[Msg]): msg: Msg -# A "Cmd" is the union of all the commands an application can issue to the RunTime. 
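The Cmd and Sub unions documented in this patch follow an Elm-style contract: the application returns commands from its update function and the RunTime executes them against the platform. A minimal sketch of the application side, assuming the platform.py API as of this commit and using made-up string messages ("append", "maybe-flush", "flushed") and an empty offsets mapping as placeholders:

    from sentry.replays.consumers.buffered.platform import Commit, Nothing, Task


    def init(_flags):
        # Start with an empty model and issue no command.
        return ([], Nothing())


    def update(model, msg):
        match msg:
            case "append":
                model.append(1)
                # Schedule a follow-up message for ourselves.
                return (model, Task(msg="maybe-flush"))
            case "maybe-flush" if len(model) >= 10:
                # Ask the platform to commit offsets; "flushed" is delivered back to update.
                return (model, Commit(msg="flushed", offsets={}))
        return (model, None)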
-Cmd = BackPressure | Commit[Msg] | Nothing | Task[Msg]
+# A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime
+# accepts these commands and handles them in some pre-defined way. Commands are fixed and cannot
+# be registered on a per-application basis.
+Cmd = Commit[Msg] | Nothing | Task[Msg]


 @dataclass(frozen=True)
@@ -138,11 +138,22 @@ class Poll(Generic[Msg]):
     name = "poll"


-# A "Sub" is the union of all the events an application can subscribe to.
+# A "Sub" is the union of all the commands the Platform can issue to an application. The Platform
+# will occasionally emit actions which are intercepted by the RunTime. The RunTime translates
+# these actions into a set of predefined subscriptions. These subscriptions are exposed to the
+# application and the developer is free to handle them in the way they see fit.
 Sub = Join[Msg] | Poll[Msg]


 class RunTime(Generic[Model, Msg]):
+    """RunTime object.
+
+    The RunTime is an intermediate data structure which manages communication between the platform
+    and the application. It formalizes state transformations and abstracts the logic of the
+    platform. Commands are declaratively issued rather than defined within the logic of the
+    application. Commands can be issued bidirectionally with "Cmd" types flowing from the
+    application to the platform and "Sub" types flowing from the platform to the application.
+    """

     def __init__(
         self,
@@ -171,6 +182,8 @@ def model(self) -> Model:
         return self.__model

     def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None:
+        # NOTE: Could this be a factory function that produces RunTimes instead? That way we
+        # don't need the assert checks on model and commit.
         self.__commit = commit

         model, cmd = self.init(flags)
@@ -212,9 +225,6 @@ def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None:
             return None

         match cmd:
-            case BackPressure():
-                # TODO
-                return None
             case Commit(msg=msg, offsets=offsets):
                 self.commit(offsets)
                 return self.__handle_msg(msg)

From 7ca74623708238103c9954c1377038294690635e Mon Sep 17 00:00:00 2001
From: Colton Allen
Date: Thu, 27 Feb 2025 15:03:37 -0600
Subject: [PATCH 31/49] More docstring fixes

---
 src/sentry/replays/consumers/buffered/consumer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py
index 2220960d4813d2..a0c03264f36692 100644
--- a/src/sentry/replays/consumers/buffered/consumer.py
+++ b/src/sentry/replays/consumers/buffered/consumer.py
@@ -1,3 +1,5 @@
+"""Session Replay recording consumer implementation."""
+
 import contextlib
 import time
 from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
@@ -61,8 +63,6 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N
         return None

     with ThreadPoolExecutor(max_workers=max_workers) as pool:
-        # We apply whatever function is defined on the class to each message in the list. This
-        # is useful for testing reasons (dependency injection).
         futures = [pool.submit(flush_message, message) for message in model.buffer]

         # Tasks can fail. We check the done set for any failures.
We will wait for all the @@ -85,7 +85,6 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N @sentry_sdk.trace def flush_message(message: ProcessedRecordingMessage) -> None: - """Message flushing function.""" with contextlib.suppress(DropSilently): commit_recording_message(message) From fb5731e143827bab2ad9e2667cbfe7fb774648a7 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 15:40:41 -0600 Subject: [PATCH 32/49] Add typing to flags and factory module --- .../replays/consumers/buffered/consumer.py | 18 +++++++---- .../replays/consumers/buffered/factory.py | 26 +++++++++++++++ src/sentry/replays/consumers/buffered/lib.py | 9 +++--- .../replays/consumers/buffered/platform.py | 32 +++++-------------- 4 files changed, 51 insertions(+), 34 deletions(-) create mode 100644 src/sentry/replays/consumers/buffered/factory.py diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index a0c03264f36692..f554538fa189fc 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -3,6 +3,7 @@ import contextlib import time from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait +from typing import TypedDict import sentry_sdk @@ -18,6 +19,12 @@ ) +class Flags(TypedDict): + max_buffer_length: int + max_buffer_wait: int + max_workers: int + + class BufferManager: """Buffer manager. @@ -35,11 +42,10 @@ class BufferManager: if you wanted to expose the state across more locations in the application. """ - def __init__(self, flags: dict[str, str]) -> None: - # Flags are safely extracted and default arguments are used. - self.__max_buffer_length = int(flags.get("max_buffer_length", 8)) - self.__max_buffer_wait = int(flags.get("max_buffer_wait", 1)) - self.__max_workers = int(flags.get("max_workers", 8)) + def __init__(self, flags: Flags) -> None: + self.__max_buffer_length = flags["max_buffer_length"] + self.__max_buffer_wait = flags["max_buffer_wait"] + self.__max_workers = flags["max_workers"] self.__last_flushed_at = time.time() @@ -102,7 +108,7 @@ def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: return None -def init(flags: dict[str, str]) -> Model[ProcessedRecordingMessage]: +def init(flags: Flags) -> Model[ProcessedRecordingMessage]: """Return the initial state of the application.""" buffer = BufferManager(flags) return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py new file mode 100644 index 00000000000000..036ee4868342ad --- /dev/null +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -0,0 +1,26 @@ +from collections.abc import Mapping + +from arroyo.backends.kafka.consumer import KafkaPayload +from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory +from arroyo.types import Commit as ArroyoCommit +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.consumer import Flags, recording_runtime +from sentry.replays.consumers.buffered.platform import PlatformStrategy + + +class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): + + def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: int) -> None: + self.flags: Flags = { + "max_buffer_length": max_buffer_length, + "max_buffer_wait": max_buffer_wait, + "max_workers": max_workers, + } + + def 
create_with_partitions( + self, + commit: ArroyoCommit, + partitions: Mapping[Partition, int], + ) -> ProcessingStrategy[KafkaPayload]: + return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_runtime) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 2b69094f109aa9..9c464415465ad1 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -14,6 +14,7 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, Commit, + Flags, Join, Nothing, Poll, @@ -78,8 +79,8 @@ def process( def init( - init_fn: Callable[[dict[str, str]], Model[Item]], - flags: dict[str, str], + init_fn: Callable[[Flags], Model[Item]], + flags: Flags, ) -> tuple[Model[Item], Cmd[Msg[Item]]]: return (init_fn(flags), Nothing()) @@ -119,9 +120,9 @@ def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: def buffering_runtime( - init_fn: Callable[[dict[str, str]], Model[Item]], + init_fn: Callable[[Flags], Model[Item]], process_fn: Callable[[bytes], Item | None], -) -> RunTime[Model[Item], Msg[Item]]: +) -> RunTime[Model[Item], Msg[Item], Flags]: return RunTime( init=partial(init, init_fn), process=partial(process, process_fn), diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index fda10dfa8579f9..f1819d6ee7fe7c 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -4,36 +4,18 @@ from typing import Generic, TypeVar from arroyo.backends.kafka.consumer import KafkaPayload -from arroyo.processing.strategies import ( - CommitOffsets, - ProcessingStrategy, - ProcessingStrategyFactory, -) +from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy from arroyo.types import Commit as ArroyoCommit from arroyo.types import Message, Partition, Value -class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]): - - def __init__(self, flags: dict[str, str], runtime: "RunTime[Model, Msg]") -> None: - self.flags = flags - self.runtime = runtime - - def create_with_partitions( - self, - commit: ArroyoCommit, - partitions: Mapping[Partition, int], - ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=self.runtime) - - class PlatformStrategy(ProcessingStrategy[KafkaPayload]): def __init__( self, commit: ArroyoCommit, - flags: dict[str, str], - runtime: "RunTime[Model, Msg]", + flags: "Flags", + runtime: "RunTime[Model, Msg, Flags]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. @@ -80,6 +62,8 @@ def join(self, timeout: float | None = None) -> None: # and optionally issue commands to the RunTime. Msg = TypeVar("Msg") +Flags = TypeVar("Flags") + @dataclass(frozen=True) class Commit(Generic[Msg]): @@ -145,7 +129,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg]): +class RunTime(Generic[Model, Msg, Flags]): """RunTime object. 
The RunTime is an intermediate data structure which manages communication between the platform @@ -157,7 +141,7 @@ class RunTime(Generic[Model, Msg]): def __init__( self, - init: Callable[[dict[str, str]], tuple[Model, Cmd[Msg] | None]], + init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], @@ -181,7 +165,7 @@ def model(self) -> Model: assert self.__model is not None return self.__model - def setup(self, flags: dict[str, str], commit: CommitProtocol) -> None: + def setup(self, flags: Flags, commit: CommitProtocol) -> None: # NOTE: Could this be a factory function that produces RunTimes instead? That way we # don't need the assert checks on model and commit. self.__commit = commit From 17eb0118857bf955ecb090da924e8fe7e8a19041 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Thu, 27 Feb 2025 19:55:13 -0600 Subject: [PATCH 33/49] Adopt buffered strategy in callsite --- bin/mock-replay-recording | 75 ++++++++++++++++++++++++++++++++ src/sentry/consumers/__init__.py | 25 +++-------- 2 files changed, 81 insertions(+), 19 deletions(-) create mode 100755 bin/mock-replay-recording diff --git a/bin/mock-replay-recording b/bin/mock-replay-recording new file mode 100755 index 00000000000000..dd4fd11c9d577d --- /dev/null +++ b/bin/mock-replay-recording @@ -0,0 +1,75 @@ +#!/usr/bin/env python +""". + +Helpful commands: + + - Check if offsets are committed correctly. + - `docker exec -it kafka-kafka-1 kafka-get-offsets --bootstrap-server localhost:9092 --topic ingest-replay-recordings` + - Returns in the form of :: +""" +from sentry.runner import configure + +configure() +import logging +import os +import time +import uuid + +import click +from arroyo import Topic as ArroyoTopic +from arroyo.backends.kafka import KafkaPayload, KafkaProducer, build_kafka_configuration +from sentry_kafka_schemas.codecs import Codec +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sentry.conf.server") + +import django + +django.setup() + +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition + +logger = logging.getLogger(__name__) + + +def get_producer() -> KafkaProducer: + cluster_name = get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["cluster"] + producer_config = get_kafka_producer_cluster_options(cluster_name) + return KafkaProducer(build_kafka_configuration(default_config=producer_config)) + + +RECORDING_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) + + +@click.command() +@click.option("--organization-id", type=int, required=True, help="Organization ID") +@click.option("--project-id", type=int, required=True, help="Project ID") +def main(organization_id: int, project_id: int) -> None: + """Produce a mock uptime result message to the INGEST_REPLAYS_RECORDINGS topic.""" + message: ReplayRecording = { + "key_id": None, + "org_id": organization_id, + "payload": b'{"segment_id":10}\n[]', + "project_id": project_id, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + + producer = get_producer() + topic = 
get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["real_topic_name"] + payload = KafkaPayload(None, RECORDING_CODEC.encode(message), []) + + producer.produce(ArroyoTopic(topic), payload) + producer.close() + + logger.info("Successfully produced message to %s", topic) + + +if __name__ == "__main__": + main() diff --git a/src/sentry/consumers/__init__.py b/src/sentry/consumers/__init__.py index f3e5aa93484434..785a2c452877e5 100644 --- a/src/sentry/consumers/__init__.py +++ b/src/sentry/consumers/__init__.py @@ -84,24 +84,11 @@ def ingest_replay_recordings_options() -> list[click.Option]: def ingest_replay_recordings_buffered_options() -> list[click.Option]: """Return a list of ingest-replay-recordings-buffered options.""" - options = [ - click.Option( - ["--max-buffer-message-count", "max_buffer_message_count"], - type=int, - default=100, - ), - click.Option( - ["--max-buffer-size-in-bytes", "max_buffer_size_in_bytes"], - type=int, - default=2_500_000, - ), - click.Option( - ["--max-buffer-time-in-seconds", "max_buffer_time_in_seconds"], - type=int, - default=1, - ), + return [ + click.Option(["--max-buffer-length", "max_buffer_length"], type=int, default=8), + click.Option(["--max-buffer-wait", "max_buffer_wait"], type=int, default=1), + click.Option(["--max-workers", "max_workers"], type=int, default=8), ] - return options def ingest_monitors_options() -> list[click.Option]: @@ -269,8 +256,8 @@ def ingest_transactions_options() -> list[click.Option]: }, "ingest-replay-recordings": { "topic": Topic.INGEST_REPLAYS_RECORDINGS, - "strategy_factory": "sentry.replays.consumers.recording.ProcessReplayRecordingStrategyFactory", - "click_options": ingest_replay_recordings_options(), + "strategy_factory": "sentry.replays.consumers.buffered.factory.PlatformStrategyFactory", + "click_options": ingest_replay_recordings_buffered_options(), }, "ingest-replay-recordings-buffered": { "topic": Topic.INGEST_REPLAYS_RECORDINGS, From 0c79eac10326f8ca7651316b9cbc191ec7b1e6b2 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:29:14 -0600 Subject: [PATCH 34/49] Add script for mocking recordings --- bin/mock-replay-recording | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/mock-replay-recording b/bin/mock-replay-recording index dd4fd11c9d577d..2f2d27109ace96 100755 --- a/bin/mock-replay-recording +++ b/bin/mock-replay-recording @@ -3,9 +3,10 @@ Helpful commands: + - Run the consumer. + - `sentry run consumer ingest-replay-recordings --consumer-group 0` - Check if offsets are committed correctly. 
- - `docker exec -it kafka-kafka-1 kafka-get-offsets --bootstrap-server localhost:9092 --topic ingest-replay-recordings` - - Returns in the form of :: + - `docker exec -it kafka-kafka-1 kafka-consumer-groups --bootstrap-server localhost:9092 --describe --group 0` """ from sentry.runner import configure @@ -50,7 +51,7 @@ def main(organization_id: int, project_id: int) -> None: message: ReplayRecording = { "key_id": None, "org_id": organization_id, - "payload": b'{"segment_id":10}\n[]', + "payload": b'{"segment_id"', "project_id": project_id, "received": int(time.time()), "replay_event": None, From 30ed5d645bcb5d6f72b11090d86f26b863daedee Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:30:09 -0600 Subject: [PATCH 35/49] Add handling for appending offsets when the message is not buffered --- src/sentry/replays/consumers/buffered/lib.py | 37 ++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index 9c464415465ad1..fa6d7f4b0dded8 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -43,6 +43,11 @@ class Append(Generic[Item]): offset: MutableMapping[Partition, int] +@dataclass(frozen=True) +class AppendOffset: + offset: MutableMapping[Partition, int] + + class Committed: """The platform committed offsets. Our buffer is now completely done.""" @@ -61,8 +66,12 @@ class Polled: pass +class TryFlush: + pass + + # A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append[Item] | Committed | Flush | Polled +Msg = Append[Item] | AppendOffset | Committed | Flush | Polled | TryFlush def process( @@ -70,12 +79,12 @@ def process( model: Model[Item], message: bytes, offset: MutableMapping[Partition, int], -) -> Msg[Item] | None: +) -> Msg[Item]: item = process_fn(message) if item: return Append(item=item, offset=offset) else: - return None + return AppendOffset(offset=offset) def init( @@ -90,26 +99,26 @@ def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Ite case Append(item=item, offset=offset): model.buffer.append(item) model.offsets.update(offset) - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) + return (model, Task(msg=TryFlush())) + case AppendOffset(offset=offset): + model.offsets.update(offset) + return (model, Task(msg=TryFlush())) + case Committed(): + return (model, None) case Flush(): - # What should happen if we fail? If you raise an exception the platform will restart - # from the last checkpoint -- which is standard behavior. We could be more clever here - # and provide error handling facilities or we could accept that this problem gets too - # complicated to reasonably abstract and have developers implement their own buffering - # consumer. 
model.do_flush(model) model.buffer = [] return (model, Commit(msg=Committed(), offsets=model.offsets)) - case Committed(): - return (model, None) case Polled(): if model.can_flush(model): return (model, Task(msg=Flush())) else: return (model, None) + case TryFlush(): + if model.can_flush(model): + return (model, Task(msg=Flush())) + else: + return (model, None) def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: From 52d8616519c380d3f42e161a203adf1e2f2a214f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:45:29 -0600 Subject: [PATCH 36/49] Add commit coverage --- .../replays/unit/consumers/test_recording.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 162dfdf065d633..09b49bda310df9 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -3,10 +3,18 @@ import zlib import msgpack +from arroyo.types import Partition +from arroyo.types import Topic as ArroyoTopic +from sentry_kafka_schemas.codecs import Codec +from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording -from sentry.replays.consumers.buffered.consumer import process_message +from sentry.conf.types.kafka_definition import Topic, get_topic_codec +from sentry.replays.consumers.buffered.consumer import process_message, recording_runtime from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta +from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message + +RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) def test_process_message(): @@ -51,3 +59,33 @@ def test_process_message(): def test_process_message_invalid(): result = process_message(msgpack.packb(b"hello, world!")) assert result is None + + +def test_commit_invalid_message(): + """Assert invalid messages have their offsets staged for commit.""" + mock_commit = MockCommit() + + recording_runtime.setup( + {"max_buffer_length": 100, "max_buffer_wait": 100, "max_workers": 1}, mock_commit + ) + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b"invalid", + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + + message = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message) + + recording_runtime.submit(kafka_message) + assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} + assert mock_commit.commit == {} From b3344007c66dd3e809d364fc4d3f7ea5e39c5633 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 09:49:33 -0600 Subject: [PATCH 37/49] Assert messages are committed regardless of if they're appended to the buffer --- .../sentry/replays/unit/consumers/test_buffering_runtime.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py index 4f68108fc9e332..ce65f9ed076318 100644 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py @@ -178,7 +178,9 @@ def 
test_buffering_runtime_committed_offsets(): # Assert committed offsets are correct. runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1", topic="b")) - runtime.submit(make_kafka_message(b"1", offset=2)) + runtime.submit(make_kafka_message(b"10", topic="b")) + runtime.submit(make_kafka_message(b"100", offset=2)) + assert runtime.model.buffer == [1] + assert runtime.model.offsets == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} runtime.publish("join") assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} From 8618e296c9ff7eff5abe92b5ae7654e31f1fb310 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 10:22:50 -0600 Subject: [PATCH 38/49] Fix typing --- tests/sentry/replays/consumers/test_recording.py | 8 ++------ tests/sentry/replays/unit/consumers/test_recording.py | 9 ++++----- tests/sentry/replays/unit/consumers/test_runtime.py | 8 ++++---- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/tests/sentry/replays/consumers/test_recording.py b/tests/sentry/replays/consumers/test_recording.py index 55ddb7d1cc79c1..627de49ac7046d 100644 --- a/tests/sentry/replays/consumers/test_recording.py +++ b/tests/sentry/replays/consumers/test_recording.py @@ -13,8 +13,7 @@ from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.models.organizationonboardingtask import OnboardingTask, OnboardingTaskStatus -from sentry.replays.consumers.buffered.consumer import recording_runtime -from sentry.replays.consumers.buffered.platform import PlatformStrategyFactory +from sentry.replays.consumers.buffered.factory import PlatformStrategyFactory from sentry.replays.consumers.recording import ProcessReplayRecordingStrategyFactory from sentry.replays.consumers.recording_buffered import ( RecordingBufferedStrategyFactory, @@ -587,7 +586,4 @@ def test_invalid_message(self): class BufferedRunTimeConsumerTestCase(RecordingTestCase): def processing_factory(self): - return PlatformStrategyFactory( - flags={"max_buffer_length": "8", "max_buffer_wait": "1", "max_workers": "8"}, - runtime=recording_runtime, - ) + return PlatformStrategyFactory(max_buffer_length=8, max_buffer_wait=1, max_workers=8) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 09b49bda310df9..fbbb9f50839a2c 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -5,7 +5,6 @@ import msgpack from arroyo.types import Partition from arroyo.types import Topic as ArroyoTopic -from sentry_kafka_schemas.codecs import Codec from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec @@ -14,7 +13,7 @@ from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message -RECORDINGS_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) +RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) def test_process_message(): @@ -72,7 +71,7 @@ def test_commit_invalid_message(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"invalid", + "payload": b"invalid", # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -83,8 +82,8 @@ def test_commit_invalid_message(): "version": 1, } 
- message = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message) + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) recording_runtime.submit(kafka_message) assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index 179d5cf5760471..c8ecfdbefabb64 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -2,7 +2,7 @@ from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message -def counter_runtime() -> RunTime[int, str]: +def counter_runtime() -> RunTime[int, str, None]: def init(_): return (22, None) @@ -42,14 +42,14 @@ def subscription(_): def test_runtime_setup(): runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 def test_runtime_submit(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 # Two incr commands increase the count by 2. @@ -72,7 +72,7 @@ def test_runtime_submit(): def test_runtime_publish(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup({}, commit=MockCommit()) + runtime.setup(None, commit=MockCommit()) assert runtime.model == 22 # A join event updates the model and sets it to -10. From 0346a3c96b57d0c06dbaea582bf5280ea484b72e Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:11:57 -0600 Subject: [PATCH 39/49] Docstrings --- .../replays/consumers/buffered/consumer.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index f554538fa189fc..040298ee917cba 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,4 +1,15 @@ -"""Session Replay recording consumer implementation.""" +"""Session Replay recording consumer implementation. + +This module has two parts. A processing component and a buffer flushing component. The processing +component is straight-forward. It accepts a message and performs some work on it. After it +completes it instructs the runtime to append the message to the buffer. This is abstracted by the +buffering runtime library so we just return the transformed data in this module. + +The second part is the flushing of the buffer. The buffering runtime library has no idea when to +flush this buffer so it constantly asks us if it can flush. We control flushing behavior through a +stateful "BufferManager" class. If we can_flush then we do_flush. After the flush completes the +RunTime will commit the offsets. +""" import contextlib import time @@ -28,18 +39,9 @@ class Flags(TypedDict): class BufferManager: """Buffer manager. - Determines if and when a buffer should flush. The buffer accepts only one argument (`flags`) - which can contain any number of configuration options. Currently we only care about the time - the buffer has been active and the length of the buffer. But we could update this function to - extract an option thats concerned with the bytesize of the buffer if we thought that was a - useful metric for committing. 
- The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its methods as callbacks to the Model. The state contained within the method's instance is implicit - and unknown to the RunTime. We could model this state inside the RunTime but the state is - simple enough that I don't feel the need to over-engineer the buffering RunTime. For more - complex use-cases you would want to formalize state transformations in the RunTime. Especially - if you wanted to expose the state across more locations in the application. + and unknown to the RunTime. """ def __init__(self, flags: Flags) -> None: @@ -50,6 +52,8 @@ def __init__(self, flags: Flags) -> None: self.__last_flushed_at = time.time() def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: + # TODO: time.time is stateful and hard to test. We should enable the RunTime to perform + # managed effects so we can properly test this behavior. return ( len(model.buffer) >= self.__max_buffer_length or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at @@ -58,8 +62,7 @@ def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: with sentry_tracing("replays.consumers.buffered.flush_buffer"): flush_buffer(model, max_workers=self.__max_workers) - # Update the buffer manager with the new time so we don't continuously commit in a - # loop! + # TODO: time.time again. Should be declarative for testing purposes. self.__last_flushed_at = time.time() @@ -78,6 +81,8 @@ def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> N for future in done: exc = future.exception() if exc is not None: + # TODO: Why raise? Can I do something more meaningful here than reject the whole + # batch? Raising is certainly the easiest way of handling failures... raise exc # Recording metadata is not tracked in the threadpool. This is because this function will From 4ab0ff25dc11201d828e196edee24d92bb1cbb26 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:16:47 -0600 Subject: [PATCH 40/49] More docstrings --- src/sentry/replays/consumers/buffered/consumer.py | 3 +++ src/sentry/replays/consumers/buffered/lib.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 040298ee917cba..35a1b450ef90e4 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,5 +1,8 @@ """Session Replay recording consumer implementation. +To understand how the buffering works visit the `lib.py` module and inspect the source of the +buffering runtime. + This module has two parts. A processing component and a buffer flushing component. The processing component is straight-forward. It accepts a message and performs some work on it. After it completes it instructs the runtime to append the message to the buffer. 
This is abstracted by the diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py index fa6d7f4b0dded8..9945c1253407ed 100644 --- a/src/sentry/replays/consumers/buffered/lib.py +++ b/src/sentry/replays/consumers/buffered/lib.py @@ -16,7 +16,6 @@ Commit, Flags, Join, - Nothing, Poll, RunTime, Sub, @@ -45,6 +44,8 @@ class Append(Generic[Item]): @dataclass(frozen=True) class AppendOffset: + """Update the offsets; no item needs to be appended to the buffer.""" + offset: MutableMapping[Partition, int] @@ -67,6 +68,8 @@ class Polled: class TryFlush: + """Instruct the application to flush the buffer if its time.""" + pass @@ -91,7 +94,7 @@ def init( init_fn: Callable[[Flags], Model[Item]], flags: Flags, ) -> tuple[Model[Item], Cmd[Msg[Item]]]: - return (init_fn(flags), Nothing()) + return (init_fn(flags), None) def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: From 77c2971fd6f57ee6899db2425fc7bc26e676809b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 28 Feb 2025 15:19:16 -0600 Subject: [PATCH 41/49] Yet more docstrings --- src/sentry/replays/consumers/buffered/factory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 036ee4868342ad..2d90798f1ee210 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -1,3 +1,9 @@ +"""Session Replay recording consumer strategy factory. + +This module exists solely to abstract the bootstraping process of the application and runtime in +`sentry/consumers/__init__.py`. +""" + from collections.abc import Mapping from arroyo.backends.kafka.consumer import KafkaPayload From b956ad9f42f1378ad54bdcd417f2c3e30f99e3c6 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:06:38 -0600 Subject: [PATCH 42/49] Implement declarative effect management --- .../replays/consumers/buffered/consumer.py | 241 ++++++++----- .../replays/consumers/buffered/factory.py | 4 +- src/sentry/replays/consumers/buffered/lib.py | 143 -------- .../replays/consumers/buffered/platform.py | 102 ++++-- .../unit/consumers/test_buffering_runtime.py | 186 ---------- .../replays/unit/consumers/test_helpers.py | 22 +- .../replays/unit/consumers/test_recording.py | 336 +++++++++++++++--- 7 files changed, 528 insertions(+), 506 deletions(-) delete mode 100644 src/sentry/replays/consumers/buffered/lib.py delete mode 100644 tests/sentry/replays/unit/consumers/test_buffering_runtime.py diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 35a1b450ef90e4..fe5448563e6155 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,37 +1,37 @@ -"""Session Replay recording consumer implementation. - -To understand how the buffering works visit the `lib.py` module and inspect the source of the -buffering runtime. - -This module has two parts. A processing component and a buffer flushing component. The processing -component is straight-forward. It accepts a message and performs some work on it. After it -completes it instructs the runtime to append the message to the buffer. This is abstracted by the -buffering runtime library so we just return the transformed data in this module. - -The second part is the flushing of the buffer. 
The buffering runtime library has no idea when to -flush this buffer so it constantly asks us if it can flush. We control flushing behavior through a -stateful "BufferManager" class. If we can_flush then we do_flush. After the flush completes the -RunTime will commit the offsets. -""" +"""Session Replay recording consumer implementation.""" import contextlib import time +from collections.abc import MutableMapping from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait +from dataclasses import dataclass from typing import TypedDict import sentry_sdk - -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime +from arroyo.types import Partition + +from sentry.replays.consumers.buffered.platform import ( + Cmd, + Commit, + Effect, + Join, + Nothing, + Poll, + RunTime, + Sub, + Task, +) from sentry.replays.usecases.ingest import ( DropSilently, ProcessedRecordingMessage, commit_recording_message, parse_recording_message, process_recording_message, - sentry_tracing, track_recording_metadata, ) +# Types. + class Flags(TypedDict): max_buffer_length: int @@ -39,90 +39,163 @@ class Flags(TypedDict): max_workers: int -class BufferManager: - """Buffer manager. +@dataclass +class Model: + buffer: list[ProcessedRecordingMessage] + last_flushed_at: float + max_buffer_length: int + max_buffer_wait: int + max_workers: int + offsets: MutableMapping[Partition, int] + + +@dataclass(frozen=True) +class Append: + """Append the item to the buffer and update the offsets.""" - The buffer manager is a class instance has a lifetime as long as the RunTime's. We pass its - methods as callbacks to the Model. The state contained within the method's instance is implicit - and unknown to the RunTime. - """ + item: ProcessedRecordingMessage + offset: MutableMapping[Partition, int] - def __init__(self, flags: Flags) -> None: - self.__max_buffer_length = flags["max_buffer_length"] - self.__max_buffer_wait = flags["max_buffer_wait"] - self.__max_workers = flags["max_workers"] - self.__last_flushed_at = time.time() +@dataclass(frozen=True) +class AppendOffset: + """Update the offsets; no item needs to be appended to the buffer.""" - def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool: - # TODO: time.time is stateful and hard to test. We should enable the RunTime to perform - # managed effects so we can properly test this behavior. - return ( - len(model.buffer) >= self.__max_buffer_length - or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at - ) + offset: MutableMapping[Partition, int] - def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None: - with sentry_tracing("replays.consumers.buffered.flush_buffer"): - flush_buffer(model, max_workers=self.__max_workers) - # TODO: time.time again. Should be declarative for testing purposes. - self.__last_flushed_at = time.time() +class Committed: + """The platform committed offsets. Our buffer is now completely done.""" -@sentry_sdk.trace -def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None: - if len(model.buffer) == 0: - return None - with ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = [pool.submit(flush_message, message) for message in model.buffer] +class Flush: + """Our application hit the flush threshold and has been instructed to flush.""" - # Tasks can fail. We check the done set for any failures. We will wait for all the - # futures to complete before running this step or eagerly run this step if any task - # errors. 
- done, _ = wait(futures, return_when=FIRST_EXCEPTION) - for future in done: - exc = future.exception() - if exc is not None: - # TODO: Why raise? Can I do something more meaningful here than reject the whole - # batch? Raising is certainly the easiest way of handling failures... - raise exc - # Recording metadata is not tracked in the threadpool. This is because this function will - # log. Logging will acquire a lock and make our threading less useful due to the speed of - # the I/O we do in this step. - for message in model.buffer: - track_recording_metadata(message) +@dataclass(frozen=True) +class Flushed: + """Our application successfully flushed.""" - return None + now: float -@sentry_sdk.trace -def flush_message(message: ProcessedRecordingMessage) -> None: - with contextlib.suppress(DropSilently): - commit_recording_message(message) +@dataclass(frozen=True) +class TryFlush: + """Instruct the application to flush the buffer if its time.""" + + now: float -def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None: - """Message processing function. +# A "Msg" is the union of all application messages our RunTime will accept. +Msg = Append | AppendOffset | Committed | Flush | Flushed | TryFlush - Accepts an unstructured type and returns a structured one. Other than tracing the goal is to - have no I/O here. We'll commit the I/O on flush. - """ - with sentry_tracing("replays.consumers.buffered.process_message"): - with contextlib.suppress(DropSilently): - message = parse_recording_message(message_bytes) - return process_recording_message(message) - return None +# State machine functions. -def init(flags: Flags) -> Model[ProcessedRecordingMessage]: - """Return the initial state of the application.""" - buffer = BufferManager(flags) - return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={}) +def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: + return ( + Model( + buffer=[], + last_flushed_at=time.time(), + max_buffer_wait=flags["max_buffer_wait"], + max_workers=flags["max_workers"], + max_buffer_length=flags["max_buffer_length"], + offsets={}, + ), + None, + ) -recording_runtime = buffering_runtime( - init_fn=init, - process_fn=process_message, + +@sentry_sdk.trace +def process(_: Model, message: bytes, offset: MutableMapping[Partition, int]) -> Msg | None: + try: + item = process_recording_message(parse_recording_message(message)) + return Append(item=item, offset=offset) + except Exception: + return AppendOffset(offset=offset) + + +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg] | None]: + match msg: + case Append(item=item, offset=offset): + model.buffer.append(item) + model.offsets.update(offset) + return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) + case AppendOffset(offset=offset): + model.offsets.update(offset) + return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) + case Committed(): + return (model, None) + case Flush(): + return (model, Effect(fun=FlushBuffer(model), msg=lambda now: Flushed(now=now))) + case Flushed(now=now): + model.buffer = [] + model.last_flushed_at = now + return (model, Commit(msg=Committed(), offsets=model.offsets)) + case TryFlush(now=now): + return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) + + +def subscription(model: Model) -> list[Sub[Msg]]: + return [ + Join(msg=Flush), + Poll(msg=lambda: TryFlush(now=time.time())), + ] + + +# Helpers. 
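The `update` function above returns `Effect(fun=time.time, msg=lambda now: TryFlush(now=now))` instead of calling `time.time()` directly. Describing the clock as data lets the runtime decide when, or whether, the effect runs, which is what makes the flush timing testable. A minimal, self-contained sketch of that idea follows; the `Effect` and `TryFlush` shapes mirror the definitions in this patch, everything else is illustrative:

import time
from dataclasses import dataclass
from typing import Any, Callable


@dataclass(frozen=True)
class Effect:
    fun: Callable[[], Any]     # the side effect to run, e.g. time.time
    msg: Callable[[Any], Any]  # wraps the effect's result in an application message


@dataclass(frozen=True)
class TryFlush:
    now: float


def interpret(cmd: Effect) -> Any:
    # A production interpreter actually runs the effect and feeds the result back in.
    return cmd.msg(cmd.fun())


cmd = Effect(fun=time.time, msg=lambda now: TryFlush(now=now))
assert isinstance(interpret(cmd), TryFlush)

# A test never has to run `fun`; it can feed any timestamp it likes through `msg`.
assert cmd.msg(123.0) == TryFlush(now=123.0)

The unit tests later in this series lean on exactly this: they send hand-picked `TryFlush(now=...)` values into the runtime instead of waiting on a real clock.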
+ + +def can_flush(model: Model, now: float) -> bool: + return ( + len(model.buffer) >= model.max_buffer_length + or (now - model.max_buffer_wait) >= model.last_flushed_at + ) + + +@dataclass(frozen=True) +class FlushBuffer: + model: Model + + def __call__(self) -> float: + @sentry_sdk.trace + def flush_message(message: ProcessedRecordingMessage) -> None: + with contextlib.suppress(DropSilently): + commit_recording_message(message) + + if len(self.model.buffer) == 0: + return time.time() + + with ThreadPoolExecutor(max_workers=self.model.max_workers) as pool: + futures = [pool.submit(flush_message, message) for message in self.model.buffer] + + # Tasks can fail. We check the done set for any failures. We will wait for all the + # futures to complete before running this step or eagerly run this step if any task + # errors. + done, _ = wait(futures, return_when=FIRST_EXCEPTION) + for future in done: + exc = future.exception() + # Raising preserves the existing behavior. We can address error handling in a + # follow up. + if exc is not None and not isinstance(exc, DropSilently): + raise exc + + # Recording metadata is not tracked in the threadpool. This is because this function will + # log. Logging will acquire a lock and make our threading less useful due to the speed of + # the I/O we do in this step. + for message in self.model.buffer: + track_recording_metadata(message) + + return time.time() + + +# Consumer. + + +recording_consumer = RunTime( + init=init, + process=process, + subscription=subscription, + update=update, ) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 2d90798f1ee210..56566d449d2053 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -11,7 +11,7 @@ from arroyo.types import Commit as ArroyoCommit from arroyo.types import Partition -from sentry.replays.consumers.buffered.consumer import Flags, recording_runtime +from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer from sentry.replays.consumers.buffered.platform import PlatformStrategy @@ -29,4 +29,4 @@ def create_with_partitions( commit: ArroyoCommit, partitions: Mapping[Partition, int], ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_runtime) + return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_consumer) diff --git a/src/sentry/replays/consumers/buffered/lib.py b/src/sentry/replays/consumers/buffered/lib.py deleted file mode 100644 index 9945c1253407ed..00000000000000 --- a/src/sentry/replays/consumers/buffered/lib.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Buffered RunTime implementation. - -The Buffering RunTime eagerly processes messages as they are received and waits until its buffer -is full before flushing those messages. The goal being to achieve efficiencies from batched I/O. 
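`FlushBuffer` above fans the buffered uploads out over a bounded thread pool and surfaces the first failure. A stripped-down sketch of that pattern, with a placeholder `upload` standing in for the real per-message I/O (illustrative only, not the patch's implementation):

from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait


def upload(item: bytes) -> None:
    """Placeholder for the real per-message commit I/O."""


def flush_all(items: list[bytes], max_workers: int) -> None:
    if not items:
        return

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(upload, item) for item in items]

        # Return as soon as every task finishes or any task raises, then re-raise the
        # first observed error so the caller can decide what to do with the batch.
        done, _ = wait(futures, return_when=FIRST_EXCEPTION)
        for future in done:
            exc = future.exception()
            if exc is not None:
                raise exc


flush_all([b"a", b"b"], max_workers=2)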
-""" - -from collections.abc import Callable, MutableMapping -from dataclasses import dataclass -from functools import partial -from typing import Generic, TypeVar - -from arroyo.types import Partition - -from sentry.replays.consumers.buffered.platform import ( - Cmd, - Commit, - Flags, - Join, - Poll, - RunTime, - Sub, - Task, -) - -Item = TypeVar("Item") -T = TypeVar("T") - - -@dataclass -class Model(Generic[Item]): - buffer: list[Item] - can_flush: Callable[["Model[Item]"], bool] - do_flush: Callable[["Model[Item]"], None] - offsets: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class Append(Generic[Item]): - """Append the item to the buffer and update the offsets.""" - - item: Item - offset: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class AppendOffset: - """Update the offsets; no item needs to be appended to the buffer.""" - - offset: MutableMapping[Partition, int] - - -class Committed: - """The platform committed offsets. Our buffer is now completely done.""" - - pass - - -class Flush: - """Our application hit the flush threshold and has been instructed to flush.""" - - pass - - -class Polled: - """Our application was polled by the platform.""" - - pass - - -class TryFlush: - """Instruct the application to flush the buffer if its time.""" - - pass - - -# A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append[Item] | AppendOffset | Committed | Flush | Polled | TryFlush - - -def process( - process_fn: Callable[[bytes], Item | None], - model: Model[Item], - message: bytes, - offset: MutableMapping[Partition, int], -) -> Msg[Item]: - item = process_fn(message) - if item: - return Append(item=item, offset=offset) - else: - return AppendOffset(offset=offset) - - -def init( - init_fn: Callable[[Flags], Model[Item]], - flags: Flags, -) -> tuple[Model[Item], Cmd[Msg[Item]]]: - return (init_fn(flags), None) - - -def update(model: Model[Item], msg: Msg[Item]) -> tuple[Model[Item], Cmd[Msg[Item]] | None]: - match msg: - case Append(item=item, offset=offset): - model.buffer.append(item) - model.offsets.update(offset) - return (model, Task(msg=TryFlush())) - case AppendOffset(offset=offset): - model.offsets.update(offset) - return (model, Task(msg=TryFlush())) - case Committed(): - return (model, None) - case Flush(): - model.do_flush(model) - model.buffer = [] - return (model, Commit(msg=Committed(), offsets=model.offsets)) - case Polled(): - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) - case TryFlush(): - if model.can_flush(model): - return (model, Task(msg=Flush())) - else: - return (model, None) - - -def subscription(model: Model[Item]) -> list[Sub[Msg[Item]]]: - return [ - Join(msg=Flush()), - Poll(msg=Polled()), - ] - - -def buffering_runtime( - init_fn: Callable[[Flags], Model[Item]], - process_fn: Callable[[bytes], Item | None], -) -> RunTime[Model[Item], Msg[Item], Flags]: - return RunTime( - init=partial(init, init_fn), - process=partial(process, process_fn), - subscription=subscription, - update=update, - ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index f1819d6ee7fe7c..6e92252e5f684a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -1,7 +1,7 @@ -from collections.abc import Callable, Mapping, MutableMapping +from collections.abc import Callable, MutableMapping from dataclasses import dataclass from datetime import datetime -from 
typing import Generic, TypeVar +from typing import Any, Generic, TypeVar, cast from arroyo.backends.kafka.consumer import KafkaPayload from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy @@ -25,7 +25,7 @@ def __init__( self.__closed = False self.__runtime = runtime - def handle_commit_request(self, offsets: Mapping[Partition, int]) -> None: + def handle_commit_request(self, offsets: MutableMapping[Partition, int]) -> None: self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) def submit(self, message: Message[KafkaPayload]) -> None: @@ -51,7 +51,7 @@ def join(self, timeout: float | None = None) -> None: self.__commit_step.join(timeout) -CommitProtocol = Callable[[Mapping[Partition, int]], None] +CommitProtocol = Callable[[MutableMapping[Partition, int]], None] # A Model represents the state of your application. It is a type variable and the RunTime is # generic over it. Your state can be anything from a simple integer to a large class with many @@ -62,8 +62,12 @@ def join(self, timeout: float | None = None) -> None: # and optionally issue commands to the RunTime. Msg = TypeVar("Msg") +# A generic type representing the structure of the flags passed to the RunTime instance. Flags = TypeVar("Flags") +# A generic type of unknown origins not tied to anything in the platform. +T = TypeVar("T") + @dataclass(frozen=True) class Commit(Generic[Msg]): @@ -78,10 +82,22 @@ class Commit(Generic[Msg]): offsets: MutableMapping[Partition, int] -class Nothing: - """Instructs the RunTime to do nothing. Equivalent to a null command.""" +@dataclass(frozen=True) +class Effect(Generic[Msg]): + """Instructs the RunTime to perform a managed effect. - pass + If the RunTime performs an effect for the application it means the RunTime can dictate if the + effect blocks the application, if the effect executes at all, or perform any additional + operations before or after the effect. This has significant implications for RunTime + performance and application testability. + """ + + fun: Callable[[], Any] + msg: Callable[[Any], Msg] + + +class Nothing: + """Instructs the RunTime to do nothing.""" @dataclass(frozen=True) @@ -94,7 +110,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Nothing | Task[Msg] +Cmd = Commit[Msg] | Effect[Msg] | Task[Msg] @dataclass(frozen=True) @@ -106,7 +122,7 @@ class Join(Generic[Msg]): events and handle them in the way they see fit. """ - msg: Msg + msg: Callable[[], Msg] name = "join" @@ -118,7 +134,7 @@ class Poll(Generic[Msg]): these events and choose to act on them. 
""" - msg: Msg + msg: Callable[[], Msg] name = "poll" @@ -142,7 +158,7 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes, Mapping[Partition, int]], Msg | None], + process: Callable[[Model, bytes, MutableMapping[Partition, int]], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: @@ -151,41 +167,47 @@ def __init__( self.subscription = subscription self.update = update - self.__commit: CommitProtocol | None = None - self.__model: Model | None = None - self.__subscriptions: dict[str, Sub[Msg]] = {} + self._commit: CommitProtocol | None = None + self._model: Model | None = None + self._subscriptions: dict[str, Sub[Msg]] = {} @property def commit(self) -> CommitProtocol: - assert self.__commit is not None - return self.__commit + assert self._commit is not None + return self._commit @property def model(self) -> Model: - assert self.__model is not None - return self.__model + assert self._model is not None + return self._model + # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need + # the assert checks on model and commit. def setup(self, flags: Flags, commit: CommitProtocol) -> None: - # NOTE: Could this be a factory function that produces RunTimes instead? That way we - # don't need the assert checks on model and commit. - self.__commit = commit + self._commit = commit model, cmd = self.init(flags) - self.__model = model - self.__handle_cmd(cmd) - self.__register_subscriptions() + self._model = model + self._handle_cmd(cmd) + self._register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: - self.__handle_msg(self.process(self.model, message.payload.value, message.committable)) - - def publish(self, sub_name: str) -> None: + self._handle_msg( + self.process( + self.model, + message.payload.value, + cast(MutableMapping[Partition, int], message.committable), + ) + ) + + def publish(self, sub_name: str): # For each new subscription event we re-register the subscribers in case anything within # the application has changed. I.e. the model is in some new state and that means we care # about a new subscription or don't care about an old one. - self.__register_subscriptions() + self._register_subscriptions() # Using the subscription's name look for the subscription in the registry. - sub = self.__subscriptions.get(sub_name) + sub = self._subscriptions.get(sub_name) if sub is None: return None @@ -194,29 +216,31 @@ def publish(self, sub_name: str) -> None: # subscriptions might do more complex things. 
match sub: case Join(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg()) case Poll(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg()) - def __handle_msg(self, msg: Msg | None) -> None: + def _handle_msg(self, msg: Msg | None) -> None: if msg: model, cmd = self.update(self.model, msg) - self.__model = model - self.__handle_cmd(cmd) + self._model = model + self._handle_cmd(cmd) - def __handle_cmd(self, cmd: Cmd[Msg] | None) -> None: + def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: if cmd is None: return None match cmd: case Commit(msg=msg, offsets=offsets): self.commit(offsets) - return self.__handle_msg(msg) + return self._handle_msg(msg) + case Effect(msg=msg, fun=fun): + return self._handle_msg(msg(fun())) case Nothing(): return None case Task(msg=msg): - return self.__handle_msg(msg) + return self._handle_msg(msg) - def __register_subscriptions(self) -> None: + def _register_subscriptions(self) -> None: for sub in self.subscription(self.model): - self.__subscriptions[sub.name] = sub + self._subscriptions[sub.name] = sub diff --git a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py b/tests/sentry/replays/unit/consumers/test_buffering_runtime.py deleted file mode 100644 index ce65f9ed076318..00000000000000 --- a/tests/sentry/replays/unit/consumers/test_buffering_runtime.py +++ /dev/null @@ -1,186 +0,0 @@ -import pytest -from arroyo.types import Partition, Topic - -from sentry.replays.consumers.buffered.lib import Model, buffering_runtime -from tests.sentry.replays.unit.consumers.test_helpers import ( - MockCommit, - MockSink, - make_kafka_message, -) - - -def buffer_runtime(sink): - return buffering_runtime( - init_fn=lambda _: Model( - # The buffer is an arbitrary list of items but it could be anything. For example a - # string which you could perpetually append to. - buffer=[], - # Commit when the buffer has 5 messages in it. - can_flush=lambda m: len(m.buffer) == 5, - # Flush in this case is just pushing the buffer into a mock class which holds the - # messages in memory. This is easy for us to test and we don't need to mock out any - # service code which is both unknown to the RunTime and can be tested independently. - do_flush=lambda m: sink.accept(m.buffer), - # This is the offsets tracking object. The RunTime manages appending. - offsets={}, - ), - # Our process function is a simple lambda which checks that the message is a valid - # integer and is less than 5. There's no exception handling here. That could crash the - # consumer! Its up to the application developer to manage. - process_fn=lambda m: int(m) if int(m) < 5 else None, - ) - - -def test_runtime_setup(): - """Test RunTime initialization.""" - runtime = buffer_runtime(MockSink()) - runtime.setup({}, commit=MockCommit()) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - - -def test_buffering_runtime_submit(): - """Test message recived from the platform. - - Messages are buffered up until a point (see the RunTime definition at the top of the module). - When we cross the threshold the RunTime should order a buffer flush action. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert three valid messages buffered. 
- runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert two invalid messages not buffered. - runtime.submit(make_kafka_message(b"5")) - runtime.submit(make_kafka_message(b"5")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert an additional message is buffered. - runtime.submit(make_kafka_message(b"2")) - assert runtime.model.buffer == [1, 1, 1, 2] - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert the buffer is flushed to the sink. - runtime.submit(make_kafka_message(b"3")) - assert runtime.model.buffer == [] - assert sink.accepted == [1, 1, 1, 2, 3] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_submit_invalid_message(): - """Test invalid message recived from the platform.""" - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert exceptional behavior is not handled. - with pytest.raises(ValueError): - runtime.submit(make_kafka_message(b"hello")) - - -def test_buffering_runtime_join(): - """Test the RunTime received a join message from the platform. - - When the consumer is ordered to shutdown a "join" message will be submitted to the RunTime. - When that happens we rush to flush the buffer and commit whatever we can. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - - # Assert three valid messages buffered. - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"1")) - assert runtime.model.buffer == [1, 1, 1] - assert sink.accepted == [] - - # Assert join subscriptions eagerly flush the buffer. - runtime.publish("join") - assert runtime.model.buffer == [] - assert sink.accepted == [1, 1, 1] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_poll(): - """Test the RunTime received a poll message from the platform. - - This will happen periodically. In the event the consumer does not receive messages for some - interval the poll message will be published will be called. - """ - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert poll does not automatically empty the buffer. - runtime.submit(make_kafka_message(b"1")) - runtime.publish("poll") - assert runtime.model.buffer == [1] - assert sink.accepted == [] - - # Dirty mutation warning! We're forcing ourselves to be in a flushable state by mutating the - # state outside the explicit state machine interfaces. - runtime.model.buffer = [1, 2, 3, 4, 5] - assert runtime.model.buffer == [1, 2, 3, 4, 5] - assert sink.accepted == [] - - # Assert poll flushes if the buffer is ready. 
- runtime.publish("poll") - assert runtime.model.buffer == [] - assert sink.accepted == [1, 2, 3, 4, 5] - assert mock_commit.commit == {Partition(Topic("a"), 1): 2} - - -def test_buffering_runtime_committed_offsets(): - """Test offset commit.""" - mock_commit = MockCommit() - sink = MockSink() - - runtime = buffer_runtime(sink) - runtime.setup({}, mock_commit) - assert runtime.model.buffer == [] - assert runtime.model.offsets == {} - assert sink.accepted == [] - assert mock_commit.commit == {} - - # Assert committed offsets are correct. - runtime.submit(make_kafka_message(b"1")) - runtime.submit(make_kafka_message(b"10", topic="b")) - runtime.submit(make_kafka_message(b"100", offset=2)) - assert runtime.model.buffer == [1] - assert runtime.model.offsets == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} - runtime.publish("join") - assert mock_commit.commit == {Partition(Topic("a"), 1): 3, Partition(Topic("b"), 1): 2} diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 3543885e55b07b..79663b114e78ce 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,9 +1,12 @@ -from collections.abc import Mapping +from collections.abc import Mapping, MutableMapping from datetime import datetime +from typing import cast from arroyo.backends.kafka import KafkaPayload from arroyo.types import BrokerValue, Message, Partition, Topic +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, RunTime + class MockCommit: def __init__(self): @@ -13,6 +16,23 @@ def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> Non self.commit.update(offsets) +class MockRunTime(RunTime[Model, Msg, Flags]): + def _handle_msg(self, msg): + while True: + model, cmd = self.update(self.model, msg) + self._model = model + msg = yield cmd + + def submit(self, message): + yield from self._handle_msg( + self.process( + self.model, + message.payload.value, + cast(MutableMapping[Partition, int], message.committable), + ) + ) + + class MockSink: def __init__(self): self.accepted = [] diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index fbbb9f50839a2c..b4ad74b8b2c062 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -2,76 +2,150 @@ import uuid import zlib -import msgpack from arroyo.types import Partition from arroyo.types import Topic as ArroyoTopic from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec -from sentry.replays.consumers.buffered.consumer import process_message, recording_runtime +from sentry.replays.consumers.buffered.consumer import ( + Committed, + Flush, + FlushBuffer, + TryFlush, + init, + process, + subscription, + update, +) +from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import ( + MockCommit, + MockRunTime, + make_kafka_message, +) RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) -def test_process_message(): - result = 
process_message( - msgpack.packb( - { - "type": "replay_recording_not_chunked", - "replay_id": uuid.uuid4().hex, - "org_id": 1, - "key_id": 3, - "project_id": 2, - "received": int(time.time()), - "retention_days": 30, - "payload": b'{"segment_id":0}\n[]', - "replay_event": None, - "replay_video": None, - } +def test_end_to_end_message_processing(): + """End to end test of the recording consumer.""" + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + gen = runtime.submit(kafka_message) + + # Assert the runtime gets the current time after appending the message and then attempts to + # flush the buffer with the current time. + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + assert runtime.model.buffer == [ + ProcessedRecordingMessage( + actions_event=ParsedEventMeta([], [], [], [], [], []), + filedata=zlib.compress(b"[]"), + filename=runtime.model.buffer[0].filename, + is_replay_video=False, + key_id=None, + org_id=1, + project_id=1, + received=message["received"], + recording_size_uncompressed=2, + recording_size=runtime.model.buffer[0].recording_size, + retention_days=30, + replay_id=message["replay_id"], + segment_id=0, + video_size=None, + replay_event=None, ) - ) + ] - expected_event_metadata = ParsedEventMeta([], [], [], [], [], []) - assert result is not None - assert result == ProcessedRecordingMessage( - actions_event=expected_event_metadata, - filedata=zlib.compress(b"[]"), - filename=result.filename, - is_replay_video=False, - key_id=3, - org_id=1, - project_id=2, - received=result.received, - recording_size_uncompressed=2, - recording_size=result.recording_size, - retention_days=30, - replay_id=result.replay_id, - segment_id=0, - video_size=None, - replay_event=None, - ) + # Give the runtime timestamps that are too early to flush (including the one the runtime wanted + # to generate). + assert isinstance(gen.send(TryFlush(now=1)), Nothing) + assert isinstance(gen.send(TryFlush(now=2)), Nothing) + assert isinstance(gen.send(TryFlush(now=3)), Nothing) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + # Give the runtime a timestamp that will trigger a flush. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) -def test_process_message_invalid(): - result = process_message(msgpack.packb(b"hello, world!")) - assert result is None + # Assert the runtime triggered a buffer flush and forward the next msg to the runtime. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + assert len(runtime.model.buffer) == 1 + # Assert the successful flush triggers a commit command. Assert the model's offsets were + # committed and the buffer was reset. 
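These tests drive the state machine through a generator protocol: `MockRunTime.submit` yields every command the application issues, and the test answers with `gen.send(...)`, so no real effect ever executes. A toy sketch of that hand-shake, not the real `MockRunTime`, which wraps the runtime's own `process` and `update` functions:

def drive(update, model, msg):
    """Yield each command `update` produces; the caller sends back the next message."""
    while True:
        model, cmd = update(model, msg)
        msg = yield cmd


def update(model, msg):
    # A toy update: count "tick" messages and ask the driver for the time each time.
    if msg == "tick":
        return model + 1, "what-time-is-it"
    return model, None


gen = drive(update, 0, "tick")
assert next(gen) == "what-time-is-it"          # run until the first command is yielded
assert gen.send("tick") == "what-time-is-it"   # reply with a message, receive the next command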
+ cmd = gen.send(cmd.msg(1)) + assert len(runtime.model.buffer) == 0 + assert runtime.model.last_flushed_at == 1 + assert isinstance(cmd, Commit) + assert isinstance(cmd.msg, Committed) + assert cmd.offsets == runtime.model.offsets + assert cmd.offsets == {Partition(topic=ArroyoTopic(name="a"), index=1): 2} -def test_commit_invalid_message(): - """Assert invalid messages have their offsets staged for commit.""" - mock_commit = MockCommit() - recording_runtime.setup( - {"max_buffer_length": 100, "max_buffer_wait": 100, "max_workers": 1}, mock_commit - ) +def test_invalid_message_format(): + """Test message with invalid message format.""" + runtime = _make_runtime() + + # We submit a message which can't be parsed and will not be buffered. Flush is not triggered. + gen = runtime.submit(make_kafka_message(b"invalid")) + cmd = next(gen) + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. + cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_invalid_recording_json(): + """Test message with invalid recording JSON.""" + runtime = _make_runtime() message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"invalid", # type: ignore[typeddict-item] + "payload": b'{"segment_id":0}\n[', "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -81,10 +155,170 @@ def test_commit_invalid_message(): "type": "replay_recording_not_chunked", "version": 1, } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will not be buffered. Flush is not triggered. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert len(runtime.model.buffer) == 1 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. 
+ cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_missing_headers(): + """Test message with missing headers.""" + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b"[]", + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } message_bytes = RECORDINGS_CODEC.encode(message) kafka_message = make_kafka_message(message_bytes) - recording_runtime.submit(kafka_message) - assert recording_runtime.model.offsets == {Partition(ArroyoTopic("a"), 1): 2} - assert mock_commit.commit == {} + # We submit a message which will not be buffered. Flush is not triggered. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 + assert isinstance(cmd, Effect) + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Trigger a flush by submitting a time that exceeds the window. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + # Send the flush message the runtime provided and assert that it produces a FlushBuffer + # effect. + cmd = gen.send(cmd.msg) + assert isinstance(cmd, Effect) + assert cmd.fun == FlushBuffer(runtime.model) + + # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid + # message is never revisited. + cmd = gen.send(cmd.msg(1)) + assert runtime.model.last_flushed_at == 1 + assert len(runtime.model.buffer) == 0 + assert len(runtime.model.offsets) == 1 # offsets are retained + + +def test_buffer_full_semantics(): + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will be buffered but not flushed. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + + # Assert the TryFlush msg produced by the runtime had no effect because the buffer was not full + # and the wait interval was not exceeded. + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # We submit another message which will be buffered and trigger a flush. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + + # Assert a flush command is triggered from the msg produced by the runtime. 
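Both of these tests hinge on the flush predicate defined earlier in this patch: flush when the buffer reaches `max_buffer_length` or when at least `max_buffer_wait` seconds have passed since the last flush. A small worked example using the flags from `_make_runtime` below (`max_buffer_length=2`, `max_buffer_wait=1`); the standalone `can_flush` here merely restates that predicate for illustration:

def can_flush(
    buffer_len: int,
    now: float,
    last_flushed_at: float,
    max_buffer_length: int = 2,
    max_buffer_wait: int = 1,
) -> bool:
    return buffer_len >= max_buffer_length or (now - max_buffer_wait) >= last_flushed_at


# One buffered message checked immediately: neither condition holds, so no flush.
assert can_flush(buffer_len=1, now=100.0, last_flushed_at=100.0) is False

# Two buffered messages: the length threshold triggers a flush (test_buffer_full_semantics).
assert can_flush(buffer_len=2, now=100.0, last_flushed_at=100.0) is True

# One message but over a second since the last flush: the timeout triggers it (test_buffer_timeout).
assert can_flush(buffer_len=1, now=101.5, last_flushed_at=100.0) is True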
+ cmd = gen.send(cmd.msg(cmd.fun())) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + +def test_buffer_timeout(): + runtime = _make_runtime() + + message: ReplayRecording = { + "key_id": None, + "org_id": 1, + "payload": b'{"segment_id":0}\n[]', + "project_id": 1, + "received": int(time.time()), + "replay_event": None, + "replay_id": uuid.uuid4().hex, + "replay_video": None, + "retention_days": 30, + "type": "replay_recording_not_chunked", + "version": 1, + } + message_bytes = RECORDINGS_CODEC.encode(message) + kafka_message = make_kafka_message(message_bytes) + + # We submit a message which will be buffered but not flushed. + gen = runtime.submit(kafka_message) + cmd = next(gen) + assert isinstance(cmd, Effect) + assert cmd.fun == time.time + assert cmd.msg(1) == TryFlush(now=1) + + # Assert the TryFlush msg produced by the runtime had no effect because the wait interval was + # not exceeded. + assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) + + # Now we emit a new TryFlush message with a timestamp in the future. This triggers the runtime + # to flush because its flush interval has been exceeded. + cmd = gen.send(TryFlush(now=time.time() + 1)) + assert isinstance(cmd, Task) + assert isinstance(cmd.msg, Flush) + + +def _make_runtime(): + runtime = MockRunTime(init, process, subscription, update) + runtime.setup( + { + "max_buffer_length": 2, + "max_buffer_wait": 1, + "max_workers": 1, + }, + MockCommit(), + ) + return runtime From fa4200412df0995ba90cb85428c12bf0b11c5851 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:34:41 -0600 Subject: [PATCH 43/49] Move offset management into the runtime --- .../replays/consumers/buffered/consumer.py | 42 +++++++------------ .../replays/consumers/buffered/platform.py | 24 ++++------- .../replays/unit/consumers/test_helpers.py | 11 +---- .../replays/unit/consumers/test_recording.py | 16 +------ 4 files changed, 27 insertions(+), 66 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index fe5448563e6155..ed5484c408b9cf 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -2,13 +2,11 @@ import contextlib import time -from collections.abc import MutableMapping from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait from dataclasses import dataclass from typing import TypedDict import sentry_sdk -from arroyo.types import Partition from sentry.replays.consumers.buffered.platform import ( Cmd, @@ -46,22 +44,13 @@ class Model: max_buffer_length: int max_buffer_wait: int max_workers: int - offsets: MutableMapping[Partition, int] @dataclass(frozen=True) class Append: - """Append the item to the buffer and update the offsets.""" + """Append the item to the buffer.""" item: ProcessedRecordingMessage - offset: MutableMapping[Partition, int] - - -@dataclass(frozen=True) -class AppendOffset: - """Update the offsets; no item needs to be appended to the buffer.""" - - offset: MutableMapping[Partition, int] class Committed: @@ -79,6 +68,10 @@ class Flushed: now: float +class Skip: + """Skip the message.""" + + @dataclass(frozen=True) class TryFlush: """Instruct the application to flush the buffer if its time.""" @@ -87,13 +80,13 @@ class TryFlush: # A "Msg" is the union of all application messages our RunTime will accept. 
-Msg = Append | AppendOffset | Committed | Flush | Flushed | TryFlush +Msg = Append | Committed | Flush | Flushed | TryFlush # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg]]: return ( Model( buffer=[], @@ -101,38 +94,35 @@ def init(flags: Flags) -> tuple[Model, Cmd[Msg] | None]: max_buffer_wait=flags["max_buffer_wait"], max_workers=flags["max_workers"], max_buffer_length=flags["max_buffer_length"], - offsets={}, ), - None, + Nothing(), ) @sentry_sdk.trace -def process(_: Model, message: bytes, offset: MutableMapping[Partition, int]) -> Msg | None: +def process(_: Model, message: bytes) -> Msg: try: item = process_recording_message(parse_recording_message(message)) - return Append(item=item, offset=offset) + return Append(item=item) except Exception: - return AppendOffset(offset=offset) + return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg] | None]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: match msg: - case Append(item=item, offset=offset): + case Append(item=item): model.buffer.append(item) - model.offsets.update(offset) return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) - case AppendOffset(offset=offset): - model.offsets.update(offset) + case Skip(): return (model, Effect(fun=time.time, msg=lambda now: TryFlush(now=now))) case Committed(): - return (model, None) + return (model, Nothing()) case Flush(): return (model, Effect(fun=FlushBuffer(model), msg=lambda now: Flushed(now=now))) case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, Commit(msg=Committed(), offsets=model.offsets)) + return (model, Commit(msg=Committed())) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 6e92252e5f684a..c6b8ddd35de51a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -71,15 +71,9 @@ def join(self, timeout: float | None = None) -> None: @dataclass(frozen=True) class Commit(Generic[Msg]): - """Instructs the RunTime to commit the message offsets. - - Because the RunTime is based on a Kafka platform there is an expectation that the application - should trigger commits when necessary. While the application can't be sure its running Kafka - it can be sure its running on a Platform that requires offsets be committed. 
- """ + """Instructs the RunTime to commit the message offsets.""" msg: Msg - offsets: MutableMapping[Partition, int] @dataclass(frozen=True) @@ -158,7 +152,7 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes, MutableMapping[Partition, int]], Msg | None], + process: Callable[[Model, bytes], Msg | None], subscription: Callable[[Model], list[Sub[Msg]]], update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], ) -> None: @@ -169,6 +163,7 @@ def __init__( self._commit: CommitProtocol | None = None self._model: Model | None = None + self._offsets: MutableMapping[Partition, int] = {} self._subscriptions: dict[str, Sub[Msg]] = {} @property @@ -192,13 +187,8 @@ def setup(self, flags: Flags, commit: CommitProtocol) -> None: self._register_subscriptions() def submit(self, message: Message[KafkaPayload]) -> None: - self._handle_msg( - self.process( - self.model, - message.payload.value, - cast(MutableMapping[Partition, int], message.committable), - ) - ) + self._handle_msg(self.process(self.model, message.payload.value)) + self._offsets = cast(MutableMapping[Partition, int], message.committable) def publish(self, sub_name: str): # For each new subscription event we re-register the subscribers in case anything within @@ -231,8 +221,8 @@ def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: return None match cmd: - case Commit(msg=msg, offsets=offsets): - self.commit(offsets) + case Commit(msg=msg): + self.commit(self._offsets) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): return self._handle_msg(msg(fun())) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 79663b114e78ce..c0201feebc81da 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,6 +1,5 @@ -from collections.abc import Mapping, MutableMapping +from collections.abc import Mapping from datetime import datetime -from typing import cast from arroyo.backends.kafka import KafkaPayload from arroyo.types import BrokerValue, Message, Partition, Topic @@ -24,13 +23,7 @@ def _handle_msg(self, msg): msg = yield cmd def submit(self, message): - yield from self._handle_msg( - self.process( - self.model, - message.payload.value, - cast(MutableMapping[Partition, int], message.committable), - ) - ) + yield from self._handle_msg(self.process(self.model, message.payload.value)) class MockSink: diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index b4ad74b8b2c062..54f3468b3280b0 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -2,8 +2,6 @@ import uuid import zlib -from arroyo.types import Partition -from arroyo.types import Topic as ArroyoTopic from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording from sentry.conf.types.kafka_definition import Topic, get_topic_codec @@ -102,8 +100,6 @@ def test_end_to_end_message_processing(): assert runtime.model.last_flushed_at == 1 assert isinstance(cmd, Commit) assert isinstance(cmd.msg, Committed) - assert cmd.offsets == runtime.model.offsets - assert cmd.offsets == {Partition(topic=ArroyoTopic(name="a"), index=1): 2} def test_invalid_message_format(): @@ -114,7 +110,6 @@ def test_invalid_message_format(): gen = runtime.submit(make_kafka_message(b"invalid")) cmd = 
next(gen) assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert cmd.fun == time.time assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -135,7 +130,6 @@ def test_invalid_message_format(): cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_invalid_recording_json(): @@ -162,7 +156,6 @@ def test_invalid_recording_json(): gen = runtime.submit(kafka_message) cmd = next(gen) assert len(runtime.model.buffer) == 1 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -177,12 +170,10 @@ def test_invalid_recording_json(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_missing_headers(): @@ -209,7 +200,6 @@ def test_missing_headers(): gen = runtime.submit(kafka_message) cmd = next(gen) assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 assert isinstance(cmd, Effect) assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) @@ -224,12 +214,10 @@ def test_missing_headers(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 - assert len(runtime.model.offsets) == 1 # offsets are retained def test_buffer_full_semantics(): From 2215007a153265b15ec3d7b0da2d6d02c196745f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:40:01 -0600 Subject: [PATCH 44/49] Fix typing --- src/sentry/replays/consumers/buffered/consumer.py | 2 +- src/sentry/replays/consumers/buffered/platform.py | 4 ++-- tests/sentry/replays/unit/consumers/test_recording.py | 10 +++++----- tests/sentry/replays/unit/consumers/test_runtime.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ed5484c408b9cf..5beb1ccba52310 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -80,7 +80,7 @@ class TryFlush: # A "Msg" is the union of all application messages our RunTime will accept. -Msg = Append | Committed | Flush | Flushed | TryFlush +Msg = Append | Committed | Flush | Flushed | Skip | TryFlush # State machine functions. diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index c6b8ddd35de51a..2f0c12990966da 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -104,7 +104,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. 
Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Effect[Msg] | Task[Msg] +Cmd = Commit[Msg] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -190,7 +190,7 @@ def submit(self, message: Message[KafkaPayload]) -> None: self._handle_msg(self.process(self.model, message.payload.value)) self._offsets = cast(MutableMapping[Partition, int], message.committable) - def publish(self, sub_name: str): + def publish(self, sub_name: str) -> None: # For each new subscription event we re-register the subscribers in case anything within # the application has changed. I.e. the model is in some new state and that means we care # about a new subscription or don't care about an old one. diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 54f3468b3280b0..d69ea1eb8e22aa 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -34,7 +34,7 @@ def test_end_to_end_message_processing(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -139,7 +139,7 @@ def test_invalid_recording_json(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[', + "payload": b'{"segment_id":0}\n[', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -183,7 +183,7 @@ def test_missing_headers(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b"[]", + "payload": b"[]", # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -226,7 +226,7 @@ def test_buffer_full_semantics(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, @@ -268,7 +268,7 @@ def test_buffer_timeout(): message: ReplayRecording = { "key_id": None, "org_id": 1, - "payload": b'{"segment_id":0}\n[]', + "payload": b'{"segment_id":0}\n[]', # type: ignore[typeddict-item] "project_id": 1, "received": int(time.time()), "replay_event": None, diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index c8ecfdbefabb64..f66e4b1fa70303 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -6,7 +6,7 @@ def counter_runtime() -> RunTime[int, str, None]: def init(_): return (22, None) - def process(_model, message, _offsets): + def process(_model, message): if message == b"incr": return "incr" elif message == b"decr": @@ -28,8 +28,8 @@ def update(model, msg): def subscription(_): return [ - Join(msg="join"), - Poll(msg="poll"), + Join(msg=lambda: "join"), + Poll(msg=lambda: "poll"), ] return RunTime( From 4cc15d78687ff05b34330ba715e3b4a9803bc720 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:51:54 -0600 Subject: [PATCH 45/49] Remove comments on offsets --- tests/sentry/replays/unit/consumers/test_recording.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 
d69ea1eb8e22aa..a66b3c252f5515 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -93,8 +93,7 @@ def test_end_to_end_message_processing(): assert cmd.fun == FlushBuffer(runtime.model) assert len(runtime.model.buffer) == 1 - # Assert the successful flush triggers a commit command. Assert the model's offsets were - # committed and the buffer was reset. + # Assert the successful flush triggers a commit command. cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 @@ -125,8 +124,7 @@ def test_invalid_message_format(): assert isinstance(cmd, Effect) assert cmd.fun == FlushBuffer(runtime.model) - # Send the `Flushed` message with an arbitrary timestamp. Offsets are retained so the invalid - # message is never revisited. + # Send the `Flushed` message with an arbitrary timestamp. cmd = gen.send(cmd.msg(1)) assert runtime.model.last_flushed_at == 1 assert len(runtime.model.buffer) == 0 From 4063f5d67d491f5390bd9567bf926e7a668dabef Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 08:58:51 -0600 Subject: [PATCH 46/49] Remove none-type messages --- .../replays/consumers/buffered/platform.py | 20 ++++++++----------- .../replays/unit/consumers/test_runtime.py | 14 +++++++------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 2f0c12990966da..300f6ce1a29002 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -151,10 +151,10 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg] | None]], - process: Callable[[Model, bytes], Msg | None], + init: Callable[[Flags], tuple[Model, Cmd[Msg]]], + process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg] | None]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg]]], ) -> None: self.init = init self.process = process @@ -210,16 +210,12 @@ def publish(self, sub_name: str) -> None: case Poll(msg=msg): return self._handle_msg(msg()) - def _handle_msg(self, msg: Msg | None) -> None: - if msg: - model, cmd = self.update(self.model, msg) - self._model = model - self._handle_cmd(cmd) - - def _handle_cmd(self, cmd: Cmd[Msg] | None) -> None: - if cmd is None: - return None + def _handle_msg(self, msg: Msg) -> None: + model, cmd = self.update(self.model, msg) + self._model = model + self._handle_cmd(cmd) + def _handle_cmd(self, cmd: Cmd[Msg]) -> None: match cmd: case Commit(msg=msg): self.commit(self._offsets) diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index f66e4b1fa70303..ed7933fc116ec9 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,4 +1,4 @@ -from sentry.replays.consumers.buffered.platform import Join, Poll, RunTime +from sentry.replays.consumers.buffered.platform import Join, Nothing, Poll, RunTime from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message @@ -12,17 +12,19 @@ def process(_model, message): elif message == b"decr": return "decr" else: - return None + return "nothing" def update(model, msg): if msg == "incr": - return (model + 1, None) + return (model + 1, Nothing()) elif msg == 
"decr": - return (model - 1, None) + return (model - 1, Nothing()) elif msg == "join": - return (-10, None) + return (-10, Nothing()) elif msg == "poll": - return (99, None) + return (99, Nothing()) + elif msg == "nothing": + return (model, Nothing()) else: raise ValueError("Unknown msg") From ea64a5083fbff0a4c210fd4b6eb5fe84659dbf7b Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 15:53:26 -0600 Subject: [PATCH 47/49] Move offsets out of runtime and into platform strategy. Add support for arbitrary next step --- .../replays/consumers/buffered/consumer.py | 9 +- .../replays/consumers/buffered/platform.py | 106 ++++++++++-------- .../replays/unit/consumers/test_helpers.py | 36 ++---- .../replays/unit/consumers/test_recording.py | 31 ++--- .../replays/unit/consumers/test_runtime.py | 24 ++-- 5 files changed, 99 insertions(+), 107 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index 5beb1ccba52310..ecb90fdd9eb4bb 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -10,10 +10,11 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, - Commit, Effect, Join, + NextStep, Nothing, + Out, Poll, RunTime, Sub, @@ -86,7 +87,7 @@ class TryFlush: # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg]]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg, Out]]: return ( Model( buffer=[], @@ -108,7 +109,7 @@ def process(_: Model, message: bytes) -> Msg: return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, Out]]: match msg: case Append(item=item): model.buffer.append(item) @@ -122,7 +123,7 @@ def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg]]: case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, Commit(msg=Committed())) + return (model, NextStep(msg=Committed(), value=None)) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 300f6ce1a29002..3d7939710e879a 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -1,57 +1,75 @@ from collections.abc import Callable, MutableMapping from dataclasses import dataclass from datetime import datetime -from typing import Any, Generic, TypeVar, cast +from typing import Any, Generic, TypeVar from arroyo.backends.kafka.consumer import KafkaPayload -from arroyo.processing.strategies import CommitOffsets, ProcessingStrategy -from arroyo.types import Commit as ArroyoCommit -from arroyo.types import Message, Partition, Value +from arroyo.processing.strategies import MessageRejected, ProcessingStrategy +from arroyo.types import FilteredPayload, Message, Partition, Value +Out = TypeVar("Out") +Input = FilteredPayload | KafkaPayload -class PlatformStrategy(ProcessingStrategy[KafkaPayload]): + +class PlatformStrategy(ProcessingStrategy[Input], Generic[Out]): def __init__( self, - commit: ArroyoCommit, flags: "Flags", - runtime: "RunTime[Model, Msg, Flags]", + runtime: "RunTime[Model, Msg, Flags, Out]", + next_step: "ProcessingStrategy[Out]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. 
- runtime.setup(flags, self.handle_commit_request) + runtime.setup(flags, self._handle_next_step) - self.__commit_step = CommitOffsets(commit) self.__closed = False + self.__next_step = next_step + self.__offsets: MutableMapping[Partition, int] = {} self.__runtime = runtime - def handle_commit_request(self, offsets: MutableMapping[Partition, int]) -> None: - self.__commit_step.submit(Message(Value(None, offsets, datetime.now()))) - - def submit(self, message: Message[KafkaPayload]) -> None: + # NOTE: Filtered payloads update the offsets but are not forwarded to the next step. My + # concern is that the filtered payload could have its offsets committed before the + # preceeding messages have their offsets committed. This is against what the Arroyo library + # does which is to forward everything. + def submit(self, message: Message[FilteredPayload | KafkaPayload]) -> None: assert not self.__closed - self.__runtime.submit(message) + + if isinstance(message.payload, KafkaPayload): + self.__runtime.submit(message.payload.value) + self.__offsets.update(message.committable) + else: + self.__offsets.update(message.committable) def poll(self) -> None: assert not self.__closed - self.__runtime.publish("poll") - self.__commit_step.poll() + + try: + self.__runtime.publish("poll") + except MessageRejected: + pass + + self.__next_step.poll() def close(self) -> None: self.__closed = True - self.__commit_step.close() def terminate(self) -> None: self.__closed = True - self.__commit_step.terminate() + self.__next_step.terminate() def join(self, timeout: float | None = None) -> None: - self.__runtime.publish("join") - self.__commit_step.close() - self.__commit_step.join(timeout) + try: + self.__runtime.publish("join") + except MessageRejected: + pass + + self.__next_step.close() + self.__next_step.join(timeout) + def _handle_next_step(self, value: Out) -> None: + self.__next_step.submit(Message(Value(value, self.__offsets, datetime.now()))) -CommitProtocol = Callable[[MutableMapping[Partition, int]], None] # A Model represents the state of your application. It is a type variable and the RunTime is # generic over it. Your state can be anything from a simple integer to a large class with many @@ -65,15 +83,13 @@ def join(self, timeout: float | None = None) -> None: # A generic type representing the structure of the flags passed to the RunTime instance. Flags = TypeVar("Flags") -# A generic type of unknown origins not tied to anything in the platform. -T = TypeVar("T") - @dataclass(frozen=True) -class Commit(Generic[Msg]): - """Instructs the RunTime to commit the message offsets.""" +class NextStep(Generic[Msg, Out]): + """Instructs the RunTime to produce to the next step.""" msg: Msg + value: Out @dataclass(frozen=True) @@ -104,7 +120,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = Commit[Msg] | Effect[Msg] | Nothing | Task[Msg] +Cmd = NextStep[Msg, Out] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -139,7 +155,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg, Flags]): +class RunTime(Generic[Model, Msg, Flags, Out]): """RunTime object. 
The RunTime is an intermediate data structure which manages communication between the platform @@ -151,44 +167,42 @@ class RunTime(Generic[Model, Msg, Flags]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg]]], + init: Callable[[Flags], tuple[Model, Cmd[Msg, Out]]], process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg]]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Out]]], ) -> None: self.init = init self.process = process self.subscription = subscription self.update = update - self._commit: CommitProtocol | None = None + self._next_step: Callable[[Out], None] | None = None self._model: Model | None = None - self._offsets: MutableMapping[Partition, int] = {} self._subscriptions: dict[str, Sub[Msg]] = {} - @property - def commit(self) -> CommitProtocol: - assert self._commit is not None - return self._commit - @property def model(self) -> Model: assert self._model is not None return self._model + @property + def next_step(self) -> Callable[[Out], None]: + assert self._next_step is not None + return self._next_step + # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need # the assert checks on model and commit. - def setup(self, flags: Flags, commit: CommitProtocol) -> None: - self._commit = commit + def setup(self, flags: Flags, next_step: Callable[[Out], None]) -> None: + self._next_step = next_step model, cmd = self.init(flags) self._model = model self._handle_cmd(cmd) self._register_subscriptions() - def submit(self, message: Message[KafkaPayload]) -> None: - self._handle_msg(self.process(self.model, message.payload.value)) - self._offsets = cast(MutableMapping[Partition, int], message.committable) + def submit(self, message: bytes) -> None: + self._handle_msg(self.process(self.model, message)) def publish(self, sub_name: str) -> None: # For each new subscription event we re-register the subscribers in case anything within @@ -215,10 +229,10 @@ def _handle_msg(self, msg: Msg) -> None: self._model = model self._handle_cmd(cmd) - def _handle_cmd(self, cmd: Cmd[Msg]) -> None: + def _handle_cmd(self, cmd: Cmd[Msg, Out]) -> None: match cmd: - case Commit(msg=msg): - self.commit(self._offsets) + case NextStep(msg=msg, value=value): + self.next_step(value) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): return self._handle_msg(msg(fun())) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index c0201feebc81da..9d7743f9ab50a2 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,21 +1,7 @@ -from collections.abc import Mapping -from datetime import datetime +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Out, RunTime -from arroyo.backends.kafka import KafkaPayload -from arroyo.types import BrokerValue, Message, Partition, Topic -from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, RunTime - - -class MockCommit: - def __init__(self): - self.commit = {} - - def __call__(self, offsets: Mapping[Partition, int], force: bool = False) -> None: - self.commit.update(offsets) - - -class MockRunTime(RunTime[Model, Msg, Flags]): +class MockRunTime(RunTime[Model, Msg, Flags, Out]): def _handle_msg(self, msg): while True: model, cmd = self.update(self.model, msg) @@ -23,7 +9,15 @@ def _handle_msg(self, msg): msg = yield cmd def submit(self, message): - 
yield from self._handle_msg(self.process(self.model, message.payload.value)) + yield from self._handle_msg(self.process(self.model, message)) + + +class MockNextStep: + def __init__(self): + self.values = [] + + def __call__(self, value): + self.values.append(value) class MockSink: @@ -32,11 +26,3 @@ def __init__(self): def accept(self, buffer): self.accepted.extend(buffer) - - -def make_kafka_message(message: bytes, topic: str = "a", index: int = 1, offset: int = 1): - return Message( - BrokerValue( - KafkaPayload(b"k", message, []), Partition(Topic(topic), index), offset, datetime.now() - ) - ) diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index a66b3c252f5515..32d5b847d6a3e8 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -15,14 +15,10 @@ subscription, update, ) -from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task +from sentry.replays.consumers.buffered.platform import Effect, NextStep, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta -from tests.sentry.replays.unit.consumers.test_helpers import ( - MockCommit, - MockRunTime, - make_kafka_message, -) +from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep, MockRunTime RECORDINGS_CODEC = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS) @@ -45,9 +41,8 @@ def test_end_to_end_message_processing(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) # Assert the runtime gets the current time after appending the message and then attempts to # flush the buffer with the current time. @@ -97,7 +92,7 @@ def test_end_to_end_message_processing(): cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 - assert isinstance(cmd, Commit) + assert isinstance(cmd, NextStep) assert isinstance(cmd.msg, Committed) @@ -106,7 +101,7 @@ def test_invalid_message_format(): runtime = _make_runtime() # We submit a message which can't be parsed and will not be buffered. Flush is not triggered. - gen = runtime.submit(make_kafka_message(b"invalid")) + gen = runtime.submit(b"invalid") cmd = next(gen) assert len(runtime.model.buffer) == 0 assert isinstance(cmd, Effect) @@ -148,10 +143,9 @@ def test_invalid_recording_json(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will not be buffered. Flush is not triggered. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert len(runtime.model.buffer) == 1 assert isinstance(cmd, Effect) @@ -192,10 +186,9 @@ def test_missing_headers(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will not be buffered. Flush is not triggered. 
- gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert len(runtime.model.buffer) == 0 assert isinstance(cmd, Effect) @@ -235,10 +228,9 @@ def test_buffer_full_semantics(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will be buffered but not flushed. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -248,7 +240,7 @@ def test_buffer_full_semantics(): assert isinstance(gen.send(cmd.msg(cmd.fun())), Nothing) # We submit another message which will be buffered and trigger a flush. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -277,10 +269,9 @@ def test_buffer_timeout(): "version": 1, } message_bytes = RECORDINGS_CODEC.encode(message) - kafka_message = make_kafka_message(message_bytes) # We submit a message which will be buffered but not flushed. - gen = runtime.submit(kafka_message) + gen = runtime.submit(message_bytes) cmd = next(gen) assert isinstance(cmd, Effect) assert cmd.fun == time.time @@ -305,6 +296,6 @@ def _make_runtime(): "max_buffer_wait": 1, "max_workers": 1, }, - MockCommit(), + MockNextStep(), ) return runtime diff --git a/tests/sentry/replays/unit/consumers/test_runtime.py b/tests/sentry/replays/unit/consumers/test_runtime.py index ed7933fc116ec9..151141c387ed96 100644 --- a/tests/sentry/replays/unit/consumers/test_runtime.py +++ b/tests/sentry/replays/unit/consumers/test_runtime.py @@ -1,8 +1,8 @@ from sentry.replays.consumers.buffered.platform import Join, Nothing, Poll, RunTime -from tests.sentry.replays.unit.consumers.test_helpers import MockCommit, make_kafka_message +from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep -def counter_runtime() -> RunTime[int, str, None]: +def counter_runtime() -> RunTime[int, str, None, None]: def init(_): return (22, None) @@ -44,37 +44,37 @@ def subscription(_): def test_runtime_setup(): runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 def test_runtime_submit(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 # Two incr commands increase the count by 2. - runtime.submit(make_kafka_message(b"incr")) - runtime.submit(make_kafka_message(b"incr")) + runtime.submit(b"incr") + runtime.submit(b"incr") assert runtime.model == 24 # Four decr commands decrease the count by 4. - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) - runtime.submit(make_kafka_message(b"decr")) + runtime.submit(b"decr") + runtime.submit(b"decr") + runtime.submit(b"decr") + runtime.submit(b"decr") assert runtime.model == 20 # Messages which the application does not understand do nothing to the model. - runtime.submit(make_kafka_message(b"other")) + runtime.submit(b"other") assert runtime.model == 20 def test_runtime_publish(): # RunTime defaults to a start point of 22. runtime = counter_runtime() - runtime.setup(None, commit=MockCommit()) + runtime.setup(None, next_step=MockNextStep()) assert runtime.model == 22 # A join event updates the model and sets it to -10. 
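At this point in the series the application surface is small enough to sketch end to end. Below is a minimal, hypothetical application built on the RunTime introduced by this patch: `RunTime`, `NextStep`, and `Nothing` are the types from the diff above, while the counting behaviour, the message strings, and the `counting_runtime` name are illustrative only (`NextStep` is renamed to `Commit` later in the series).

# A hypothetical application on top of the RunTime API as of PATCH 47/49. Only RunTime,
# NextStep, and Nothing are taken from the diff above; the counting logic is invented
# for illustration.
from sentry.replays.consumers.buffered.platform import NextStep, Nothing, RunTime


def init(_flags: None) -> tuple[int, Nothing]:
    # Start with a count of zero and issue no command.
    return (0, Nothing())


def process(_model: int, message: bytes) -> str:
    # Translate the raw Kafka payload into an application message.
    return "seen" if message else "empty"


def update(model: int, msg: str) -> tuple[int, NextStep[str, int] | Nothing]:
    if msg == "seen":
        # Forward the running total to whatever next step the platform was given
        # (for example CommitOffsets). Offsets are tracked by the PlatformStrategy
        # now, so the model no longer needs to carry them.
        return (model + 1, NextStep(msg="forwarded", value=model + 1))
    # "forwarded", "empty", and anything unrecognized leave the model unchanged.
    return (model, Nothing())


def subscription(_model: int) -> list:
    return []


counting_runtime: RunTime[int, str, None, int] = RunTime(
    init=init,
    process=process,
    subscription=subscription,
    update=update,
)
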
From 62d74337ab21a327c1dd59078bddd8caeec76c3c Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 16:07:28 -0600 Subject: [PATCH 48/49] Fix typing --- .../replays/consumers/buffered/consumer.py | 5 ++- .../replays/consumers/buffered/factory.py | 10 +++--- .../replays/consumers/buffered/platform.py | 32 +++++++++---------- .../replays/unit/consumers/test_helpers.py | 4 +-- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ecb90fdd9eb4bb..ed93363489b403 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -14,7 +14,6 @@ Join, NextStep, Nothing, - Out, Poll, RunTime, Sub, @@ -87,7 +86,7 @@ class TryFlush: # State machine functions. -def init(flags: Flags) -> tuple[Model, Cmd[Msg, Out]]: +def init(flags: Flags) -> tuple[Model, Cmd[Msg, None]]: return ( Model( buffer=[], @@ -109,7 +108,7 @@ def process(_: Model, message: bytes) -> Msg: return Skip() -def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, Out]]: +def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, None]]: match msg: case Append(item=item): model.buffer.append(item) diff --git a/src/sentry/replays/consumers/buffered/factory.py b/src/sentry/replays/consumers/buffered/factory.py index 56566d449d2053..5cc3ba59bbecea 100644 --- a/src/sentry/replays/consumers/buffered/factory.py +++ b/src/sentry/replays/consumers/buffered/factory.py @@ -8,8 +8,8 @@ from arroyo.backends.kafka.consumer import KafkaPayload from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory -from arroyo.types import Commit as ArroyoCommit -from arroyo.types import Partition +from arroyo.processing.strategies.commit import CommitOffsets +from arroyo.types import Commit, Partition from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer from sentry.replays.consumers.buffered.platform import PlatformStrategy @@ -26,7 +26,9 @@ def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: in def create_with_partitions( self, - commit: ArroyoCommit, + commit: Commit, partitions: Mapping[Partition, int], ) -> ProcessingStrategy[KafkaPayload]: - return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_consumer) + return PlatformStrategy( + next_step=CommitOffsets(commit), flags=self.flags, runtime=recording_consumer + ) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 3d7939710e879a..6639cade87da25 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -7,17 +7,17 @@ from arroyo.processing.strategies import MessageRejected, ProcessingStrategy from arroyo.types import FilteredPayload, Message, Partition, Value -Out = TypeVar("Out") -Input = FilteredPayload | KafkaPayload +Output = TypeVar("Output") +Input = KafkaPayload -class PlatformStrategy(ProcessingStrategy[Input], Generic[Out]): +class PlatformStrategy(ProcessingStrategy[FilteredPayload | Input], Generic[Output]): def __init__( self, flags: "Flags", - runtime: "RunTime[Model, Msg, Flags, Out]", - next_step: "ProcessingStrategy[Out]", + runtime: "RunTime[Model, Msg, Flags, Output]", + next_step: "ProcessingStrategy[Output]", ) -> None: # The RunTime is made aware of the commit strategy. It will # submit the partition, offset mapping it wants committed. 
@@ -67,7 +67,7 @@ def join(self, timeout: float | None = None) -> None: self.__next_step.close() self.__next_step.join(timeout) - def _handle_next_step(self, value: Out) -> None: + def _handle_next_step(self, value: Output) -> None: self.__next_step.submit(Message(Value(value, self.__offsets, datetime.now()))) @@ -85,11 +85,11 @@ def _handle_next_step(self, value: Out) -> None: @dataclass(frozen=True) -class NextStep(Generic[Msg, Out]): +class NextStep(Generic[Msg, Output]): """Instructs the RunTime to produce to the next step.""" msg: Msg - value: Out + value: Output @dataclass(frozen=True) @@ -120,7 +120,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = NextStep[Msg, Out] | Effect[Msg] | Nothing | Task[Msg] +Cmd = NextStep[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -155,7 +155,7 @@ class Poll(Generic[Msg]): Sub = Join[Msg] | Poll[Msg] -class RunTime(Generic[Model, Msg, Flags, Out]): +class RunTime(Generic[Model, Msg, Flags, Output]): """RunTime object. The RunTime is an intermediate data structure which manages communication between the platform @@ -167,17 +167,17 @@ class RunTime(Generic[Model, Msg, Flags, Out]): def __init__( self, - init: Callable[[Flags], tuple[Model, Cmd[Msg, Out]]], + init: Callable[[Flags], tuple[Model, Cmd[Msg, Output]]], process: Callable[[Model, bytes], Msg], subscription: Callable[[Model], list[Sub[Msg]]], - update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Out]]], + update: Callable[[Model, Msg], tuple[Model, Cmd[Msg, Output]]], ) -> None: self.init = init self.process = process self.subscription = subscription self.update = update - self._next_step: Callable[[Out], None] | None = None + self._next_step: Callable[[Output], None] | None = None self._model: Model | None = None self._subscriptions: dict[str, Sub[Msg]] = {} @@ -187,13 +187,13 @@ def model(self) -> Model: return self._model @property - def next_step(self) -> Callable[[Out], None]: + def next_step(self) -> Callable[[Output], None]: assert self._next_step is not None return self._next_step # NOTE: Could this be a factory function that produces RunTimes instead? That way we don't need # the assert checks on model and commit. 
- def setup(self, flags: Flags, next_step: Callable[[Out], None]) -> None: + def setup(self, flags: Flags, next_step: Callable[[Output], None]) -> None: self._next_step = next_step model, cmd = self.init(flags) @@ -229,7 +229,7 @@ def _handle_msg(self, msg: Msg) -> None: self._model = model self._handle_cmd(cmd) - def _handle_cmd(self, cmd: Cmd[Msg, Out]) -> None: + def _handle_cmd(self, cmd: Cmd[Msg, Output]) -> None: match cmd: case NextStep(msg=msg, value=value): self.next_step(value) diff --git a/tests/sentry/replays/unit/consumers/test_helpers.py b/tests/sentry/replays/unit/consumers/test_helpers.py index 9d7743f9ab50a2..4ec8fc240f064f 100644 --- a/tests/sentry/replays/unit/consumers/test_helpers.py +++ b/tests/sentry/replays/unit/consumers/test_helpers.py @@ -1,7 +1,7 @@ -from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Out, RunTime +from sentry.replays.consumers.buffered.platform import Flags, Model, Msg, Output, RunTime -class MockRunTime(RunTime[Model, Msg, Flags, Out]): +class MockRunTime(RunTime[Model, Msg, Flags, Output]): def _handle_msg(self, msg): while True: model, cmd = self.update(self.model, msg) From 895185b9913cde30ab449403003843d2fe5269cd Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Mon, 3 Mar 2025 16:26:36 -0600 Subject: [PATCH 49/49] Docstrings and renames --- src/sentry/replays/consumers/buffered/consumer.py | 11 ++++++++--- src/sentry/replays/consumers/buffered/platform.py | 10 +++------- tests/sentry/replays/unit/consumers/test_recording.py | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sentry/replays/consumers/buffered/consumer.py b/src/sentry/replays/consumers/buffered/consumer.py index ed93363489b403..5ad55b55631ccc 100644 --- a/src/sentry/replays/consumers/buffered/consumer.py +++ b/src/sentry/replays/consumers/buffered/consumer.py @@ -1,4 +1,9 @@ -"""Session Replay recording consumer implementation.""" +"""Session Replay recording consumer implementation. + +The consumer implementation follows a batching flush strategy. We accept messages, process them, +buffer them, and when some threshold is reached we flush the buffer. The batch has finished work +after the buffer is flushed so we commit with a None value. +""" import contextlib import time @@ -10,9 +15,9 @@ from sentry.replays.consumers.buffered.platform import ( Cmd, + Commit, Effect, Join, - NextStep, Nothing, Poll, RunTime, @@ -122,7 +127,7 @@ def update(model: Model, msg: Msg) -> tuple[Model, Cmd[Msg, None]]: case Flushed(now=now): model.buffer = [] model.last_flushed_at = now - return (model, NextStep(msg=Committed(), value=None)) + return (model, Commit(msg=Committed(), value=None)) case TryFlush(now=now): return (model, Task(msg=Flush())) if can_flush(model, now) else (model, Nothing()) diff --git a/src/sentry/replays/consumers/buffered/platform.py b/src/sentry/replays/consumers/buffered/platform.py index 6639cade87da25..bf07962b3532d9 100644 --- a/src/sentry/replays/consumers/buffered/platform.py +++ b/src/sentry/replays/consumers/buffered/platform.py @@ -28,10 +28,6 @@ def __init__( self.__offsets: MutableMapping[Partition, int] = {} self.__runtime = runtime - # NOTE: Filtered payloads update the offsets but are not forwarded to the next step. My - # concern is that the filtered payload could have its offsets committed before the - # preceeding messages have their offsets committed. This is against what the Arroyo library - # does which is to forward everything. 
def submit(self, message: Message[FilteredPayload | KafkaPayload]) -> None: assert not self.__closed @@ -85,7 +81,7 @@ def _handle_next_step(self, value: Output) -> None: @dataclass(frozen=True) -class NextStep(Generic[Msg, Output]): +class Commit(Generic[Msg, Output]): """Instructs the RunTime to produce to the next step.""" msg: Msg @@ -120,7 +116,7 @@ class Task(Generic[Msg]): # A "Cmd" is the union of all the commands an application can issue to the RunTime. The RunTime # accepts these commands and handles them in some pre-defined way. Commands are fixed and can not # be registered on a per application basis. -Cmd = NextStep[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] +Cmd = Commit[Msg, Output] | Effect[Msg] | Nothing | Task[Msg] @dataclass(frozen=True) @@ -231,7 +227,7 @@ def _handle_msg(self, msg: Msg) -> None: def _handle_cmd(self, cmd: Cmd[Msg, Output]) -> None: match cmd: - case NextStep(msg=msg, value=value): + case Commit(msg=msg, value=value): self.next_step(value) return self._handle_msg(msg) case Effect(msg=msg, fun=fun): diff --git a/tests/sentry/replays/unit/consumers/test_recording.py b/tests/sentry/replays/unit/consumers/test_recording.py index 32d5b847d6a3e8..c58f3627044e41 100644 --- a/tests/sentry/replays/unit/consumers/test_recording.py +++ b/tests/sentry/replays/unit/consumers/test_recording.py @@ -15,7 +15,7 @@ subscription, update, ) -from sentry.replays.consumers.buffered.platform import Effect, NextStep, Nothing, Task +from sentry.replays.consumers.buffered.platform import Commit, Effect, Nothing, Task from sentry.replays.usecases.ingest import ProcessedRecordingMessage from sentry.replays.usecases.ingest.event_parser import ParsedEventMeta from tests.sentry.replays.unit.consumers.test_helpers import MockNextStep, MockRunTime @@ -92,7 +92,7 @@ def test_end_to_end_message_processing(): cmd = gen.send(cmd.msg(1)) assert len(runtime.model.buffer) == 0 assert runtime.model.last_flushed_at == 1 - assert isinstance(cmd, NextStep) + assert isinstance(cmd, Commit) assert isinstance(cmd.msg, Committed)
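
Assembled, the series wires together as a small strategy factory. The sketch below follows the `factory.py` shape visible in PATCH 48/49 above: `PlatformStrategy`, `recording_consumer`, `Flags`, and `CommitOffsets` come from the diffs, while the factory class name and the exact flag plumbing are assumptions, since only part of `factory.py` appears in the series.

# A sketch of the final wiring, under the assumptions stated above. Committing offsets
# is just another "next step": the RunTime emits a Commit command, the PlatformStrategy
# attaches the offsets it has been tracking, and CommitOffsets performs the commit.
from collections.abc import Mapping

from arroyo.backends.kafka.consumer import KafkaPayload
from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory
from arroyo.processing.strategies.commit import CommitOffsets
from arroyo.types import Commit, Partition

from sentry.replays.consumers.buffered.consumer import Flags, recording_consumer
from sentry.replays.consumers.buffered.platform import PlatformStrategy


class ReplayRecordingStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
    # The class name here is illustrative; the constructor signature mirrors the hunk
    # context shown in the factory.py diff.
    def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: int) -> None:
        self.flags: Flags = {
            "max_buffer_length": max_buffer_length,
            "max_buffer_wait": max_buffer_wait,
            "max_workers": max_workers,
        }

    def create_with_partitions(
        self,
        commit: Commit,
        partitions: Mapping[Partition, int],
    ) -> ProcessingStrategy[KafkaPayload]:
        return PlatformStrategy(
            next_step=CommitOffsets(commit),
            flags=self.flags,
            runtime=recording_consumer,
        )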