Iceberg backlog controller #24990

Merged
19 changes: 19 additions & 0 deletions src/v/config/configuration.cc
@@ -3792,6 +3792,25 @@ configuration::configuration()
{.visibility = visibility::user},
std::nullopt,
&validate_non_empty_string_opt)
, iceberg_backlog_controller_p_coeff(
*this,
"iceberg_backlog_controller_p_coeff",
"Proportional coefficient for the Iceberg backlog controller. Number of "
"shares assigned to the datalake scheduling group will be proportional "
"to the backlog size error. More negative value means larger and faster "
"changes in the number of shares in the datalake scheduling group.",
{.needs_restart = needs_restart::no, .visibility = visibility::tunable},
-0.0003)
, iceberg_target_backlog_size(
*this,
"iceberg_target_backlog_size",
"Average size of the datalake translation backlog, per partition, that "
"the backlog controller will try to maintain. When a backlog size is "
"larger than the setpoint a backlog controller will increase the "
"translation scheduling group priority.",
{.needs_restart = needs_restart::no, .visibility = visibility::tunable},
5_MiB,
Contributor:
I was initially considering whether the default size should be slightly larger than 5 MiB. However, a smaller default has the advantage of smaller proportional changes to scheduling weights. This allows the system to correct itself more quickly than with a larger default (e.g., 1 GiB), where weight changes would be drastic and throttling more significant (a bigger sawtooth). Therefore, we likely prefer smaller values for faster system adjustment. Am I thinking about this correctly?

Member Author:
The reason I used a small value here is a bit different. I think the absolute change of the backlog size will always be the same regardless of whether the setpoint is 5_MiB or 1_GiB. I set the value small to promote reading data from the batch_cache; in my experiments, the larger the value, the higher the rate of cache misses.

Contributor:
Hmm yeah, the batch cache hit reasoning makes sense.
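To make the numbers in this thread concrete, here is a minimal standalone sketch (not part of the PR) of the share computation, using the defaults added in configuration.cc: p_coeff = -0.0003, setpoint = 5 MiB, shares clamped to [1, 1000].

```cpp
#include <algorithm>
#include <cstdio>
#include <initializer_list>

int main() {
    const double p_coeff = -0.0003;            // iceberg_backlog_controller_p_coeff
    const double setpoint = 5.0 * 1024 * 1024; // iceberg_target_backlog_size, in bytes
    for (double backlog_mib : {5.0, 8.0, 100.0, 1024.0}) {
        const double backlog = backlog_mib * 1024 * 1024;
        const double err = setpoint - backlog; // negative once translation falls behind
        const int shares = std::clamp(static_cast<int>(p_coeff * err), 1, 1000);
        std::printf("backlog %7.1f MiB -> %4d shares\n", backlog_mib, shares);
    }
}
```

With these defaults the error is zero at 5 MiB (shares clamp up to the minimum of 1), ~8 MiB of backlog already yields ~943 shares, and anything above roughly 8.2 MiB saturates at the 1000-share cap. So the setpoint mostly decides how much untranslated data stays hot in the batch cache rather than the steepness of the response curve.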

{.min = 0, .max = std::numeric_limits<uint32_t>::max()})
, iceberg_delete(
*this,
"iceberg_delete",
2 changes: 2 additions & 0 deletions src/v/config/configuration.h
@@ -718,6 +718,8 @@ struct configuration final : public config_store {
property<std::optional<ss::sstring>> iceberg_rest_catalog_trust_file;
property<std::optional<ss::sstring>> iceberg_rest_catalog_crl_file;
property<std::optional<ss::sstring>> iceberg_rest_catalog_prefix;
property<double> iceberg_backlog_controller_p_coeff;
bounded_property<uint32_t> iceberg_target_backlog_size;

property<bool> iceberg_delete;
property<ss::sstring> iceberg_default_partition_spec;
21 changes: 21 additions & 0 deletions src/v/datalake/BUILD
@@ -175,6 +175,7 @@ redpanda_cc_library(
include_prefix = "datalake",
visibility = ["//visibility:public"],
deps = [
":backlog_controller",
":catalog_schema_manager",
":cloud_data_io",
":location",
@@ -602,3 +603,23 @@ redpanda_cc_library(
"//src/v/model",
],
)

redpanda_cc_library(
name = "backlog_controller",
srcs = [
"backlog_controller.cc",
],
hdrs = [
"backlog_controller.h",
],
include_prefix = "datalake",
visibility = [":__subpackages__"],
deps = [
":logger",
"//src/v/base",
"//src/v/config",
"//src/v/metrics",
"//src/v/model",
"@seastar",
],
)
1 change: 1 addition & 0 deletions src/v/datalake/CMakeLists.txt
@@ -15,6 +15,7 @@ v_cc_library(
NAME datalake_manager
SRCS
datalake_manager.cc
backlog_controller.cc
DEPS
v::datalake_translation
v::datalake_coordinator
91 changes: 91 additions & 0 deletions src/v/datalake/backlog_controller.cc
@@ -0,0 +1,91 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Licensed as a Redpanda Enterprise file under the Redpanda Community
* License (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
*/
#include "datalake/backlog_controller.h"

#include "base/vlog.h"
#include "config/configuration.h"
#include "datalake/logger.h"
#include "metrics/prometheus_sanitize.h"

#include <seastar/core/coroutine.hh>
#include <seastar/core/metrics.hh>
#include <seastar/core/reactor.hh>
using namespace std::chrono_literals; // NOLINT

namespace datalake {

backlog_controller::backlog_controller(
sampling_fn sampling_fn, ss::scheduling_group sg)
: _sampling_f(std::move(sampling_fn))
, _scheduling_group(sg)
, _proportional_coeff(
config::shard_local_cfg().iceberg_backlog_controller_p_coeff.bind())
, _setpoint(config::shard_local_cfg().iceberg_target_backlog_size.bind())
, _sampling_interval(5s) {}

ss::future<> backlog_controller::start() {
setup_metrics();
_sampling_timer.set_callback([this] {
update();
if (!_as.abort_requested()) {
_sampling_timer.arm(_sampling_interval);
}
});

_sampling_timer.arm(_sampling_interval);
co_return;
}

ss::future<> backlog_controller::stop() {
_sampling_timer.cancel();
_as.request_abort();
co_return;
}

void backlog_controller::update() {
using namespace std::chrono_literals;

_current_sample = _sampling_f();

auto current_err = _setpoint() - _current_sample;
auto update = _proportional_coeff() * current_err;

update = std::clamp(static_cast<int>(update), _min_shares, _max_shares);

vlog(
datalake_log.trace,
"state update: {{setpoint: {}, current_backlog: {:2f}, current_error: "
"{:2f}, shares_update: {:2f}}}",
_setpoint(),
_current_sample,
current_err,
update);

_scheduling_group.set_shares(static_cast<float>(update));
Contributor:
Is there any use to just adjust the weights of translation and produce, something like

translation_shares + delta
produce_shares - delta

In the current form I think it shuffles the entire share distribution, which effectively means (for example) that fetch gets fewer shares if there is a translation lag. Is that the intention?

Member Author:
This is a good question. What we change is the ratio between the datalake and other scheduling groups. This introduces a penalty for other tasks. Let me chat with the perf team to see what they think about that.
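For comparison, here is a hedged sketch of the zero-sum variant suggested above: shifting a delta between the datalake and produce groups instead of setting an absolute share count on the datalake group alone. The group handles, starting share values, and clamp bounds are assumptions, not code from this PR.

```cpp
#include "base/seastarx.h"

#include <seastar/core/scheduling.hh>

#include <algorithm>

// Hypothetical alternative: move `delta` shares from the produce group to the
// datalake group (or back), keeping the total roughly constant. The current
// share values are tracked locally for simplicity.
struct zero_sum_share_adjuster {
    ss::scheduling_group datalake_sg;
    ss::scheduling_group produce_sg;
    float datalake_shares{100};
    float produce_shares{1000};

    void apply(float delta) {
        datalake_shares = std::clamp(datalake_shares + delta, 1.0f, 1000.0f);
        produce_shares = std::clamp(produce_shares - delta, 1.0f, 1000.0f);
        datalake_sg.set_shares(datalake_shares);
        produce_sg.set_shares(produce_shares);
    }
};
```

The PR instead sets an absolute share count on the datalake group only, which changes its ratio to every other group (fetch included); that is exactly the trade-off being raised in this thread.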

}

void backlog_controller::setup_metrics() {
if (config::shard_local_cfg().disable_metrics()) {
return;
}
namespace sm = ss::metrics;
_metrics.add_group(
prometheus_sanitize::metrics_name("iceberg:backlog:controller"),
{
sm::make_gauge(
"backlog_size",
[this] { return _current_sample; },
sm::description("Iceberg controller current backlog - averaged size "
"of the backlog per partition")),

});
}

} // namespace datalake
68 changes: 68 additions & 0 deletions src/v/datalake/backlog_controller.h
@@ -0,0 +1,68 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Licensed as a Redpanda Enterprise file under the Redpanda Community
* License (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
*/

#pragma once
#include "base/seastarx.h"
#include "config/property.h"
#include "metrics/metrics.h"

#include <seastar/core/gate.hh>
#include <seastar/core/io_priority_class.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/timer.hh>
#include <seastar/util/log.hh>

namespace datalake {
/**
* Class responsible for adjusting shares to maintain a target translation
* backlog size. This simple proportional controller uses a sampling function to
* collect the size of the backlog and adjust the shares of the datalake
* scheduling group to keep the average backlog size close to the setpoint.
*/
class backlog_controller {
public:
using sampling_fn = ss::noncopyable_function<long double()>;
backlog_controller(sampling_fn, ss::scheduling_group sg);

ss::future<> start();
ss::future<> stop();

private:
void update();
void setup_metrics();

sampling_fn _sampling_f;
ss::scheduling_group _scheduling_group;
/**
* Proportionality coefficient for the controller. This value controls how
* eagerly the controller adjusts the shares of the datalake scheduling
* group.
*/
config::binding<double> _proportional_coeff;
/**
* The target backlog size for the controller to keep. The controller will
* adjust the scheduling group shares to keep the average backlog size close to
* this value.
*/
config::binding<uint32_t> _setpoint;
std::chrono::steady_clock::duration _sampling_interval;
ss::timer<> _sampling_timer;
// Current value of the backlog (currently it is average size of data to
// translate for datalake enabled partitions)
long double _current_sample{0};
/**
* Limits for controlled scheduling group shares.
*/
int _min_shares{1};
Contributor:
Wondering if the min should be at least the default of 100.

Member Author:
I don't think it's necessary. If the translator can keep up with the work at the lowest possible priority we can use it; otherwise the priority will be increased. From my experiments, when producer throughput fluctuates, even 1 share is enough to promptly translate the remaining backlog while the ingest rate is lower.

int _max_shares{1000};
ss::abort_source _as;
metrics::internal_metric_groups _metrics;
};
} // namespace datalake
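For readers skimming only this header, a minimal usage sketch follows. It assumes the caller runs on a shard with a dedicated datalake scheduling group; the sampling lambda is a placeholder, and the real wiring is shown later in datalake_manager.cc.

```cpp
#include "datalake/backlog_controller.h"

#include <seastar/core/coroutine.hh>
#include <seastar/core/scheduling.hh>

ss::future<> run_backlog_controller(ss::scheduling_group datalake_sg) {
    // Placeholder sampling function: report a constant 10 MiB average backlog.
    datalake::backlog_controller controller(
      [] { return 10.0L * 1024 * 1024; }, datalake_sg);
    co_await controller.start(); // arms a 5s timer that adjusts the group's shares
    // ... translation work happens while the controller runs ...
    co_await controller.stop();
}
```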
25 changes: 25 additions & 0 deletions src/v/datalake/datalake_manager.cc
@@ -14,6 +14,7 @@
#include "cluster/topic_table.h"
#include "cluster/types.h"
#include "config/configuration.h"
#include "datalake/backlog_controller.h"
#include "datalake/catalog_schema_manager.h"
#include "datalake/cloud_data_io.h"
#include "datalake/coordinator/catalog_factory.h"
@@ -120,6 +121,26 @@ datalake_manager::datalake_manager(
}
datalake_manager::~datalake_manager() = default;

double datalake_manager::average_translation_backlog() {
size_t total_lag = 0;
size_t translators_with_backlog = 0;
for (const auto& [_, translator] : _translators) {
auto backlog_size = translator->translation_backlog();
// skip over translators that are not yet ready to report anything
if (!backlog_size) {
continue;
}
total_lag += backlog_size.value();
translators_with_backlog++;
}

if (translators_with_backlog == 0) {
return 0;
}

return static_cast<double>(total_lag) / translators_with_backlog;
}

ss::future<> datalake_manager::start() {
_catalog = co_await _catalog_factory->create_catalog();
_schema_mgr = std::make_unique<catalog_schema_manager>(*_catalog);
@@ -201,10 +222,14 @@ ss::future<> datalake_manager::start() {
}
});
_schema_cache->start();
_backlog_controller = std::make_unique<backlog_controller>(
[this] { return average_translation_backlog(); }, _sg);
co_await _backlog_controller->start();
}

ss::future<> datalake_manager::stop() {
auto f = _gate.close();
co_await _backlog_controller->stop();
_deregistrations.clear();
co_await ss::max_concurrent_for_each(
_translators, 32, [](auto& entry) mutable {
4 changes: 3 additions & 1 deletion src/v/datalake/datalake_manager.h
@@ -15,6 +15,7 @@
#include "cluster/fwd.h"
#include "config/property.h"
#include "container/chunked_hash_map.h"
#include "datalake/backlog_controller.h"
#include "datalake/fwd.h"
#include "datalake/location.h"
#include "datalake/record_schema_resolver.h"
@@ -81,7 +82,7 @@ class datalake_manager : public ss::peering_sharded_service<datalake_manager> {
model::iceberg_mode,
model::iceberg_invalid_record_action);
void stop_translator(const model::ntp&);

double average_translation_backlog();
model::node_id _self;
ss::sharded<raft::group_manager>* _group_mgr;
ss::sharded<cluster::partition_manager>* _partition_mgr;
@@ -99,6 +100,7 @@ class datalake_manager : public ss::peering_sharded_service<datalake_manager> {
std::unique_ptr<datalake::schema_manager> _schema_mgr;
std::unique_ptr<datalake::type_resolver> _type_resolver;
std::unique_ptr<datalake::schema_cache> _schema_cache;
std::unique_ptr<backlog_controller> _backlog_controller;
ss::sharded<ss::abort_source>* _as;
ss::scheduling_group _sg;
ss::gate _gate;
29 changes: 29 additions & 0 deletions src/v/datalake/translation/partition_translator.cc
@@ -340,6 +340,32 @@ partition_translator::do_translation_for_range(
co_return std::move(result.value());
}

std::optional<size_t> partition_translator::translation_backlog() const {
Contributor:
I'm curious whether there's actually a correlation between translation time and bytes on disk, particularly in the face of compressed batches or complex schemas. I guess the thinking is that larger messages likely imply more complex payloads, but just thinking out loud, I wonder if something as simple as the number of records pending would be usable here.

Not really advocating for any changes, but I thought I'd pose the question, and also point out that this won't work for read replicas or restored/FPM-ed partitions without implementing a similar sizing method for cloud.

Member Author:
I strongly agree with your point, i.e. the actual amount of work definitely depends on schema complexity, compression, message size, etc. I think this doesn't really change the relation that backlog = some_coefficient * bytes_to_translate; in other words, the CPU cycles required for translation will always be proportional to the number of bytes to translate. I think the parameters you mentioned will influence the proportionality coefficient.
In the future we may modify this function to factor in the other complexities.

Contributor:
> cpu cycles required for translation will always be proportional to number of bytes to translate

This is what I'm curious about specifically: if two records have the same schema but one has 1 byte per field while the other has 100 bytes per field, does the latter end up taking meaningfully more CPU for Parquet writing? I guess size affects things like comparisons, and maybe flush frequency. Just curious whether we know if translation is dominated by those factors vs. schema parsing.

if (!_last_translated_log_offset.has_value()) {
return std::nullopt;
}

auto size_after_translated = _partition->log()->size_bytes_after_offset(
_last_translated_log_offset.value());

auto size_after_max_translatable
= _partition->log()->size_bytes_after_offset(
_partition->last_stable_offset());

if (size_after_translated < size_after_max_translatable) {
vlog(
_logger.error,
"Expected size after translated offset({}) {} to be greater than or "
"equal to the size of log after max translatable offset({}): {}",
_last_translated_log_offset,
size_after_translated,
_partition->last_stable_offset(),
size_after_max_translatable);
return std::nullopt;
}

return size_after_translated - size_after_max_translatable;
}

ss::future<partition_translator::translation_success>
partition_translator::do_translate_once(retry_chain_node& parent_rcn) {
if (
@@ -399,6 +425,9 @@ partition_translator::do_translate_once(retry_chain_node& parent_rcn) {
read_begin_offset,
std::move(translation_result.value()))) {
max_translated_offset = last_translated_offset;
_last_translated_log_offset
= _partition->get_offset_translator_state()->to_log_offset(
kafka::offset_cast(max_translated_offset));
result = translation_success::yes;
}
}
9 changes: 9 additions & 0 deletions src/v/datalake/translation/partition_translator.h
@@ -91,6 +91,11 @@ class partition_translator {

std::chrono::milliseconds translation_interval() const;
void reset_translation_interval(std::chrono::milliseconds new_base);
/**
* Returns the number of bytes that are ready to be translated.
* If no translation has happened yet, this returns std::nullopt.
*/
std::optional<size_t> translation_backlog() const;

model::iceberg_invalid_record_action invalid_record_action() const;
void reset_invalid_record_action(
@@ -140,6 +145,10 @@ class partition_translator {
std::unique_ptr<record_translator> _record_translator;
std::unique_ptr<table_creator> _table_creator;
std::unique_ptr<kafka::partition_proxy> _partition_proxy;
/**
* The last translated log offset, tracked for calculating the translation
* backlog.
*/
std::optional<model::offset> _last_translated_log_offset;
using jitter_t
= simple_time_jitter<ss::lowres_clock, std::chrono::milliseconds>;
jitter_t _jitter;