diff --git a/test/regression/CMakeLists.txt b/test/regression/CMakeLists.txt index 3fa7e3c28865..0f193580ac48 100644 --- a/test/regression/CMakeLists.txt +++ b/test/regression/CMakeLists.txt @@ -47,6 +47,7 @@ if (TILEDB_CPP_API) list(APPEND SOURCES targets/sc-25116.cc) list(APPEND SOURCES targets/sc-29682.cc) list(APPEND SOURCES targets/sc-33480.cc) + list(APPEND SOURCES targets/sc-35424.cc) endif() add_executable(tiledb_regression diff --git a/test/regression/targets/sc-35424.cc b/test/regression/targets/sc-35424.cc new file mode 100644 index 000000000000..66b144363f18 --- /dev/null +++ b/test/regression/targets/sc-35424.cc @@ -0,0 +1,165 @@ +#include +#include +#include + +#include +#include + +#include + +using namespace tiledb; + +static void create_array(const std::string& array_uri); +static void write_first_fragment(const std::string& array_uri); +static uint64_t time_travel_destination(); +static void add_attr_b(const std::string& array_uri); +static void write_second_fragment(const std::string& array_uri); + +static void read_without_time_travel(const std::string& array_uri); +static void read_with_time_travel(const std::string& array_uri, uint64_t when); + +TEST_CASE( + "Use the correct schema when time traveling", + "[time-traveling][array-schema][bug][sc35424]") { + std::string array_uri = "test_time_traveling_schema"; + + // Test setup + create_array(array_uri); + write_first_fragment(array_uri); + auto timepoint = time_travel_destination(); + add_attr_b(array_uri); + write_second_fragment(array_uri); + + // Check reads with and without time travel. + read_without_time_travel(array_uri); + read_with_time_travel(array_uri, timepoint); +} + +void create_array(const std::string& array_uri) { + Context ctx; + + auto obj = Object::object(ctx, array_uri); + if (obj.type() != Object::Type::Invalid) { + Object::remove(ctx, array_uri); + } + + auto dim = Dimension::create(ctx, "d", {{0, 1024}}); + + Domain dom(ctx); + dom.add_dimension(dim); + + auto attr = Attribute::create(ctx, "a"); + + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_order({{TILEDB_ROW_MAJOR, TILEDB_ROW_MAJOR}}) + .set_domain(dom) + .add_attribute(attr); + + Array::create(array_uri, schema); +} + +void write_first_fragment(const std::string& array_uri) { + std::vector d_data = {0, 1, 2, 3, 4}; + std::vector a_data = {5, 6, 7, 8, 9}; + + Context ctx; + Array array(ctx, array_uri, TILEDB_WRITE); + Query query(ctx, array, TILEDB_WRITE); + query.set_layout(TILEDB_UNORDERED) + .set_data_buffer("d", d_data) + .set_data_buffer("a", a_data); + REQUIRE(query.submit() == Query::Status::COMPLETE); + array.close(); +} + +uint64_t time_travel_destination() { + // We sleep for 5ms to ensure that our fragments are separated in time + // and allowing us to grab a time guaranteed to be between them. + auto delay = std::chrono::milliseconds(5); + std::this_thread::sleep_for(delay); + + auto timepoint = tiledb_timestamp_now_ms(); + + std::this_thread::sleep_for(delay); + + return timepoint; +} + +void add_attr_b(const std::string& array_uri) { + Context ctx; + auto attr = Attribute::create(ctx, "b"); + + ArraySchemaEvolution ase(ctx); + ase.add_attribute(attr); + ase.array_evolve(array_uri); +} + +void write_second_fragment(const std::string& array_uri) { + std::vector d_data = {5, 6, 7, 8, 9}; + std::vector a_data = {10, 11, 12, 13, 14}; + std::vector b_data = {15, 16, 17, 18, 19}; + + Context ctx; + Array array(ctx, array_uri, TILEDB_WRITE); + Query query(ctx, array, TILEDB_WRITE); + query.set_layout(TILEDB_UNORDERED) + .set_data_buffer("d", d_data) + .set_data_buffer("a", a_data) + .set_data_buffer("b", b_data); + REQUIRE(query.submit() == Query::Status::COMPLETE); + array.close(); +} + +void read_without_time_travel(const std::string& array_uri) { + std::vector d_data(10); + std::vector a_data(10); + std::vector b_data(10); + + Context ctx; + Array array(ctx, array_uri, TILEDB_READ); + Query query(ctx, array, TILEDB_READ); + query.set_data_buffer("d", d_data) + .set_data_buffer("a", a_data) + .set_data_buffer("b", b_data); + + REQUIRE(query.submit() == Query::Status::COMPLETE); + + for (int32_t i = 0; i < 10; i++) { + REQUIRE(d_data[i] == i); + REQUIRE(a_data[i] == i + 5); + + if (i < 5) { + REQUIRE(b_data[i] == INT32_MIN); + } else { + REQUIRE(b_data[i] == i + 10); + } + } +} + +void read_with_time_travel(const std::string& array_uri, uint64_t when) { + std::vector d_data(10, INT_MAX); + std::vector a_data(10, INT_MAX); + std::vector b_data(10, INT_MAX); + + Context ctx; + Array array(ctx, array_uri, TILEDB_READ, TemporalPolicy(TimeTravel, when)); + Query query(ctx, array, TILEDB_READ); + query.set_data_buffer("d", d_data).set_data_buffer("a", a_data); + + auto matcher = Catch::Matchers::ContainsSubstring("There is no field b"); + REQUIRE_THROWS_WITH(query.set_data_buffer("b", b_data), matcher); + + REQUIRE(query.submit() == Query::Status::COMPLETE); + + for (int32_t i = 0; i < 10; i++) { + if (i < 5) { + REQUIRE(d_data[i] == i); + REQUIRE(a_data[i] == i + 5); + REQUIRE(b_data[i] == INT_MAX); + } else { + REQUIRE(d_data[i] == INT_MAX); + REQUIRE(a_data[i] == INT_MAX); + REQUIRE(b_data[i] == INT_MAX); + } + } +} diff --git a/test/src/unit-capi-array_schema.cc b/test/src/unit-capi-array_schema.cc index 767115574533..b700f83182e7 100644 --- a/test/src/unit-capi-array_schema.cc +++ b/test/src/unit-capi-array_schema.cc @@ -2149,6 +2149,8 @@ TEST_CASE_METHOD( tiledb_array_t* array; rc = tiledb_array_alloc(ctx_, array_name.c_str(), &array); REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_set_open_timestamp_end(ctx_, array, now + 1); + REQUIRE(rc == TILEDB_OK); rc = tiledb_array_open(ctx_, array, TILEDB_READ); REQUIRE(rc == TILEDB_OK); tiledb_array_schema_t* read_schema; @@ -2292,6 +2294,8 @@ TEST_CASE_METHOD( tiledb_array_t* array; rc = tiledb_array_alloc(ctx_, array_name.c_str(), &array); REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_set_open_timestamp_end(ctx_, array, now + 1); + REQUIRE(rc == TILEDB_OK); rc = tiledb_array_open(ctx_, array, TILEDB_READ); REQUIRE(rc == TILEDB_OK); tiledb_array_schema_t* read_schema; diff --git a/tiledb/sm/array/array_directory.cc b/tiledb/sm/array/array_directory.cc index 5a223fd25c75..86fa172a307c 100644 --- a/tiledb/sm/array/array_directory.cc +++ b/tiledb/sm/array/array_directory.cc @@ -393,8 +393,7 @@ Status ArrayDirectory::load() { "Cannot open array; Array does not exist.")); } - // Set the latest array schema URI - latest_array_schema_uri_ = array_schema_uris_.back(); + latest_array_schema_uri_ = select_latest_array_schema_uri(); assert(!latest_array_schema_uri_.is_invalid()); } @@ -1214,6 +1213,44 @@ Status ArrayDirectory::compute_array_schema_uris( return Status::Ok(); } +URI ArrayDirectory::select_latest_array_schema_uri() { + // Set the latest array schema URI. When in READ mode, the latest array + // schema URI is the schema with the largest timestamp less than or equal + // to the current timestamp_end_. If no schema meets this definition, we + // use the first schema available. + // + // The reason for choosing the oldest array schema URI even when time + // traveling before it existed is to first, not break any arrays that have + // fragments written before the first schema existed. The second reason is + // to not break old arrays that only have the old `__array_schema.tdb` + // URI which does not have timestamps. + if (mode_ != ArrayDirectoryMode::READ) { + return array_schema_uris_.back(); + } + + optional latest_uri = nullopt; + uint64_t latest_ts = 0; + + for (auto& uri : array_schema_uris_) { + auto name = uri.remove_trailing_slash().last_path_part(); + + // Skip the old schema URI name since it doesn't have timestamps + if (name == constants::array_schema_filename) { + continue; + } + + std::pair ts_range; + throw_if_not_ok(utils::parse::get_timestamp_range(uri, &ts_range)); + + if (ts_range.second > latest_ts && ts_range.second <= timestamp_end_) { + latest_uri = uri; + latest_ts = ts_range.second; + } + } + + return latest_uri.value_or(array_schema_uris_.front()); +} + bool ArrayDirectory::is_vacuum_file(const URI& uri) const { if (utils::parse::ends_with(uri.to_string(), constants::vacuum_file_suffix)) return true; diff --git a/tiledb/sm/array/array_directory.h b/tiledb/sm/array/array_directory.h index ca8faf79ffcb..0d17b64af023 100644 --- a/tiledb/sm/array/array_directory.h +++ b/tiledb/sm/array/array_directory.h @@ -764,6 +764,13 @@ class ArrayDirectory { Status compute_array_schema_uris( const std::vector& array_schema_dir_uris); + /** + * Select the URI to use for the latest array schema. + * + * @return URI The latest array schema URI to use. + */ + URI select_latest_array_schema_uri(); + /** * Checks if a fragment overlaps with the array directory timestamp * range. Overlap is partial or full depending on the consolidation