diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index ada9b473a..141b74537 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -25,6 +25,7 @@ set(ICEBERG_SOURCES data/position_delete_writer.cc data/writer.cc delete_file_index.cc + deletes/roaring_position_bitmap.cc expression/aggregate.cc expression/binder.cc expression/evaluator.cc @@ -165,6 +166,7 @@ iceberg_install_all_headers(iceberg) add_subdirectory(catalog) add_subdirectory(data) +add_subdirectory(deletes) add_subdirectory(expression) add_subdirectory(manifest) add_subdirectory(row) diff --git a/src/iceberg/deletes/CMakeLists.txt b/src/iceberg/deletes/CMakeLists.txt new file mode 100644 index 000000000..2ce7ccf15 --- /dev/null +++ b/src/iceberg/deletes/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +iceberg_install_all_headers(iceberg/deletes) diff --git a/src/iceberg/deletes/meson.build b/src/iceberg/deletes/meson.build new file mode 100644 index 000000000..28a01de16 --- /dev/null +++ b/src/iceberg/deletes/meson.build @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers(['roaring_position_bitmap.h'], subdir: 'iceberg/deletes') diff --git a/src/iceberg/deletes/roaring_position_bitmap.cc b/src/iceberg/deletes/roaring_position_bitmap.cc new file mode 100644 index 000000000..2831c6181 --- /dev/null +++ b/src/iceberg/deletes/roaring_position_bitmap.cc @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/deletes/roaring_position_bitmap.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "roaring/roaring.hh" + +namespace iceberg { + +namespace { + +constexpr size_t kBitmapCountSizeBytes = 8; +constexpr size_t kBitmapKeySizeBytes = 4; + +// Extracts high 32 bits from a 64-bit position (the key). +int32_t Key(int64_t pos) { return static_cast(pos >> 32); } + +// Extracts low 32 bits from a 64-bit position. +uint32_t Pos32Bits(int64_t pos) { return static_cast(pos); } + +// Combines key (high 32 bits) and pos32 (low 32 bits) into a 64-bit +// position. The low 32 bits are zero-extended to avoid sign extension. +int64_t ToPosition(int32_t key, uint32_t pos32) { + return (static_cast(key) << 32) | static_cast(pos32); +} + +void WriteLE64(char* buf, int64_t value) { + auto v = static_cast(value); + for (int i = 0; i < 8; ++i) { + buf[i] = static_cast((v >> (i * 8)) & 0xFF); + } +} + +void WriteLE32(char* buf, int32_t value) { + auto v = static_cast(value); + for (int i = 0; i < 4; ++i) { + buf[i] = static_cast((v >> (i * 8)) & 0xFF); + } +} + +int64_t ReadLE64(const char* buf) { + auto b = reinterpret_cast(buf); + uint64_t v = 0; + for (int i = 0; i < 8; ++i) { + v |= static_cast(b[i]) << (i * 8); + } + return static_cast(v); +} + +int32_t ReadLE32(const char* buf) { + auto b = reinterpret_cast(buf); + uint32_t v = 0; + for (int i = 0; i < 4; ++i) { + v |= static_cast(b[i]) << (i * 8); + } + return static_cast(v); +} + +Status ValidatePosition(int64_t pos) { + if (pos < 0 || pos > RoaringPositionBitmap::kMaxPosition) { + return InvalidArgument("Bitmap supports positions that are >= 0 and <= {}: {}", + RoaringPositionBitmap::kMaxPosition, pos); + } + return {}; +} + +} // namespace + +struct RoaringPositionBitmap::Impl { + std::vector bitmaps; + + void AllocateBitmapsIfNeeded(int32_t required_length) { + if (static_cast(bitmaps.size()) < required_length) { + bitmaps.resize(static_cast(required_length)); + } + } +}; + +RoaringPositionBitmap::RoaringPositionBitmap() : impl_(std::make_unique()) {} + +RoaringPositionBitmap::~RoaringPositionBitmap() = default; + +RoaringPositionBitmap::RoaringPositionBitmap(RoaringPositionBitmap&&) noexcept = default; + +RoaringPositionBitmap& RoaringPositionBitmap::operator=( + RoaringPositionBitmap&&) noexcept = default; + +RoaringPositionBitmap::RoaringPositionBitmap(std::unique_ptr impl) + : impl_(std::move(impl)) {} + +Status RoaringPositionBitmap::Add(int64_t pos) { + if (auto status = ValidatePosition(pos); !status) { + return status; + } + int32_t key = Key(pos); + uint32_t pos32 = Pos32Bits(pos); + impl_->AllocateBitmapsIfNeeded(key + 1); + impl_->bitmaps[key].add(pos32); + return {}; +} + +Status RoaringPositionBitmap::AddRange(int64_t pos_start, int64_t pos_end) { + for (int64_t pos = pos_start; pos < pos_end; ++pos) { + if (auto status = Add(pos); !status) { + return status; + } + } + return {}; +} + +Result RoaringPositionBitmap::Contains(int64_t pos) const { + if (auto status = ValidatePosition(pos); !status) { + return std::unexpected(status.error()); + } + int32_t key = Key(pos); + uint32_t pos32 = Pos32Bits(pos); + return key < static_cast(impl_->bitmaps.size()) && + impl_->bitmaps[key].contains(pos32); +} + +bool RoaringPositionBitmap::IsEmpty() const { return Cardinality() == 0; } + +int64_t RoaringPositionBitmap::Cardinality() const { + int64_t total = 0; + for (const auto& bitmap : impl_->bitmaps) { + total += static_cast(bitmap.cardinality()); + } + return total; +} + +void RoaringPositionBitmap::Or(const RoaringPositionBitmap& other) { + impl_->AllocateBitmapsIfNeeded(static_cast(other.impl_->bitmaps.size())); + for (size_t key = 0; key < other.impl_->bitmaps.size(); ++key) { + impl_->bitmaps[key] |= other.impl_->bitmaps[key]; + } +} + +bool RoaringPositionBitmap::RunLengthEncode() { + bool changed = false; + for (auto& bitmap : impl_->bitmaps) { + changed |= bitmap.runOptimize(); + } + return changed; +} + +void RoaringPositionBitmap::ForEach(const std::function& fn) const { + for (size_t key = 0; key < impl_->bitmaps.size(); ++key) { + for (uint32_t pos32 : impl_->bitmaps[key]) { + fn(ToPosition(static_cast(key), pos32)); + } + } +} + +int64_t RoaringPositionBitmap::SerializedSizeInBytes() const { + int64_t size = static_cast(kBitmapCountSizeBytes); + for (const auto& bitmap : impl_->bitmaps) { + size += static_cast(kBitmapKeySizeBytes) + + static_cast(bitmap.getSizeInBytes(/*portable=*/true)); + } + return size; +} + +Result RoaringPositionBitmap::Serialize() const { + int64_t size = SerializedSizeInBytes(); + std::string result(static_cast(size), '\0'); + char* buf = result.data(); + + // Write bitmap count (array length including empties) + WriteLE64(buf, static_cast(impl_->bitmaps.size())); + buf += kBitmapCountSizeBytes; + + // Write each bitmap with its key + for (int32_t key = 0; key < static_cast(impl_->bitmaps.size()); ++key) { + WriteLE32(buf, key); + buf += kBitmapKeySizeBytes; + size_t written = impl_->bitmaps[key].write(buf, /*portable=*/true); + buf += written; + } + + return result; +} + +Result RoaringPositionBitmap::Deserialize(std::string_view bytes) { + const char* buf = bytes.data(); + size_t remaining = bytes.size(); + + if (remaining < kBitmapCountSizeBytes) { + return InvalidArgument("Buffer too small for bitmap count"); + } + + int64_t bitmap_count = ReadLE64(buf); + buf += kBitmapCountSizeBytes; + remaining -= kBitmapCountSizeBytes; + + if (bitmap_count < 0 || bitmap_count > std::numeric_limits::max()) { + return InvalidArgument("Invalid bitmap count: {}", bitmap_count); + } + + auto impl = std::make_unique(); + int32_t last_key = -1; + int32_t remaining_count = static_cast(bitmap_count); + + while (remaining_count > 0) { + if (remaining < kBitmapKeySizeBytes) { + return InvalidArgument("Buffer too small for bitmap key"); + } + + int32_t key = ReadLE32(buf); + buf += kBitmapKeySizeBytes; + remaining -= kBitmapKeySizeBytes; + + // Validate key (matches Java's readKey) + if (key < 0) { + return InvalidArgument("Invalid unsigned key: {}", key); + } + if (key > std::numeric_limits::max() - 1) { + return InvalidArgument("Key is too large: {}", key); + } + if (key <= last_key) { + return InvalidArgument("Keys must be sorted in ascending order"); + } + + // Fill gaps with empty bitmaps + while (last_key < key - 1) { + impl->bitmaps.emplace_back(); + ++last_key; + } + + // Read bitmap using portable safe deserialization. + // CRoaring's readSafe may throw on corrupted data. + roaring::Roaring bitmap; + try { + bitmap = roaring::Roaring::readSafe(buf, remaining); + } catch (const std::exception& e) { + return InvalidArgument("Failed to deserialize bitmap at key {}: {}", key, e.what()); + } + size_t bitmap_size = bitmap.getSizeInBytes(/*portable=*/true); + if (bitmap_size > remaining) { + return InvalidArgument("Buffer too small for bitmap data at key {}", key); + } + buf += bitmap_size; + remaining -= bitmap_size; + + impl->bitmaps.push_back(std::move(bitmap)); + last_key = key; + --remaining_count; + } + + return RoaringPositionBitmap(std::move(impl)); +} + +} // namespace iceberg diff --git a/src/iceberg/deletes/roaring_position_bitmap.h b/src/iceberg/deletes/roaring_position_bitmap.h new file mode 100644 index 000000000..b7f0a8afe --- /dev/null +++ b/src/iceberg/deletes/roaring_position_bitmap.h @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/deletes/roaring_position_bitmap.h +/// A 64-bit position bitmap using an array of 32-bit Roaring bitmaps. + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief A bitmap that supports positive 64-bit positions, optimized +/// for cases where most positions fit in 32 bits. +/// +/// Incoming 64-bit positions are divided into a 32-bit "key" using the +/// most significant 4 bytes and a 32-bit position using the least +/// significant 4 bytes. For each key, a 32-bit Roaring bitmap is +/// maintained to store positions for that key. +/// +/// This is a faithful C++ port of Java's RoaringPositionBitmap with +/// wire-compatible serialization format. +/// +/// Note: The Puffin deletion-vector-v1 wrapping (length prefix, magic +/// bytes, CRC-32) is handled by the Puffin writer/reader layer, not +/// this class. +class ICEBERG_EXPORT RoaringPositionBitmap { + public: + /// Maximum supported position. + /// Equivalent to Java's + /// toPosition(Integer.MAX_VALUE - 1, Integer.MIN_VALUE). + static constexpr int64_t kMaxPosition = 0x7FFFFFFE80000000LL; + + RoaringPositionBitmap(); + + ~RoaringPositionBitmap(); + + RoaringPositionBitmap(RoaringPositionBitmap&& other) noexcept; + RoaringPositionBitmap& operator=(RoaringPositionBitmap&& other) noexcept; + + RoaringPositionBitmap(const RoaringPositionBitmap&) = delete; + RoaringPositionBitmap& operator=(const RoaringPositionBitmap&) = delete; + + /// \brief Sets a position in the bitmap. + /// \param pos the position (must be >= 0 and <= kMaxPosition) + /// \return Status indicating success or InvalidArgument error + [[nodiscard]] Status Add(int64_t pos); + + /// \brief Sets a range of positions [pos_start, pos_end). + /// \return Status indicating success or InvalidArgument error + [[nodiscard]] Status AddRange(int64_t pos_start, int64_t pos_end); + + /// \brief Checks if a position is set in the bitmap. + /// \return Result or InvalidArgument error + [[nodiscard]] Result Contains(int64_t pos) const; + + /// \brief Returns true if the bitmap has no positions set. + bool IsEmpty() const; + + /// \brief Returns the number of set positions in the bitmap. + int64_t Cardinality() const; + + /// \brief Merges all positions from the other bitmap into this one + /// (in-place union). + void Or(const RoaringPositionBitmap& other); + + /// \brief Applies run-length encoding wherever more space efficient. + /// \return true if the bitmap was changed + bool RunLengthEncode(); + + /// \brief Iterates over all set positions in ascending order. + void ForEach(const std::function& fn) const; + + /// \brief Returns the serialized size in bytes. + int64_t SerializedSizeInBytes() const; + + /// \brief Serializes using the portable format (little-endian). + Result Serialize() const; + + /// \brief Deserializes a bitmap from bytes. + static Result Deserialize(std::string_view bytes); + + private: + struct Impl; + std::unique_ptr impl_; + + explicit RoaringPositionBitmap(std::unique_ptr impl); +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 81af8dc30..4648e6fe3 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -43,6 +43,7 @@ iceberg_sources = files( 'arrow_c_data_guard_internal.cc', 'catalog/memory/in_memory_catalog.cc', 'delete_file_index.cc', + 'deletes/roaring_position_bitmap.cc', 'expression/aggregate.cc', 'expression/binder.cc', 'expression/evaluator.cc', @@ -220,6 +221,7 @@ install_headers( ) subdir('catalog') +subdir('deletes') subdir('expression') subdir('manifest') subdir('row') diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..995579b20 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -124,6 +124,8 @@ add_iceberg_test(util_test add_iceberg_test(roaring_test SOURCES roaring_test.cc) +add_iceberg_test(roaring_position_bitmap_test SOURCES roaring_position_bitmap_test.cc) + if(ICEBERG_BUILD_BUNDLE) add_iceberg_test(avro_test USE_BUNDLE diff --git a/src/iceberg/test/resources/64map32bitvals.bin b/src/iceberg/test/resources/64map32bitvals.bin new file mode 100644 index 000000000..475b89441 Binary files /dev/null and b/src/iceberg/test/resources/64map32bitvals.bin differ diff --git a/src/iceberg/test/resources/64mapempty.bin b/src/iceberg/test/resources/64mapempty.bin new file mode 100644 index 000000000..1b1cb4d44 Binary files /dev/null and b/src/iceberg/test/resources/64mapempty.bin differ diff --git a/src/iceberg/test/resources/64maphighvals.bin b/src/iceberg/test/resources/64maphighvals.bin new file mode 100644 index 000000000..d4312b8d2 Binary files /dev/null and b/src/iceberg/test/resources/64maphighvals.bin differ diff --git a/src/iceberg/test/resources/64mapspreadvals.bin b/src/iceberg/test/resources/64mapspreadvals.bin new file mode 100644 index 000000000..83c72f6ba Binary files /dev/null and b/src/iceberg/test/resources/64mapspreadvals.bin differ diff --git a/src/iceberg/test/roaring_position_bitmap_test.cc b/src/iceberg/test/roaring_position_bitmap_test.cc new file mode 100644 index 000000000..4ecb69607 --- /dev/null +++ b/src/iceberg/test/roaring_position_bitmap_test.cc @@ -0,0 +1,519 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/deletes/roaring_position_bitmap.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "iceberg/test/test_config.h" + +namespace iceberg { + +namespace { + +constexpr int64_t kBitmapSize = 0xFFFFFFFFL; +constexpr int64_t kBitmapOffset = kBitmapSize + 1L; +constexpr int64_t kContainerSize = 0xFFFF; // Character.MAX_VALUE +constexpr int64_t kContainerOffset = kContainerSize + 1L; + +// Helper to construct a position from bitmap index, container index, +// and value (mirrors Java's position() test helper). +int64_t Position(int32_t bitmap_index, int32_t container_index, int64_t value) { + return bitmap_index * kBitmapOffset + container_index * kContainerOffset + value; +} + +std::string ReadTestResource(const std::string& filename) { + std::filesystem::path path = std::filesystem::path(ICEBERG_TEST_RESOURCES) / filename; + std::ifstream file(path, std::ios::binary); + EXPECT_TRUE(file.good()) << "Cannot open: " << path; + return {std::istreambuf_iterator(file), std::istreambuf_iterator()}; +} + +RoaringPositionBitmap RoundTripSerialize(const RoaringPositionBitmap& bitmap) { + auto serialized = bitmap.Serialize(); + EXPECT_TRUE(serialized.has_value()); + auto deserialized = RoaringPositionBitmap::Deserialize(serialized.value()); + EXPECT_TRUE(deserialized.has_value()); + return std::move(deserialized.value()); +} + +void AssertEqualContent(const RoaringPositionBitmap& bitmap, + const std::set& positions) { + ASSERT_EQ(bitmap.Cardinality(), static_cast(positions.size())); + for (int64_t pos : positions) { + auto result = bitmap.Contains(pos); + ASSERT_TRUE(result.has_value()) << "Error for pos: " << pos; + ASSERT_TRUE(result.value()) << "Missing position: " << pos; + } + bitmap.ForEach([&](int64_t pos) { + ASSERT_TRUE(positions.count(pos) > 0) << "Unexpected position: " << pos; + }); +} + +void AssertEqual(RoaringPositionBitmap& bitmap, const std::set& positions) { + AssertEqualContent(bitmap, positions); + + auto copy1 = RoundTripSerialize(bitmap); + AssertEqualContent(copy1, positions); + + bitmap.RunLengthEncode(); + auto copy2 = RoundTripSerialize(bitmap); + AssertEqualContent(copy2, positions); +} + +} // namespace + +// Mirrors Java's testAdd +TEST(RoaringPositionBitmapTest, TestAdd) { + RoaringPositionBitmap bitmap; + + EXPECT_TRUE(bitmap.Add(10L).has_value()); + EXPECT_TRUE(bitmap.Contains(10L).value()); + + EXPECT_TRUE(bitmap.Add(0L).has_value()); + EXPECT_TRUE(bitmap.Contains(0L).value()); + + EXPECT_TRUE(bitmap.Add(10L).has_value()); // duplicate + EXPECT_TRUE(bitmap.Contains(10L).value()); +} + +// Mirrors Java's testAddPositionsRequiringMultipleBitmaps +TEST(RoaringPositionBitmapTest, TestAddPositionsRequiringMultipleBitmaps) { + RoaringPositionBitmap bitmap; + + int64_t pos1 = (static_cast(0) << 32) | 10L; + int64_t pos2 = (static_cast(1) << 32) | 20L; + int64_t pos3 = (static_cast(2) << 32) | 30L; + int64_t pos4 = (static_cast(100) << 32) | 40L; + + EXPECT_TRUE(bitmap.Add(pos1).has_value()); + EXPECT_TRUE(bitmap.Add(pos2).has_value()); + EXPECT_TRUE(bitmap.Add(pos3).has_value()); + EXPECT_TRUE(bitmap.Add(pos4).has_value()); + + EXPECT_TRUE(bitmap.Contains(pos1).value()); + EXPECT_TRUE(bitmap.Contains(pos2).value()); + EXPECT_TRUE(bitmap.Contains(pos3).value()); + EXPECT_TRUE(bitmap.Contains(pos4).value()); + EXPECT_EQ(bitmap.Cardinality(), 4); + EXPECT_GT(bitmap.SerializedSizeInBytes(), 4); +} + +// Mirrors Java's testAddRange +TEST(RoaringPositionBitmapTest, TestAddRange) { + RoaringPositionBitmap bitmap; + + int64_t start = 10; + int64_t end = 20; + EXPECT_TRUE(bitmap.AddRange(start, end).has_value()); + + for (int64_t pos = start; pos < end; ++pos) { + EXPECT_TRUE(bitmap.Contains(pos).value()); + } + EXPECT_FALSE(bitmap.Contains(9).value()); + EXPECT_FALSE(bitmap.Contains(20).value()); + EXPECT_EQ(bitmap.Cardinality(), 10); +} + +// Mirrors Java's testAddRangeAcrossKeys +TEST(RoaringPositionBitmapTest, TestAddRangeAcrossKeys) { + RoaringPositionBitmap bitmap; + + int64_t start = (static_cast(1) << 32) - 5; + int64_t end = (static_cast(1) << 32) + 5; + EXPECT_TRUE(bitmap.AddRange(start, end).has_value()); + + for (int64_t pos = start; pos < end; ++pos) { + EXPECT_TRUE(bitmap.Contains(pos).value()); + } + EXPECT_FALSE(bitmap.Contains(0).value()); + EXPECT_FALSE(bitmap.Contains(end).value()); + EXPECT_EQ(bitmap.Cardinality(), 10); +} + +// Mirrors Java's testAddEmptyRange +TEST(RoaringPositionBitmapTest, TestAddEmptyRange) { + RoaringPositionBitmap bitmap; + EXPECT_TRUE(bitmap.AddRange(10, 10).has_value()); + EXPECT_TRUE(bitmap.IsEmpty()); +} + +// Mirrors Java's testAddAll +TEST(RoaringPositionBitmapTest, TestOr) { + RoaringPositionBitmap bitmap1; + EXPECT_TRUE(bitmap1.Add(10L).has_value()); + EXPECT_TRUE(bitmap1.Add(20L).has_value()); + + RoaringPositionBitmap bitmap2; + EXPECT_TRUE(bitmap2.Add(30L).has_value()); + EXPECT_TRUE(bitmap2.Add(40L).has_value()); + EXPECT_TRUE(bitmap2.Add(static_cast(2) << 32).has_value()); + + bitmap1.Or(bitmap2); + + EXPECT_TRUE(bitmap1.Contains(10L).value()); + EXPECT_TRUE(bitmap1.Contains(20L).value()); + EXPECT_TRUE(bitmap1.Contains(30L).value()); + EXPECT_TRUE(bitmap1.Contains(40L).value()); + EXPECT_TRUE(bitmap1.Contains(static_cast(2) << 32).value()); + EXPECT_EQ(bitmap1.Cardinality(), 5); + + // bitmap2 should be unchanged + EXPECT_FALSE(bitmap2.Contains(10L).value()); + EXPECT_FALSE(bitmap2.Contains(20L).value()); + EXPECT_EQ(bitmap2.Cardinality(), 3); +} + +// Mirrors Java's testAddAllWithEmptyBitmap +TEST(RoaringPositionBitmapTest, TestOrWithEmptyBitmap) { + RoaringPositionBitmap bitmap1; + EXPECT_TRUE(bitmap1.Add(10L).has_value()); + EXPECT_TRUE(bitmap1.Add(20L).has_value()); + + RoaringPositionBitmap empty_bitmap; + bitmap1.Or(empty_bitmap); + + EXPECT_TRUE(bitmap1.Contains(10L).value()); + EXPECT_TRUE(bitmap1.Contains(20L).value()); + EXPECT_EQ(bitmap1.Cardinality(), 2); + + EXPECT_FALSE(empty_bitmap.Contains(10L).value()); + EXPECT_FALSE(empty_bitmap.Contains(20L).value()); + EXPECT_EQ(empty_bitmap.Cardinality(), 0); + EXPECT_TRUE(empty_bitmap.IsEmpty()); +} + +// Mirrors Java's testAddAllWithOverlappingBitmap +TEST(RoaringPositionBitmapTest, TestOrWithOverlapping) { + RoaringPositionBitmap bitmap1; + EXPECT_TRUE(bitmap1.Add(10L).has_value()); + EXPECT_TRUE(bitmap1.Add(20L).has_value()); + EXPECT_TRUE(bitmap1.Add(30L).has_value()); + + RoaringPositionBitmap bitmap2; + EXPECT_TRUE(bitmap2.Add(20L).has_value()); + EXPECT_TRUE(bitmap2.Add(40L).has_value()); + + bitmap1.Or(bitmap2); + + EXPECT_TRUE(bitmap1.Contains(10L).value()); + EXPECT_TRUE(bitmap1.Contains(20L).value()); + EXPECT_TRUE(bitmap1.Contains(30L).value()); + EXPECT_TRUE(bitmap1.Contains(40L).value()); + EXPECT_EQ(bitmap1.Cardinality(), 4); + + EXPECT_FALSE(bitmap2.Contains(10L).value()); + EXPECT_TRUE(bitmap2.Contains(20L).value()); + EXPECT_FALSE(bitmap2.Contains(30L).value()); + EXPECT_TRUE(bitmap2.Contains(40L).value()); + EXPECT_EQ(bitmap2.Cardinality(), 2); +} + +// Mirrors Java's testAddAllSparseBitmaps +TEST(RoaringPositionBitmapTest, TestOrSparseBitmaps) { + RoaringPositionBitmap bitmap1; + EXPECT_TRUE(bitmap1.Add((static_cast(0) << 32) | 100L).has_value()); + EXPECT_TRUE(bitmap1.Add((static_cast(1) << 32) | 200L).has_value()); + + RoaringPositionBitmap bitmap2; + EXPECT_TRUE(bitmap2.Add((static_cast(2) << 32) | 300L).has_value()); + EXPECT_TRUE(bitmap2.Add((static_cast(3) << 32) | 400L).has_value()); + + bitmap1.Or(bitmap2); + + EXPECT_TRUE(bitmap1.Contains((static_cast(0) << 32) | 100L).value()); + EXPECT_TRUE(bitmap1.Contains((static_cast(1) << 32) | 200L).value()); + EXPECT_TRUE(bitmap1.Contains((static_cast(2) << 32) | 300L).value()); + EXPECT_TRUE(bitmap1.Contains((static_cast(3) << 32) | 400L).value()); + EXPECT_EQ(bitmap1.Cardinality(), 4); +} + +// Mirrors Java's testCardinality +TEST(RoaringPositionBitmapTest, TestCardinality) { + RoaringPositionBitmap bitmap; + + EXPECT_EQ(bitmap.Cardinality(), 0); + + EXPECT_TRUE(bitmap.Add(10L).has_value()); + EXPECT_TRUE(bitmap.Add(20L).has_value()); + EXPECT_TRUE(bitmap.Add(30L).has_value()); + EXPECT_EQ(bitmap.Cardinality(), 3); + + EXPECT_TRUE(bitmap.Add(10L).has_value()); // already exists + EXPECT_EQ(bitmap.Cardinality(), 3); +} + +// Mirrors Java's testCardinalitySparseBitmaps +TEST(RoaringPositionBitmapTest, TestCardinalitySparseBitmaps) { + RoaringPositionBitmap bitmap; + + EXPECT_TRUE(bitmap.Add((static_cast(0) << 32) | 100L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(0) << 32) | 101L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(0) << 32) | 105L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(1) << 32) | 200L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(100) << 32) | 300L).has_value()); + + EXPECT_EQ(bitmap.Cardinality(), 5); +} + +// Basic serialize/deserialize round trip +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeRoundTrip) { + RoaringPositionBitmap bitmap; + EXPECT_TRUE(bitmap.Add(10L).has_value()); + EXPECT_TRUE(bitmap.Add(20L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(1) << 32) | 30L).has_value()); + + auto copy = RoundTripSerialize(bitmap); + + EXPECT_EQ(copy.Cardinality(), bitmap.Cardinality()); + EXPECT_TRUE(copy.Contains(10L).value()); + EXPECT_TRUE(copy.Contains(20L).value()); + EXPECT_TRUE(copy.Contains((static_cast(1) << 32) | 30L).value()); +} + +// Serialize/deserialize empty bitmap +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeEmpty) { + RoaringPositionBitmap bitmap; + auto copy = RoundTripSerialize(bitmap); + EXPECT_TRUE(copy.IsEmpty()); + EXPECT_EQ(copy.Cardinality(), 0); +} + +// Mirrors Java's testSerializeDeserializeAllContainerBitmap +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeAllContainerBitmap) { + RoaringPositionBitmap bitmap; + + // bitmap 0, container 0 (array - few elements) + EXPECT_TRUE(bitmap.Add(Position(0, 0, 5)).has_value()); + EXPECT_TRUE(bitmap.Add(Position(0, 0, 7)).has_value()); + + // bitmap 0, container 1 (array that can be compressed) + EXPECT_TRUE(bitmap.AddRange(Position(0, 1, 1), Position(0, 1, 1000)).has_value()); + + // bitmap 0, container 2 (bitset - nearly full container) + EXPECT_TRUE(bitmap.AddRange(Position(0, 2, 1), Position(0, 2, kContainerOffset - 1)) + .has_value()); + + // bitmap 1, container 0 (array) + EXPECT_TRUE(bitmap.Add(Position(1, 0, 10)).has_value()); + EXPECT_TRUE(bitmap.Add(Position(1, 0, 20)).has_value()); + + // bitmap 1, container 1 (array that can be compressed) + EXPECT_TRUE(bitmap.AddRange(Position(1, 1, 10), Position(1, 1, 500)).has_value()); + + // bitmap 1, container 2 (bitset) + EXPECT_TRUE(bitmap.AddRange(Position(1, 2, 1), Position(1, 2, kContainerOffset - 1)) + .has_value()); + + EXPECT_TRUE(bitmap.RunLengthEncode()); + + auto copy = RoundTripSerialize(bitmap); + + EXPECT_EQ(copy.Cardinality(), bitmap.Cardinality()); + copy.ForEach([&](int64_t pos) { EXPECT_TRUE(bitmap.Contains(pos).value()); }); + bitmap.ForEach([&](int64_t pos) { EXPECT_TRUE(copy.Contains(pos).value()); }); +} + +// Test ForEach iterates in ascending order +TEST(RoaringPositionBitmapTest, TestForEach) { + RoaringPositionBitmap bitmap; + EXPECT_TRUE(bitmap.Add(30L).has_value()); + EXPECT_TRUE(bitmap.Add(10L).has_value()); + EXPECT_TRUE(bitmap.Add(20L).has_value()); + EXPECT_TRUE(bitmap.Add((static_cast(1) << 32) | 5L).has_value()); + + std::vector positions; + bitmap.ForEach([&](int64_t pos) { positions.push_back(pos); }); + + ASSERT_EQ(positions.size(), 4u); + EXPECT_EQ(positions[0], 10L); + EXPECT_EQ(positions[1], 20L); + EXPECT_EQ(positions[2], 30L); + EXPECT_EQ(positions[3], (static_cast(1) << 32) | 5L); +} + +TEST(RoaringPositionBitmapTest, TestIsEmpty) { + RoaringPositionBitmap bitmap; + EXPECT_TRUE(bitmap.IsEmpty()); + + EXPECT_TRUE(bitmap.Add(10L).has_value()); + EXPECT_FALSE(bitmap.IsEmpty()); +} + +TEST(RoaringPositionBitmapTest, TestRunLengthEncode) { + RoaringPositionBitmap bitmap; + EXPECT_TRUE(bitmap.AddRange(0, 10000).has_value()); + + bool changed = bitmap.RunLengthEncode(); + EXPECT_TRUE(changed); + + // Content should be unchanged after RLE optimization + EXPECT_EQ(bitmap.Cardinality(), 10000); + for (int64_t i = 0; i < 10000; ++i) { + EXPECT_TRUE(bitmap.Contains(i).value()); + } + + // Round-trip should preserve content after RLE + auto copy = RoundTripSerialize(bitmap); + EXPECT_EQ(copy.Cardinality(), 10000); +} + +// Mirrors Java's testUnsupportedPositions +TEST(RoaringPositionBitmapTest, TestUnsupportedPositions) { + RoaringPositionBitmap bitmap; + + // Negative position + auto status = bitmap.Add(-1L); + EXPECT_FALSE(status.has_value()); + EXPECT_EQ(status.error().kind, ErrorKind::kInvalidArgument); + + // Contains with negative position + auto result = bitmap.Contains(-1L); + EXPECT_FALSE(result.has_value()); + EXPECT_EQ(result.error().kind, ErrorKind::kInvalidArgument); + + // Position exceeding MAX_POSITION + status = bitmap.Add(RoaringPositionBitmap::kMaxPosition + 1L); + EXPECT_FALSE(status.has_value()); + EXPECT_EQ(status.error().kind, ErrorKind::kInvalidArgument); + + // Contains with position exceeding MAX_POSITION + result = bitmap.Contains(RoaringPositionBitmap::kMaxPosition + 1L); + EXPECT_FALSE(result.has_value()); + EXPECT_EQ(result.error().kind, ErrorKind::kInvalidArgument); +} + +// Mirrors Java's testDeserializeSupportedRoaringExamples +TEST(RoaringPositionBitmapTest, TestDeserializeSupportedRoaringExamples) { + for (const auto& file : + {"64map32bitvals.bin", "64mapempty.bin", "64mapspreadvals.bin"}) { + std::string data = ReadTestResource(file); + auto result = RoaringPositionBitmap::Deserialize(data); + EXPECT_TRUE(result.has_value()) + << "Failed to deserialize " << file << ": " << result.error().message; + } +} + +// Mirrors Java's testDeserializeUnsupportedRoaringExample +TEST(RoaringPositionBitmapTest, TestDeserializeUnsupportedRoaringExample) { + // This file contains a value with key larger than max supported + std::string data = ReadTestResource("64maphighvals.bin"); + auto result = RoaringPositionBitmap::Deserialize(data); + EXPECT_FALSE(result.has_value()); + EXPECT_NE(result.error().message.find("Invalid unsigned key"), std::string::npos) + << "Actual error: " << result.error().message; +} + +// Mirrors Java's testRandomSparseBitmap +TEST(RoaringPositionBitmapTest, TestRandomSparseBitmap) { + std::mt19937_64 rng(42); + RoaringPositionBitmap bitmap; + std::set positions; + + std::uniform_int_distribution dist(0, static_cast(5) << 32); + + for (int i = 0; i < 100000; ++i) { + int64_t pos = dist(rng); + positions.insert(pos); + ASSERT_TRUE(bitmap.Add(pos).has_value()); + } + + AssertEqual(bitmap, positions); + + // Random lookups (mirrors Java's assertRandomPositions) + std::mt19937_64 rng2(123); + std::uniform_int_distribution lookup_dist(0, + RoaringPositionBitmap::kMaxPosition); + for (int i = 0; i < 20000; ++i) { + int64_t pos = lookup_dist(rng2); + auto result = bitmap.Contains(pos); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), positions.count(pos) > 0); + } +} + +// Mirrors Java's testRandomDenseBitmap +TEST(RoaringPositionBitmapTest, TestRandomDenseBitmap) { + RoaringPositionBitmap bitmap; + std::set positions; + + // Create dense ranges across multiple bitmap keys + for (int64_t offset : {static_cast(0), static_cast(2) << 32, + static_cast(5) << 32}) { + for (int64_t i = 0; i < 10000; ++i) { + ASSERT_TRUE(bitmap.Add(offset + i).has_value()); + positions.insert(offset + i); + } + } + + AssertEqual(bitmap, positions); +} + +// Mirrors Java's testRandomMixedBitmap +TEST(RoaringPositionBitmapTest, TestRandomMixedBitmap) { + std::mt19937_64 rng(42); + RoaringPositionBitmap bitmap; + std::set positions; + + // Sparse positions in [3<<32, 5<<32) + std::uniform_int_distribution dist1(static_cast(3) << 32, + static_cast(5) << 32); + for (int i = 0; i < 50000; ++i) { + int64_t pos = dist1(rng); + positions.insert(pos); + ASSERT_TRUE(bitmap.Add(pos).has_value()); + } + + // Dense range in [0, 10000) + for (int64_t i = 0; i < 10000; ++i) { + ASSERT_TRUE(bitmap.Add(i).has_value()); + positions.insert(i); + } + + // More sparse positions in [0, 1<<32) + std::uniform_int_distribution dist2(0, static_cast(1) << 32); + for (int i = 0; i < 5000; ++i) { + int64_t pos = dist2(rng); + positions.insert(pos); + ASSERT_TRUE(bitmap.Add(pos).has_value()); + } + + AssertEqual(bitmap, positions); +} + +TEST(RoaringPositionBitmapTest, TestDeserializeInvalidData) { + // Buffer too small + auto result = RoaringPositionBitmap::Deserialize(""); + EXPECT_FALSE(result.has_value()); + + // Invalid bitmap count (very large) + std::string buf(8, '\xFF'); + result = RoaringPositionBitmap::Deserialize(buf); + EXPECT_FALSE(result.has_value()); +} + +} // namespace iceberg