From 07a4d771ceae00db7e5df35e11cf5461a3acb2a1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 14 Mar 2026 23:15:50 +0800 Subject: [PATCH] PARQUET-2249: Add IEEE-754 total order and nan count for floating types --- .../column/statistics/BinaryStatistics.java | 51 ++- .../column/statistics/DoubleStatistics.java | 20 + .../column/statistics/FloatStatistics.java | 20 + .../parquet/column/statistics/Statistics.java | 151 ++++++-- .../columnindex/BinaryColumnIndexBuilder.java | 61 ++- .../column/columnindex/ColumnIndex.java | 8 + .../columnindex/ColumnIndexBuilder.java | 239 +++++++++++- .../columnindex/DoubleColumnIndexBuilder.java | 58 ++- .../columnindex/FloatColumnIndexBuilder.java | 58 ++- .../apache/parquet/schema/ColumnOrder.java | 15 +- .../parquet/schema/LogicalTypeAnnotation.java | 8 + .../parquet/schema/PrimitiveComparator.java | 61 +++ .../apache/parquet/schema/PrimitiveType.java | 64 ++- .../statistics/TestStatisticsNanCount.java | 295 ++++++++++++++ .../TestColumnIndexBuilderNaN.java | 341 ++++++++++++++++ .../schema/TestPrimitiveComparator.java | 119 +++++- .../statisticslevel/StatisticsFilter.java | 166 +++++++- .../converter/ParquetMetadataConverter.java | 40 +- .../statisticslevel/TestStatisticsFilter.java | 365 ++++++++++++++++++ .../TestParquetMetadataConverter.java | 146 +++++++ .../statistics/TestIeee754TotalOrderE2E.java | 322 +++++++++++++++ pom.xml | 2 +- 22 files changed, 2484 insertions(+), 126 deletions(-) create mode 100644 parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatisticsNanCount.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestIeee754TotalOrderE2E.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 
87d39bf16e..444df95f03 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -19,6 +19,9 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; +import org.apache.parquet.schema.Float16; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Types; @@ -28,6 +31,7 @@ public class BinaryStatistics extends Statistics { private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("fake_binary_type"); + private final boolean isFloat16; private Binary max; private Binary min; @@ -41,26 +45,51 @@ public BinaryStatistics() { BinaryStatistics(PrimitiveType type) { super(type); + this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation; + if (isFloat16) { + incrementNanCount(0); + } } private BinaryStatistics(BinaryStatistics other) { super(other.type()); + this.isFloat16 = other.isFloat16; if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } setNumNulls(other.getNumNulls()); + incrementNanCount(other.getNanCount()); } @Override public void updateStats(Binary value) { + if (isFloat16 && Float16.isNaN(value.get2BytesLittleEndian())) { + incrementNanCount(); + } if (!this.hasNonNullValue()) { min = value.copy(); max = value.copy(); this.markAsNotEmpty(); - } else if (comparator().compare(min, value) > 0) { - min = value.copy(); - } else if (comparator().compare(max, value) < 0) { - max = value.copy(); + } else { + if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (!Float16.isNaN(value.get2BytesLittleEndian())) { + if (Float16.isNaN(min.get2BytesLittleEndian()) + || comparator().compare(min, value) > 0) { + min = 
value.copy(); + } + if (Float16.isNaN(max.get2BytesLittleEndian()) + || comparator().compare(max, value) < 0) { + max = value.copy(); + } + } + return; + } + + if (comparator().compare(min, value) > 0) { + min = value.copy(); + } else if (comparator().compare(max, value) < 0) { + max = value.copy(); + } } } @@ -126,6 +155,20 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) { */ @Deprecated public void updateStats(Binary min_value, Binary max_value) { + if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (!Float16.isNaN(min_value.get2BytesLittleEndian())) { + if (Float16.isNaN(min.get2BytesLittleEndian()) || comparator().compare(min, min_value) > 0) { + min = min_value.copy(); + } + } + if (!Float16.isNaN(max_value.get2BytesLittleEndian())) { + if (Float16.isNaN(max.get2BytesLittleEndian()) || comparator().compare(max, max_value) < 0) { + max = max_value.copy(); + } + } + return; + } + if (comparator().compare(min, min_value) > 0) { min = min_value.copy(); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 3fe8a35530..2aad980692 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Types; @@ -41,6 +42,7 @@ public DoubleStatistics() { DoubleStatistics(PrimitiveType type) { super(type); + incrementNanCount(0); } private DoubleStatistics(DoubleStatistics other) { @@ -49,10 +51,14 @@ private DoubleStatistics(DoubleStatistics other) { initializeStats(other.min, other.max); } 
setNumNulls(other.getNumNulls()); + incrementNanCount(other.getNanCount()); } @Override public void updateStats(double value) { + if (Double.isNaN(value)) { + incrementNanCount(); + } if (!this.hasNonNullValue()) { initializeStats(value, value); } else { @@ -98,6 +104,20 @@ public boolean isSmallerThan(long size) { } public void updateStats(double min_value, double max_value) { + if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (!Double.isNaN(min_value)) { + if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) { + min = min_value; + } + } + if (!Double.isNaN(max_value)) { + if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) { + max = max_value; + } + } + return; + } + if (comparator().compare(min, min_value) > 0) { min = min_value; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 5b743b6884..ce85cf013e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Types; @@ -42,6 +43,7 @@ public FloatStatistics() { FloatStatistics(PrimitiveType type) { super(type); + incrementNanCount(0); } private FloatStatistics(FloatStatistics other) { @@ -50,10 +52,14 @@ private FloatStatistics(FloatStatistics other) { initializeStats(other.min, other.max); } setNumNulls(other.getNumNulls()); + incrementNanCount(other.getNanCount()); } @Override public void updateStats(float value) { + if (Float.isNaN(value)) { + incrementNanCount(); + } if (!this.hasNonNullValue()) { initializeStats(value, value); } else { @@ -99,6 
+105,20 @@ public boolean isSmallerThan(long size) { } public void updateStats(float min_value, float max_value) { + if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (!Float.isNaN(min_value)) { + if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) { + min = min_value; + } + } + if (!Float.isNaN(max_value)) { + if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) { + max = max_value; + } + } + return; + } + if (comparator().compare(min, min_value) > 0) { min = min_value; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index bee9877738..fd71447f2d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -19,8 +19,10 @@ package org.apache.parquet.column.statistics; import java.util.Arrays; +import org.apache.parquet.Preconditions; import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.Float16; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveComparator; @@ -40,10 +42,11 @@ public abstract class Statistics> { * Builder class to build Statistics objects. Used to read the statistics from the Parquet file. 
*/ public static class Builder { - private final PrimitiveType type; + protected final PrimitiveType type; private byte[] min; private byte[] max; private long numNulls = -1; + private long nanCount = -1; private Builder(PrimitiveType type) { this.type = type; @@ -64,12 +67,21 @@ public Builder withNumNulls(long numNulls) { return this; } + public Builder withNanCount(long nanCount) { + this.nanCount = nanCount; + return this; + } + public Statistics build() { Statistics stats = createStats(type); if (min != null && max != null) { stats.setMinMaxFromBytes(min, max); } stats.num_nulls = this.numNulls; + stats.nan_count = this.nanCount; + Preconditions.checkState( + !type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()) || stats.nan_count >= 0, + "nan_count is required by IEEE 754 column order with type " + type); return stats; } } @@ -87,19 +99,24 @@ public Statistics build() { if (stats.hasNonNullValue()) { Float min = stats.genericGetMin(); Float max = stats.genericGetMax(); - // Drop min/max values in case of NaN as the sorting order of values is undefined for this case if (min.isNaN() || max.isNaN()) { - stats.setMinMax(0.0f, 0.0f); - ((Statistics) stats).hasNonNullValue = false; - } else { - // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped - if (Float.compare(min, 0.0f) == 0) { - min = -0.0f; - stats.setMinMax(min, max); + if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined + stats.setMinMax(0.0f, 0.0f); + ((Statistics) stats).hasNonNullValue = false; } - if (Float.compare(max, -0.0f) == 0) { - max = 0.0f; - stats.setMinMax(min, max); + } else { + // For TYPE_DEFINED_ORDER, updating min to -0.0 and max to +0.0 to ensure that no 0.0 values + // would be skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined so no expansion needed. 
+ if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (Float.compare(min, 0.0f) == 0) { + min = -0.0f; + stats.setMinMax(min, max); + } + if (Float.compare(max, -0.0f) == 0) { + max = 0.0f; + stats.setMinMax(min, max); + } } } } @@ -120,19 +137,24 @@ public Statistics build() { if (stats.hasNonNullValue()) { Double min = stats.genericGetMin(); Double max = stats.genericGetMax(); - // Drop min/max values in case of NaN as the sorting order of values is undefined for this case if (min.isNaN() || max.isNaN()) { - stats.setMinMax(0.0, 0.0); - ((Statistics) stats).hasNonNullValue = false; - } else { - // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped - if (Double.compare(min, 0.0) == 0) { - min = -0.0; - stats.setMinMax(min, max); + if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined + stats.setMinMax(0.0, 0.0); + ((Statistics) stats).hasNonNullValue = false; } - if (Double.compare(max, -0.0) == 0) { - max = 0.0; - stats.setMinMax(min, max); + } else { + // For TYPE_DEFINED_ORDER, updating min to -0.0 and max to +0.0 to ensure that no 0.0 values + // would be skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined so no expansion needed. 
+ if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (Double.compare(min, 0.0) == 0) { + min = -0.0; + stats.setMinMax(min, max); + } + if (Double.compare(max, -0.0) == 0) { + max = 0.0; + stats.setMinMax(min, max); + } } } } @@ -156,19 +178,24 @@ public Statistics build() { Binary bMax = stats.genericGetMax(); short min = bMin.get2BytesLittleEndian(); short max = bMax.get2BytesLittleEndian(); - // Drop min/max values in case of NaN as the sorting order of values is undefined for this case if (Float16.isNaN(min) || Float16.isNaN(max)) { - stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN); - ((Statistics) stats).hasNonNullValue = false; - } else { - // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped - if (min == (short) 0x0000) { - bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN; - stats.setMinMax(bMin, bMax); + if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined + stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN); + ((Statistics) stats).hasNonNullValue = false; } - if (max == (short) 0x8000) { - bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN; - stats.setMinMax(bMin, bMax); + } else { + // For TYPE_DEFINED_ORDER, updating min to -0.0 and max to +0.0 to ensure that no 0.0 values + // would be skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined so no expansion needed. 
+ if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) { + if (min == (short) 0x0000) { + bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN; + stats.setMinMax(bMin, bMax); + } + if (max == (short) 0x8000) { + bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN; + stats.setMinMax(bMin, bMax); + } } } } @@ -180,6 +207,7 @@ public Statistics build() { private final PrimitiveComparator comparator; private boolean hasNonNullValue; private long num_nulls; + private long nan_count = -1; final PrimitiveStringifier stringifier; Statistics(PrimitiveType type) { @@ -349,7 +377,8 @@ public boolean equals(Object other) { return type.equals(stats.type) && Arrays.equals(stats.getMaxBytes(), this.getMaxBytes()) && Arrays.equals(stats.getMinBytes(), this.getMinBytes()) - && stats.getNumNulls() == this.getNumNulls(); + && stats.getNumNulls() == this.getNumNulls() + && stats.getNanCount() == this.getNanCount(); } /** @@ -382,6 +411,11 @@ public void mergeStatistics(Statistics stats) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } + if (isNanCountSet() && stats.isNanCountSet()) { + incrementNanCount(stats.getNanCount()); + } else { + unsetNanCount(); + } } else { throw StatisticsClassException.create(this, stats); } @@ -533,6 +567,53 @@ public void incrementNumNulls(long increment) { num_nulls += increment; } + /** + * Increments the NaN count by one. If nan_count was not set (-1), initializes it to 1. + */ + public void incrementNanCount() { + if (nan_count < 0) { + nan_count = 1; + } else { + nan_count++; + } + } + + /** + * Increments the NaN count by the parameter value. If nan_count was not set (-1), initializes it to increment. 
+ * + * @param increment value to increment the NaN count by + */ + public void incrementNanCount(long increment) { + if (nan_count < 0) { + nan_count = increment; + } else { + nan_count += increment; + } + } + + /** + * Returns the NaN count + * + * @return NaN count or {@code -1} if the NaN count is not set + */ + public long getNanCount() { + return nan_count; + } + + /** + * @return whether nanCount is set and can be used + */ + public boolean isNanCountSet() { + return nan_count >= 0; + } + + /** + * Unsets the NaN count to -1. + */ + public void unsetNanCount() { + nan_count = -1; + } + /** * Returns the null count * diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java index 24de97d01e..7f40721669 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.parquet.filter2.predicate.Statistics; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.Float16; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveComparator; @@ -32,9 +33,36 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder { private static class BinaryColumnIndex extends ColumnIndexBase { private Binary[] minValues; private Binary[] maxValues; + private final boolean isFloat16; + private final boolean isIeee754TotalOrder; private BinaryColumnIndex(PrimitiveType type) { super(type); + isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation; + isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); + } + + 
@Override + boolean mayHaveNaNPollutedMinMax() { + return isFloat16 && !isIeee754TotalOrder; + } + + @Override + boolean isNaNLiteral(Object value) { + return isFloat16 + && value instanceof Binary + && ((Binary) value).length() == LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES + && Float16.isNaN(((Binary) value).get2BytesLittleEndian()); + } + + @Override + boolean isMinNaN(int arrayIndex) { + return isFloat16 && Float16.isNaN(minValues[arrayIndex].get2BytesLittleEndian()); + } + + @Override + boolean isMaxNaN(int arrayIndex) { + return isFloat16 && Float16.isNaN(maxValues[arrayIndex].get2BytesLittleEndian()); } @Override @@ -80,11 +108,13 @@ int compareValueToMax(int arrayIndex) { } } + private static final Binary FLOAT16_NAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x7e}); private final List minValues = new ArrayList<>(); private final List maxValues = new ArrayList<>(); private final BinaryTruncator truncator; private final int truncateLength; private final boolean isFloat16; + private final boolean isIeee754TotalOrder; private boolean invalid; private static Binary convert(ByteBuffer buffer) { @@ -99,6 +129,7 @@ private static ByteBuffer convert(Binary value) { truncator = BinaryTruncator.getTruncator(type); this.truncateLength = truncateLength; this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation; + this.isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); } @Override @@ -122,17 +153,24 @@ void addMinMax(Object min, Object max) { short sMax = bMax.get2BytesLittleEndian(); if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) { - invalid = true; + if (isIeee754TotalOrder) { + bMin = FLOAT16_NAN; + bMax = FLOAT16_NAN; + } else { + invalid = true; + } } - // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to - // ensure that no 0.0 values are skipped + // For TYPE_DEFINED_ORDER, sorting order is undefined for -0.0 so let min = -0.0 and max = 
+0.0 + // to ensure that no 0.0 values are skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined. // +0.0 is 0x0000, -0.0 is 0x8000 (little endian: 00 00, 00 80) - if (sMin == (short) 0x0000) { - bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN; - } - if (sMax == (short) 0x8000) { - bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN; + if (!isIeee754TotalOrder) { + if (sMin == (short) 0x0000) { + bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN; + } + if (sMax == (short) 0x8000) { + bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN; + } } } } @@ -141,6 +179,13 @@ void addMinMax(Object min, Object max) { maxValues.add(bMax == null ? null : truncator.truncateMax(bMax, truncateLength)); } + @Override + void onNanEncountered() { + if (isFloat16 && !isIeee754TotalOrder) { + invalid = true; + } + } + @Override ColumnIndexBase createColumnIndex(PrimitiveType type) { if (invalid) { diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java index 86099717df..1cf73784c0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java @@ -71,4 +71,12 @@ default List getRepetitionLevelHistogram() { default List getDefinitionLevelHistogram() { throw new UnsupportedOperationException("Definition level histogram is not implemented"); } + + /** + * @return the unmodifiable list of NaN counts for each page, or {@code null} if NaN counts are not available; + * used for converting to the related thrift object + */ + default List getNanCounts() { + return null; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java index e78b2ceae1..c15c6c914e 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java @@ -32,11 +32,13 @@ import it.unimi.dsi.fastutil.longs.LongLists; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.Formatter; import java.util.List; import java.util.PrimitiveIterator; import java.util.Set; import java.util.function.IntPredicate; +import org.apache.parquet.Preconditions; import org.apache.parquet.column.MinMax; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; @@ -56,6 +58,7 @@ import org.apache.parquet.filter2.predicate.Operators.UserDefined; import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.PrimitiveComparator; import org.apache.parquet.schema.PrimitiveStringifier; import org.apache.parquet.schema.PrimitiveType; @@ -102,6 +105,8 @@ int translate(int arrayIndex) { // might be null private long[] nullCounts; // might be null + private long[] nanCounts; + // might be null private long[] repLevelHistogram; // might be null private long[] defLevelHistogram; @@ -133,6 +138,14 @@ public List getNullCounts() { return LongLists.unmodifiable(LongArrayList.wrap(nullCounts)); } + @Override + public List getNanCounts() { + if (nanCounts == null) { + return null; + } + return LongLists.unmodifiable(LongArrayList.wrap(nanCounts)); + } + @Override public List getNullPages() { return BooleanLists.unmodifiable(BooleanArrayList.wrap(nullPages)); @@ -242,6 +255,42 @@ boolean isNullPage(int pageIndex) { return nullPages[pageIndex]; } + private int getArrayIndex(int pageIndex) { + return Arrays.binarySearch(pageIndexes, pageIndex); + } + + // Returns true if this ColumnIndex may have NaN-polluted min/max values that 
cannot be trusted. + // If true, we need to conservatively include pages with NaN in min/max for some predicates. + boolean mayHaveNaNPollutedMinMax() { + return false; + } + + // Returns true if the given predicate value is NaN for this column's type. Override in typed subclasses. + boolean isNaNLiteral(Object value) { + return false; + } + + // Returns true if the min value at arrayIndex is NaN. Override in typed subclasses. + boolean isMinNaN(int arrayIndex) { + return false; + } + + // Returns true if the max value at arrayIndex is NaN. Override in typed subclasses. + boolean isMaxNaN(int arrayIndex) { + return false; + } + + // Returns true if the min or max value at arrayIndex is NaN. + boolean hasNaNMinMax(int arrayIndex) { + return isMinNaN(arrayIndex) || isMaxNaN(arrayIndex); + } + + // Returns true if this page is a confirmed all-NaN page: + // min==max==NaN and nanCounts confirms at least one NaN value. + private boolean isAllNaNs(int arrayIndex, int pageIndex) { + return nanCounts != null && nanCounts[pageIndex] > 0 && isMinNaN(arrayIndex) && isMaxNaN(arrayIndex); + } + /* * Returns the min value for arrayIndex as a ByteBuffer. (Min values are not stored for null-pages so arrayIndex * might not equal to pageIndex.) @@ -273,6 +322,19 @@ abstract > org.apache.parquet.filter2.predicate.Statisti /* Creates a ValueComparator object containing the specified value to be compared for min/max values */ abstract ValueComparator createValueComparator(Object value); + // Wraps an iterator result to conservatively include pages whose min or max is NaN. 
+ private PrimitiveIterator.OfInt includeNaNMinMaxPages(PrimitiveIterator.OfInt result) { + IntSet pages = new IntOpenHashSet(); + result.forEachRemaining((int pageIndex) -> pages.add(pageIndex)); + // Also include non-null pages where min or max is NaN + for (int i = 0; i < pageIndexes.length; i++) { + if (hasNaNMinMax(i)) { + pages.add(pageIndexes[i]); + } + } + return IndexIterator.filter(getPageCount(), pages::contains); + } + @Override public PrimitiveIterator.OfInt visit(And and) { throw new UnsupportedOperationException("AND shall not be used on column index directly"); @@ -299,27 +361,63 @@ public > PrimitiveIterator.OfInt visit(Eq eq) { return IndexIterator.filter(getPageCount(), pageIndex -> nullCounts[pageIndex] > 0); } } + if (isNaNLiteral(value)) { + return nanCounts == null + ? IndexIterator.all(getPageCount()) + : IndexIterator.filter(getPageCount(), pageIndex -> nanCounts[pageIndex] > 0); + } + if (mayHaveNaNPollutedMinMax()) { + return includeNaNMinMaxPages(getBoundaryOrder().eq(createValueComparator(value))); + } return getBoundaryOrder().eq(createValueComparator(value)); } @Override public > PrimitiveIterator.OfInt visit(Gt gt) { - return getBoundaryOrder().gt(createValueComparator(gt.getValue())); + T value = gt.getValue(); + if (isNaNLiteral(value)) { + return IndexIterator.all(getPageCount()); + } + if (mayHaveNaNPollutedMinMax()) { + return includeNaNMinMaxPages(getBoundaryOrder().gt(createValueComparator(value))); + } + return getBoundaryOrder().gt(createValueComparator(value)); } @Override public > PrimitiveIterator.OfInt visit(GtEq gtEq) { - return getBoundaryOrder().gtEq(createValueComparator(gtEq.getValue())); + T value = gtEq.getValue(); + if (isNaNLiteral(value)) { + return IndexIterator.all(getPageCount()); + } + if (mayHaveNaNPollutedMinMax()) { + return includeNaNMinMaxPages(getBoundaryOrder().gtEq(createValueComparator(value))); + } + return getBoundaryOrder().gtEq(createValueComparator(value)); } @Override public > 
PrimitiveIterator.OfInt visit(Lt lt) { - return getBoundaryOrder().lt(createValueComparator(lt.getValue())); + T value = lt.getValue(); + if (isNaNLiteral(value)) { + return IndexIterator.all(getPageCount()); + } + if (mayHaveNaNPollutedMinMax()) { + return includeNaNMinMaxPages(getBoundaryOrder().lt(createValueComparator(value))); + } + return getBoundaryOrder().lt(createValueComparator(value)); } @Override public > PrimitiveIterator.OfInt visit(LtEq ltEq) { - return getBoundaryOrder().ltEq(createValueComparator(ltEq.getValue())); + T value = ltEq.getValue(); + if (isNaNLiteral(value)) { + return IndexIterator.all(getPageCount()); + } + if (mayHaveNaNPollutedMinMax()) { + return includeNaNMinMaxPages(getBoundaryOrder().ltEq(createValueComparator(value))); + } + return getBoundaryOrder().ltEq(createValueComparator(value)); } @Override @@ -329,6 +427,12 @@ public > PrimitiveIterator.OfInt visit(NotEq notEq) { return IndexIterator.filter(getPageCount(), pageIndex -> !nullPages[pageIndex]); } + if (isNaNLiteral(value)) { + return nanCounts == null + ? 
IndexIterator.all(getPageCount()) + : IndexIterator.filter(getPageCount(), pageIndex -> nanCounts[pageIndex] == 0); + } + if (nullCounts == null) { // Nulls match so if we don't have null related statistics we have to return all pages return IndexIterator.all(getPageCount()); @@ -336,9 +440,14 @@ public > PrimitiveIterator.OfInt visit(NotEq notEq) { // Merging value filtering with pages containing nulls IntSet matchingIndexes = new IntOpenHashSet(); - getBoundaryOrder() - .notEq(createValueComparator(value)) - .forEachRemaining((int index) -> matchingIndexes.add(index)); + if (mayHaveNaNPollutedMinMax()) { + includeNaNMinMaxPages(getBoundaryOrder().notEq(createValueComparator(value))) + .forEachRemaining((int index) -> matchingIndexes.add(index)); + } else { + getBoundaryOrder() + .notEq(createValueComparator(value)) + .forEachRemaining((int index) -> matchingIndexes.add(index)); + } return IndexIterator.filter( getPageCount(), pageIndex -> nullCounts[pageIndex] > 0 || matchingIndexes.contains(pageIndex)); } @@ -362,6 +471,19 @@ public > PrimitiveIterator.OfInt visit(In in) { return IndexIterator.filter(getPageCount(), matchingIndexesForNull::contains); } } + } else if (isNaNLiteral(value)) { + if (nanCounts == null || values.size() != 1) { + // We don't know if NaN exists in any page, or we can't rely on ltEq/gtEq on NaN, return all + // pages. 
+ return IndexIterator.all(getPageCount()); + } + IntSet matchingIndexesForNaN = new IntOpenHashSet(); + for (int i = 0; i < nanCounts.length; i++) { + if (nanCounts[i] > 0) { + matchingIndexesForNaN.add(i); + } + } + return IndexIterator.filter(getPageCount(), matchingIndexesForNaN::contains); } } @@ -517,6 +639,7 @@ public long getMinMaxSize() { private PrimitiveType type; private final BooleanList nullPages = new BooleanArrayList(); private final LongList nullCounts = new LongArrayList(); + private LongList nanCounts = new LongArrayList(); private final IntList pageIndexes = new IntArrayList(); private int nextPageIndex; private LongList repLevelHistogram = new LongArrayList(); @@ -550,9 +673,9 @@ private static ColumnIndexBuilder createNewBuilder(PrimitiveType type, int trunc case BOOLEAN: return new BooleanColumnIndexBuilder(); case DOUBLE: - return new DoubleColumnIndexBuilder(); + return new DoubleColumnIndexBuilder(type); case FLOAT: - return new FloatColumnIndexBuilder(); + return new FloatColumnIndexBuilder(type); case INT32: return new IntColumnIndexBuilder(); case INT64: @@ -611,10 +734,53 @@ public static ColumnIndex build( List maxValues, List repLevelHistogram, List defLevelHistogram) { + return build( + type, + boundaryOrder, + nullPages, + nullCounts, + null, + minValues, + maxValues, + repLevelHistogram, + defLevelHistogram); + } - ColumnIndexBuilder builder = createNewBuilder(type, Integer.MAX_VALUE); + /** + * @param type + * the primitive type + * @param boundaryOrder + * the boundary order of the min/max values + * @param nullPages + * the null pages (one boolean value for each page that signifies whether the page consists of nulls + * entirely) + * @param nullCounts + * the number of null values for each page + * @param nanCounts + * the number of NaN values for each page (may be null) + * @param minValues + * the min values for each page + * @param maxValues + * the max values for each page + * @param repLevelHistogram + * the repetition 
level histogram for all levels of each page + * @param defLevelHistogram + * the definition level histogram for all levels of each page + * @return the newly created {@link ColumnIndex} object based on the specified arguments + */ + public static ColumnIndex build( + PrimitiveType type, + BoundaryOrder boundaryOrder, + List nullPages, + List nullCounts, + List nanCounts, + List minValues, + List maxValues, + List repLevelHistogram, + List defLevelHistogram) { - builder.fill(nullPages, nullCounts, minValues, maxValues, repLevelHistogram, defLevelHistogram); + ColumnIndexBuilder builder = createNewBuilder(type, Integer.MAX_VALUE); + builder.fill(nullPages, nullCounts, nanCounts, minValues, maxValues, repLevelHistogram, defLevelHistogram); ColumnIndexBase columnIndex = builder.build(type); columnIndex.boundaryOrder = requireNonNull(boundaryOrder); return columnIndex; @@ -653,6 +819,15 @@ public void add(Statistics stats, SizeStatistics sizeStats) { } nullCounts.add(stats.getNumNulls()); + if (nanCounts != null && stats.isNanCountSet()) { + nanCounts.add(stats.getNanCount()); + if (stats.getNanCount() > 0) { + onNanEncountered(); + } + } else { + nanCounts = null; + } + // Collect repetition and definition level histograms only when all pages are valid. if (sizeStats != null && sizeStats.isValid() && repLevelHistogram != null && defLevelHistogram != null) { repLevelHistogram.addAll(sizeStats.getRepetitionLevelHistogram()); @@ -669,9 +844,18 @@ public void add(Statistics stats, SizeStatistics sizeStats) { abstract void addMinMax(Object min, Object max); + /** + * Called when a page with NaN values is encountered (nan_count > 0). + * Subclasses should override to handle NaN presence (e.g., invalidate for TYPE_DEFINED_ORDER). 
+ */ + void onNanEncountered() { + throw new UnsupportedOperationException("Cannot call onNanEncountered on type: " + type); + } + private void fill( List nullPages, List nullCounts, + List nanCounts, List minValues, List maxValues, List repLevelHistogram, @@ -679,12 +863,14 @@ private void fill( clear(); int pageCount = nullPages.size(); if ((nullCounts != null && nullCounts.size() != pageCount) + || (nanCounts != null && nanCounts.size() != pageCount) || minValues.size() != pageCount || maxValues.size() != pageCount) { throw new IllegalArgumentException(String.format( - "Not all sizes are equal (nullPages:%d, nullCounts:%s, minValues:%d, maxValues:%d", + "Not all sizes are equal (nullPages:%d, nullCounts:%s, nanCounts:%s, minValues:%d, maxValues:%d", nullPages.size(), nullCounts == null ? "null" : nullCounts.size(), + nanCounts == null ? "null" : nanCounts.size(), minValues.size(), maxValues.size())); } @@ -705,6 +891,10 @@ private void fill( if (nullCounts != null) { this.nullCounts.addAll(nullCounts); } + // NaN counts is optional in the format + if (nanCounts != null) { + this.nanCounts.addAll(nanCounts); + } for (int i = 0; i < pageCount; ++i) { if (!nullPages.get(i)) { @@ -750,6 +940,14 @@ private ColumnIndexBase build(PrimitiveType type) { if (!nullCounts.isEmpty()) { columnIndex.nullCounts = nullCounts.toLongArray(); } + // NaN counts is optional so keep it null if the builder has no values + if (nanCounts != null && !nanCounts.isEmpty()) { + columnIndex.nanCounts = nanCounts.toLongArray(); + } else { + Preconditions.checkState( + !type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()), + "NaN counts must be provided for types with IEEE_754_TOTAL_ORDER column order"); + } columnIndex.pageIndexes = pageIndexes.toIntArray(); // Repetition and definition level histograms are optional so keep them null if the builder has no values if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) { @@ -802,8 +1000,21 @@ private void clear() { clearMinMax(); 
nextPageIndex = 0; pageIndexes.clear(); - repLevelHistogram.clear(); - defLevelHistogram.clear(); + if (nanCounts != null) { + nanCounts.clear(); + } else { + nanCounts = new LongArrayList(); + } + if (repLevelHistogram == null) { + repLevelHistogram = new LongArrayList(); + } else { + repLevelHistogram.clear(); + } + if (defLevelHistogram == null) { + defLevelHistogram = new LongArrayList(); + } else { + defLevelHistogram.clear(); + } } abstract void clearMinMax(); diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java index 5d5d54aa76..677122075d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java @@ -25,6 +25,7 @@ import java.nio.ByteBuffer; import org.apache.parquet.filter2.predicate.Statistics; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.PrimitiveComparator; import org.apache.parquet.schema.PrimitiveType; @@ -32,9 +33,31 @@ class DoubleColumnIndexBuilder extends ColumnIndexBuilder { private static class DoubleColumnIndex extends ColumnIndexBase { private double[] minValues; private double[] maxValues; + private final boolean isIeee754TotalOrder; private DoubleColumnIndex(PrimitiveType type) { super(type); + isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); + } + + @Override + boolean mayHaveNaNPollutedMinMax() { + return !isIeee754TotalOrder; + } + + @Override + boolean isNaNLiteral(Object value) { + return Double.isNaN((double) value); + } + + @Override + boolean isMinNaN(int arrayIndex) { + return Double.isNaN(minValues[arrayIndex]); + } + + @Override + boolean isMaxNaN(int arrayIndex) { + return 
Double.isNaN(maxValues[arrayIndex]); } @Override @@ -83,6 +106,11 @@ int compareValueToMax(int arrayIndex) { private final DoubleList minValues = new DoubleArrayList(); private final DoubleList maxValues = new DoubleArrayList(); private boolean invalid; + private final boolean isIeee754TotalOrder; + + DoubleColumnIndexBuilder(PrimitiveType type) { + this.isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); + } private static double convert(ByteBuffer buffer) { return buffer.order(LITTLE_ENDIAN).getDouble(0); @@ -103,22 +131,36 @@ void addMinMax(Object min, Object max) { double dMin = (double) min; double dMax = (double) max; if (Double.isNaN(dMin) || Double.isNaN(dMax)) { - // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case - invalid = true; + if (isIeee754TotalOrder) { + dMin = Double.NaN; + dMax = Double.NaN; + } else { + invalid = true; + } } - // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped - if (Double.compare(dMin, +0.0) == 0) { - dMin = -0.0; - } - if (Double.compare(dMax, -0.0) == 0) { - dMax = +0.0; + // For TYPE_DEFINED_ORDER, sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 + // to ensure that no 0.0 values are skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined. 
+ if (!isIeee754TotalOrder) { + if (Double.compare(dMin, +0.0) == 0) { + dMin = -0.0; + } + if (Double.compare(dMax, -0.0) == 0) { + dMax = +0.0; + } } minValues.add(dMin); maxValues.add(dMax); } + @Override + void onNanEncountered() { + if (!isIeee754TotalOrder) { + invalid = true; + } + } + @Override ColumnIndexBase createColumnIndex(PrimitiveType type) { if (invalid) { diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java index be66f85d15..125f4877a5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java @@ -25,6 +25,7 @@ import java.nio.ByteBuffer; import org.apache.parquet.filter2.predicate.Statistics; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.PrimitiveComparator; import org.apache.parquet.schema.PrimitiveType; @@ -32,9 +33,31 @@ class FloatColumnIndexBuilder extends ColumnIndexBuilder { private static class FloatColumnIndex extends ColumnIndexBase { private float[] minValues; private float[] maxValues; + private final boolean isIeee754TotalOrder; private FloatColumnIndex(PrimitiveType type) { super(type); + isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); + } + + @Override + boolean mayHaveNaNPollutedMinMax() { + return !isIeee754TotalOrder; + } + + @Override + boolean isNaNLiteral(Object value) { + return Float.isNaN((float) value); + } + + @Override + boolean isMinNaN(int arrayIndex) { + return Float.isNaN(minValues[arrayIndex]); + } + + @Override + boolean isMaxNaN(int arrayIndex) { + return Float.isNaN(maxValues[arrayIndex]); } @Override @@ -83,6 +106,11 @@ int compareValueToMax(int arrayIndex) { private 
final FloatList minValues = new FloatArrayList(); private final FloatList maxValues = new FloatArrayList(); private boolean invalid; + private final boolean isIeee754TotalOrder; + + FloatColumnIndexBuilder(PrimitiveType type) { + this.isIeee754TotalOrder = type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()); + } private static float convert(ByteBuffer buffer) { return buffer.order(LITTLE_ENDIAN).getFloat(0); @@ -103,22 +131,36 @@ void addMinMax(Object min, Object max) { float fMin = (float) min; float fMax = (float) max; if (Float.isNaN(fMin) || Float.isNaN(fMax)) { - // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case - invalid = true; + if (isIeee754TotalOrder) { + fMin = Float.NaN; + fMax = Float.NaN; + } else { + invalid = true; + } } - // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped - if (Float.compare(fMin, +0.0f) == 0) { - fMin = -0.0f; - } - if (Float.compare(fMax, -0.0f) == 0) { - fMax = +0.0f; + // For TYPE_DEFINED_ORDER, sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 + // to ensure that no 0.0 values are skipped. For IEEE_754_TOTAL_ORDER, -0 < +0 is well-defined. 
+ if (!isIeee754TotalOrder) { + if (Float.compare(fMin, +0.0f) == 0) { + fMin = -0.0f; + } + if (Float.compare(fMax, -0.0f) == 0) { + fMax = +0.0f; + } } minValues.add(fMin); maxValues.add(fMax); } + @Override + void onNanEncountered() { + if (!isIeee754TotalOrder) { + invalid = true; + } + } + @Override ColumnIndexBase createColumnIndex(PrimitiveType type) { if (invalid) { diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java index 94a1275569..35ef0ec9d2 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java @@ -36,11 +36,16 @@ public enum ColumnOrderName { /** * Type defined order meaning that the comparison order of the elements are based on its type. */ - TYPE_DEFINED_ORDER + TYPE_DEFINED_ORDER, + /** + * The column order is defined by the IEEE 754 standard. + */ + IEEE_754_TOTAL_ORDER, } private static final ColumnOrder UNDEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.UNDEFINED); private static final ColumnOrder TYPE_DEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.TYPE_DEFINED_ORDER); + private static final ColumnOrder IEEE_754_TOTAL_ORDER = new ColumnOrder(ColumnOrderName.IEEE_754_TOTAL_ORDER); /** * @return a {@link ColumnOrder} instance representing an undefined order @@ -58,6 +63,14 @@ public static ColumnOrder typeDefined() { return TYPE_DEFINED_COLUMN_ORDER; } + /** + * @return a {@link ColumnOrder} instance representing an IEEE 754 total order + * @see ColumnOrderName#IEEE_754_TOTAL_ORDER + */ + public static ColumnOrder ieee754TotalOrder() { + return IEEE_754_TOTAL_ORDER; + } + private final ColumnOrderName columnOrderName; private ColumnOrder(ColumnOrderName columnOrderName) { diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java 
b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 98bc5c0237..625e9fd9d3 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -19,6 +19,7 @@ package org.apache.parquet.schema; import static java.util.Optional.empty; +import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.IEEE_754_TOTAL_ORDER; import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.TYPE_DEFINED_ORDER; import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.UNDEFINED; import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MICROS_STRINGIFIER; @@ -1047,6 +1048,13 @@ LogicalTypeToken getType() { PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { return PrimitiveStringifier.FLOAT16_STRINGIFIER; } + + @Override + boolean isValidColumnOrder(ColumnOrder columnOrder) { + return columnOrder.getColumnOrderName() == UNDEFINED + || columnOrder.getColumnOrderName() == TYPE_DEFINED_ORDER + || columnOrder.getColumnOrderName() == IEEE_754_TOTAL_ORDER; + } } public static class UnknownLogicalTypeAnnotation extends LogicalTypeAnnotation { diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 50c4acd4c9..9d22d25312 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -293,4 +293,65 @@ public String toString() { return "BINARY_AS_FLOAT16_COMPARATOR"; } }; + + static final PrimitiveComparator FLOAT_IEEE_754_TOTAL_ORDER_COMPARATOR = new PrimitiveComparator() { + @Override + int compareNotNulls(Float o1, Float o2) { + return compare(o1.floatValue(), o2.floatValue()); + } + + @Override + public int compare(float f1, float f2) { + int 
f1Int = Float.floatToRawIntBits(f1); + int f2Int = Float.floatToRawIntBits(f2); + f1Int ^= ((f1Int >> 31) >>> 1); + f2Int ^= ((f2Int >> 31) >>> 1); + return Integer.compare(f1Int, f2Int); + } + + @Override + public String toString() { + return "FLOAT_IEEE_754_TOTAL_ORDER_COMPARATOR"; + } + }; + + static final PrimitiveComparator DOUBLE_IEEE_754_TOTAL_ORDER_COMPARATOR = + new PrimitiveComparator() { + @Override + int compareNotNulls(Double o1, Double o2) { + return compare(o1.doubleValue(), o2.doubleValue()); + } + + @Override + public int compare(double d1, double d2) { + long d1Long = Double.doubleToRawLongBits(d1); + long d2Long = Double.doubleToRawLongBits(d2); + d1Long ^= ((d1Long >> 63) >>> 1); + d2Long ^= ((d2Long >> 63) >>> 1); + return Long.compare(d1Long, d2Long); + } + + @Override + public String toString() { + return "DOUBLE_IEEE_754_TOTAL_ORDER_COMPARATOR"; + } + }; + + static final PrimitiveComparator BINARY_AS_FLOAT16_IEEE_754_TOTAL_ORDER_COMPARATOR = + new BinaryComparator() { + + @Override + int compareBinary(Binary b1, Binary b2) { + int b1Short = b1.get2BytesLittleEndian(); + int b2Short = b2.get2BytesLittleEndian(); + b1Short ^= ((b1Short >> 15) >>> 1); + b2Short ^= ((b2Short >> 15) >>> 1); + return Integer.compare(b1Short, b2Short); + } + + @Override + public String toString() { + return "BINARY_AS_FLOAT16_IEEE_754_TOTAL_ORDER_COMPARATOR"; + } + }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 944cfb58eb..184e22548e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -88,7 +88,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { if 
(logicalType == null) { return PrimitiveComparator.SIGNED_INT64_COMPARATOR; } @@ -152,7 +152,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { if (logicalType == null) { return PrimitiveComparator.SIGNED_INT32_COMPARATOR; } @@ -222,7 +222,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { return PrimitiveComparator.BOOLEAN_COMPARATOR; } }, @@ -248,7 +248,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { if (logicalType == null) { return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } @@ -328,8 +328,10 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { - return PrimitiveComparator.FLOAT_COMPARATOR; + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { + return columnOrder != null && columnOrder.getColumnOrderName() == ColumnOrderName.IEEE_754_TOTAL_ORDER + ? 
PrimitiveComparator.FLOAT_IEEE_754_TOTAL_ORDER_COMPARATOR + : PrimitiveComparator.FLOAT_COMPARATOR; } }, DOUBLE("getDouble", Double.TYPE) { @@ -354,8 +356,10 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { - return PrimitiveComparator.DOUBLE_COMPARATOR; + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { + return columnOrder != null && columnOrder.getColumnOrderName() == ColumnOrderName.IEEE_754_TOTAL_ORDER + ? PrimitiveComparator.DOUBLE_IEEE_754_TOTAL_ORDER_COMPARATOR + : PrimitiveComparator.DOUBLE_COMPARATOR; } }, INT96("getBinary", Binary.class) { @@ -380,7 +384,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; } }, @@ -406,7 +410,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder) { if (logicalType == null) { return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } @@ -433,7 +437,11 @@ public Optional visit(UUIDLogicalTypeAnnotation uuidLogical @Override public Optional visit( LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { - return of(PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR); + return (columnOrder != null + && columnOrder.getColumnOrderName() + == ColumnOrderName.IEEE_754_TOTAL_ORDER) + ? 
of(PrimitiveComparator.BINARY_AS_FLOAT16_IEEE_754_TOTAL_ORDER_COMPARATOR) + : of(PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR); } @Override @@ -477,7 +485,11 @@ public abstract void addValueToPrimitiveConverter( public abstract T convert(PrimitiveTypeNameConverter converter) throws E; - abstract PrimitiveComparator comparator(LogicalTypeAnnotation logicalType); + abstract PrimitiveComparator comparator(LogicalTypeAnnotation logicalType, ColumnOrder columnOrder); + + public PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { + return comparator(logicalType, ColumnOrder.typeDefined()); + } } private final PrimitiveTypeName primitive; @@ -569,6 +581,12 @@ public PrimitiveType( columnOrder = primitive == PrimitiveTypeName.INT96 || originalType == OriginalType.INTERVAL ? ColumnOrder.undefined() : ColumnOrder.typeDefined(); + } else if (columnOrder.getColumnOrderName() == ColumnOrderName.IEEE_754_TOTAL_ORDER) { + Preconditions.checkArgument( + primitive == PrimitiveTypeName.FLOAT || primitive == PrimitiveTypeName.DOUBLE, + "The column order %s is not supported by type %s", + columnOrder, + primitive); } this.columnOrder = requireValidColumnOrder(columnOrder); } @@ -615,6 +633,17 @@ public PrimitiveType( || logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation ? 
ColumnOrder.undefined() : ColumnOrder.typeDefined(); + } else if (columnOrder.getColumnOrderName() == ColumnOrderName.IEEE_754_TOTAL_ORDER) { + Preconditions.checkArgument( + primitive == PrimitiveTypeName.FLOAT + || primitive == PrimitiveTypeName.DOUBLE + || (logicalTypeAnnotation != null + && logicalTypeAnnotation.getType() + == LogicalTypeAnnotation.LogicalTypeToken.FLOAT16), + "The column order %s is not supported by type %s logical type %s", + columnOrder, + primitive, + logicalTypeAnnotation); } this.columnOrder = requireValidColumnOrder(columnOrder); } @@ -655,6 +684,15 @@ public PrimitiveType withLogicalTypeAnnotation(LogicalTypeAnnotation logicalType return new PrimitiveType(getRepetition(), primitive, length, getName(), logicalType, getId()); } + /** + * @param columnOrder the column order + * @return the same type with the column order set + */ + public Type withColumnOrder(ColumnOrder columnOrder) { + return new PrimitiveType( + getRepetition(), primitive, length, getName(), getLogicalTypeAnnotation(), getId(), columnOrder); + } + /** * @return the primitive type */ @@ -869,7 +907,7 @@ protected Type union(Type toMerge, boolean strict) { */ @SuppressWarnings("unchecked") public PrimitiveComparator comparator() { - return (PrimitiveComparator) getPrimitiveTypeName().comparator(getLogicalTypeAnnotation()); + return (PrimitiveComparator) getPrimitiveTypeName().comparator(getLogicalTypeAnnotation(), columnOrder()); } /** diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatisticsNanCount.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatisticsNanCount.java new file mode 100644 index 0000000000..e39b984dc5 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatisticsNanCount.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; +import org.apache.parquet.schema.Float16; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +public class TestStatisticsNanCount { + + private static final PrimitiveType FLOAT_TYPE = + Types.optional(PrimitiveTypeName.FLOAT).named("test_float"); + private static final PrimitiveType DOUBLE_TYPE = + Types.optional(PrimitiveTypeName.DOUBLE).named("test_double"); + private static final PrimitiveType FLOAT16_TYPE = Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .named("test_float16"); + + private static final PrimitiveType FLOAT_IEEE754_TYPE = Types.optional(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_float_ieee754"); + private static final PrimitiveType 
DOUBLE_IEEE754_TYPE = Types.optional(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_double_ieee754"); + private static final PrimitiveType FLOAT16_IEEE754_TYPE = Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_float16_ieee754"); + + private static Binary float16Binary(short h) { + return Binary.fromConstantByteArray(new byte[] {(byte) (h & 0xFF), (byte) ((h >> 8) & 0xFF)}); + } + + private static final Binary FLOAT16_NAN = float16Binary((short) 0x7e00); + private static final Binary FLOAT16_ONE = float16Binary((short) 0x3C00); + private static final Binary FLOAT16_TWO = float16Binary((short) 0x4000); + + @Test + public void testFloatNanCountMixedValues() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats.updateStats(1.0f); + stats.updateStats(Float.NaN); + stats.updateStats(2.0f); + stats.updateStats(Float.NaN); + stats.updateStats(3.0f); + + assertTrue(stats.isNanCountSet()); + assertEquals(2, stats.getNanCount()); + assertTrue(Float.isNaN(stats.getMax()) || Float.isNaN(stats.getMin())); + } + + @Test + public void testFloatNanCountAllNaN() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats.updateStats(Float.NaN); + stats.updateStats(Float.NaN); + + assertTrue(stats.isNanCountSet()); + assertEquals(2, stats.getNanCount()); + assertTrue(stats.hasNonNullValue()); + } + + @Test + public void testFloatNanCountNoNaN() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats.updateStats(1.0f); + stats.updateStats(2.0f); + + assertTrue(stats.isNanCountSet()); + assertEquals(0, stats.getNanCount()); + } + + @Test + public void testDoubleNanCountMixedValues() { + DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(DOUBLE_TYPE); + stats.updateStats(1.0); + 
stats.updateStats(Double.NaN); + stats.updateStats(2.0); + + assertTrue(stats.isNanCountSet()); + assertEquals(1, stats.getNanCount()); + assertTrue(Double.isNaN(stats.getMax()) || Double.isNaN(stats.getMin())); + } + + @Test + public void testFloat16NanCountMixedValues() { + BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(FLOAT16_TYPE); + stats.updateStats(FLOAT16_ONE); + stats.updateStats(FLOAT16_NAN); + stats.updateStats(FLOAT16_TWO); + + assertTrue(stats.isNanCountSet()); + assertEquals(1, stats.getNanCount()); + assertTrue(stats.hasNonNullValue()); + assertTrue(Float16.isNaN(stats.genericGetMin().get2BytesLittleEndian()) + || Float16.isNaN(stats.genericGetMax().get2BytesLittleEndian())); + } + + @Test + public void testFloat16NanCountNoNaN() { + BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(FLOAT16_TYPE); + stats.updateStats(FLOAT16_ONE); + + assertTrue(stats.isNanCountSet()); + assertEquals(0, stats.getNanCount()); + } + + @Test + public void testMergeNanCounts() { + FloatStatistics stats1 = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats1.updateStats(1.0f); + stats1.updateStats(Float.NaN); + + FloatStatistics stats2 = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats2.updateStats(2.0f); + stats2.updateStats(Float.NaN); + stats2.updateStats(Float.NaN); + + stats1.mergeStatistics(stats2); + assertEquals(3, stats1.getNanCount()); + } + + @Test + public void testCopyPreservesNanCount() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats.updateStats(1.0f); + stats.updateStats(Float.NaN); + stats.updateStats(Float.NaN); + + FloatStatistics copy = stats.copy(); + assertEquals(stats.getNanCount(), copy.getNanCount()); + assertTrue(copy.isNanCountSet()); + assertEquals(2, copy.getNanCount()); + } + + @Test + public void testFloatBuilderIEEE754KeepsNanMinMax() { + Statistics.Builder builder = Statistics.getBuilderForReading(FLOAT_IEEE754_TYPE); + byte[] nanBytes = 
BytesUtils.intToBytes(Float.floatToIntBits(Float.NaN)); + Statistics stats = builder.withMin(nanBytes) + .withMax(nanBytes) + .withNanCount(10) + .withNumNulls(0) + .build(); + + assertTrue(stats.hasNonNullValue()); + assertTrue(Float.isNaN(((FloatStatistics) stats).getMin())); + assertTrue(Float.isNaN(((FloatStatistics) stats).getMax())); + assertEquals(10, stats.getNanCount()); + } + + @Test + public void testFloatBuilderTypeDefinedDropsNanMinMax() { + Statistics.Builder builder = Statistics.getBuilderForReading(FLOAT_TYPE); + byte[] nanBytes = BytesUtils.intToBytes(Float.floatToIntBits(Float.NaN)); + Statistics stats = + builder.withMin(nanBytes).withMax(nanBytes).withNumNulls(0).build(); + + assertFalse(stats.hasNonNullValue()); + assertFalse(Float.isNaN(((FloatStatistics) stats).getMin())); + assertFalse(Float.isNaN(((FloatStatistics) stats).getMax())); + } + + @Test + public void testDoubleBuilderIEEE754KeepsNanMinMax() { + Statistics.Builder builder = Statistics.getBuilderForReading(DOUBLE_IEEE754_TYPE); + byte[] nanBytes = BytesUtils.longToBytes(Double.doubleToLongBits(Double.NaN)); + Statistics stats = builder.withMin(nanBytes) + .withMax(nanBytes) + .withNanCount(10) + .withNumNulls(0) + .build(); + + assertTrue(stats.hasNonNullValue()); + assertTrue(Double.isNaN(((DoubleStatistics) stats).getMin())); + assertTrue(Double.isNaN(((DoubleStatistics) stats).getMax())); + assertEquals(10, stats.getNanCount()); + } + + @Test + public void testFloatIEEE754NanOnlySetHasNonNullValue() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_IEEE754_TYPE); + stats.updateStats(Float.NaN); + stats.updateStats(Float.NaN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(2, stats.getNanCount()); + assertTrue(Float.isNaN(stats.getMin())); + assertTrue(Float.isNaN(stats.getMax())); + } + + @Test + public void testDoubleIEEE754NanOnlySetHasNonNullValue() { + DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(DOUBLE_IEEE754_TYPE); + 
stats.updateStats(Double.NaN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(1, stats.getNanCount()); + assertTrue(Double.isNaN(stats.getMin())); + assertTrue(Double.isNaN(stats.getMax())); + } + + @Test + public void testFloat16IEEE754NanOnlySetHasNonNullValue() { + BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(FLOAT16_IEEE754_TYPE); + stats.updateStats(FLOAT16_NAN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(1, stats.getNanCount()); + assertTrue(Float16.isNaN(stats.genericGetMin().get2BytesLittleEndian())); + assertTrue(Float16.isNaN(stats.genericGetMax().get2BytesLittleEndian())); + } + + @Test + public void testFloatIEEE754NanExcludedFromMax() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_IEEE754_TYPE); + stats.updateStats(1.0f); + stats.updateStats(Float.NaN); + stats.updateStats(2.0f); + + // NaN is excluded from min/max on the write path for all column orders + assertEquals(2.0f, stats.getMax(), 0.0f); + assertEquals(1.0f, stats.getMin(), 0.0f); + assertEquals(1, stats.getNanCount()); + } + + @Test + public void testDoubleIEEE754NanExcludedFromMax() { + DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(DOUBLE_IEEE754_TYPE); + stats.updateStats(1.0); + stats.updateStats(Double.NaN); + stats.updateStats(2.0); + + // NaN is excluded from min/max on the write path for all column orders + assertEquals(2.0, stats.getMax(), 0.0); + assertEquals(1.0, stats.getMin(), 0.0); + assertEquals(1, stats.getNanCount()); + } + + @Test + public void testFloatTypeDefinedNanOnlySetHasNonNullValue() { + FloatStatistics stats = (FloatStatistics) Statistics.createStats(FLOAT_TYPE); + stats.updateStats(Float.NaN); + stats.updateStats(Float.NaN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(2, stats.getNanCount()); + } + + @Test + public void testDoubleTypeDefinedNanOnlySetHasNonNullValue() { + DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(DOUBLE_TYPE); + 
stats.updateStats(Double.NaN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(1, stats.getNanCount()); + } + + @Test + public void testFloat16TypeDefinedNanOnlySetHasNonNullValue() { + BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(FLOAT16_TYPE); + stats.updateStats(FLOAT16_NAN); + + assertTrue(stats.hasNonNullValue()); + assertEquals(1, stats.getNanCount()); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java new file mode 100644 index 0000000000..8f8bfdf448 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.column.columnindex; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.PrimitiveIterator; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.filter2.predicate.FilterApi; +import org.apache.parquet.filter2.predicate.Operators.BinaryColumn; +import org.apache.parquet.filter2.predicate.Operators.DoubleColumn; +import org.apache.parquet.filter2.predicate.Operators.FloatColumn; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +/** + * Tests for ColumnIndex NaN handling under IEEE_754_TOTAL_ORDER and TYPE_DEFINED_ORDER. 
+ */ +public class TestColumnIndexBuilderNaN { + + private static final PrimitiveType FLOAT_TYPE = + Types.required(PrimitiveTypeName.FLOAT).named("test_float"); + private static final PrimitiveType FLOAT_IEEE754_TYPE = Types.required(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_float_ieee754"); + private static final PrimitiveType DOUBLE_TYPE = + Types.required(PrimitiveTypeName.DOUBLE).named("test_double"); + private static final PrimitiveType DOUBLE_IEEE754_TYPE = Types.required(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_double_ieee754"); + private static final PrimitiveType FLOAT16_TYPE = Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .named("test_float16"); + private static final PrimitiveType FLOAT16_IEEE754_TYPE = Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_float16_ieee754"); + + private static final FloatColumn FLOAT_COL = FilterApi.floatColumn("test_float_ieee754"); + private static final DoubleColumn DOUBLE_COL = FilterApi.doubleColumn("test_double_ieee754"); + private static final BinaryColumn FLOAT16_COL = FilterApi.binaryColumn("test_float16_ieee754"); + + private static Binary float16Binary(short h) { + return Binary.fromConstantByteArray(new byte[] {(byte) (h & 0xFF), (byte) ((h >> 8) & 0xFF)}); + } + + private static final Binary FLOAT16_NAN = float16Binary((short) 0x7e00); + private static final Binary FLOAT16_ONE = float16Binary((short) 0x3C00); // 1.0 + private static final Binary FLOAT16_TWO = float16Binary((short) 0x4000); // 2.0 + private static final Binary FLOAT16_THREE = float16Binary((short) 0x4200); // 3.0 + private static final Binary FLOAT16_FOUR = float16Binary((short) 0x4400); // 4.0 + + private static Statistics floatStats(PrimitiveType type, float... 
values) { + Statistics stats = Statistics.createStats(type); + for (float value : values) { + stats.updateStats(value); + } + return stats; + } + + private static Statistics doubleStats(PrimitiveType type, double... values) { + Statistics stats = Statistics.createStats(type); + for (double value : values) { + stats.updateStats(value); + } + return stats; + } + + private static Statistics binaryStats(PrimitiveType type, Binary... values) { + Statistics stats = Statistics.createStats(type); + for (Binary value : values) { + stats.updateStats(value); + } + return stats; + } + + private static List toList(PrimitiveIterator.OfInt iter) { + List result = new ArrayList<>(); + iter.forEachRemaining((int i) -> result.add(i)); + return result; + } + + // TYPE_DEFINED_ORDER: build column index with NaN + + @Test + public void testFloatTypeDefinedOrderNaN() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT_TYPE, Integer.MAX_VALUE); + builder.add(floatStats(FLOAT_TYPE, 1.0f, 2.0f)); + builder.add(floatStats(FLOAT_TYPE, Float.NaN)); + builder.add(floatStats(FLOAT_TYPE, 3.0f, 4.0f)); + assertNull(builder.build()); + } + + @Test + public void testDoubleTypeDefinedOrderNaN() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(DOUBLE_TYPE, Integer.MAX_VALUE); + builder.add(doubleStats(DOUBLE_TYPE, 1.0, 2.0)); + builder.add(doubleStats(DOUBLE_TYPE, Double.NaN)); + builder.add(doubleStats(DOUBLE_TYPE, 3.0, 4.0)); + assertNull(builder.build()); + } + + @Test + public void testFloat16TypeDefinedOrderNaN() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT16_TYPE, Integer.MAX_VALUE); + builder.add(binaryStats(FLOAT16_TYPE, FLOAT16_ONE, FLOAT16_TWO)); + builder.add(binaryStats(FLOAT16_TYPE, FLOAT16_NAN)); + builder.add(binaryStats(FLOAT16_TYPE, FLOAT16_ONE)); + assertNull(builder.build()); + } + + // IEEE_754_TOTAL_ORDER: build column index with NaN + + @Test + public void testFloatIeee754BuildNanCounts() { + ColumnIndexBuilder builder = 
ColumnIndexBuilder.getBuilder(FLOAT_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 1.0f, 2.0f)); + builder.add(floatStats(FLOAT_IEEE754_TYPE, Float.NaN, Float.NaN)); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 3.0f, Float.NaN, 4.0f)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + assertEquals(List.of(0L, 2L, 1L), ci.getNanCounts()); + } + + @Test + public void testDoubleIeee754BuildNanCounts() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(DOUBLE_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 1.0, 2.0)); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, Double.NaN, Double.NaN, Double.NaN)); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 3.0, Double.NaN, 4.0)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + assertEquals(List.of(0L, 3L, 1L), ci.getNanCounts()); + } + + @Test + public void testFloat16Ieee754BuildNanCounts() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT16_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_ONE, FLOAT16_NAN, FLOAT16_TWO)); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_NAN, FLOAT16_NAN)); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_ONE)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + assertEquals(List.of(1L, 2L, 0L), ci.getNanCounts()); + } + + // Column index filtering for float + + @Test + public void testNaNFloatZeroNaN() { + // Pages: [1.0, 2.0], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 1.0f, 2.0f)); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 3.0f, 4.0f)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + // Non-NaN literal (1.5 within page 0 range; ASCENDING boundary order) + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1), 
toList(ci.visit(FilterApi.notEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.lt(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT_COL, new HashSet<>(List.of(1.5f)))))); + + // NaN literal: nanCounts all zero → eq returns empty, notEq returns all + assertEquals(List.of(), toList(ci.visit(FilterApi.eq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.notEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.lt(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.ltEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(), toList(ci.visit(FilterApi.in(FLOAT_COL, new HashSet<>(List.of(Float.NaN)))))); + } + + @Test + public void testNaNFloatMixed() { + // Pages: [1.0, 2.0], [NaN, NaN], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 1.0f, 2.0f)); + builder.add(floatStats(FLOAT_IEEE754_TYPE, Float.NaN, Float.NaN)); + builder.add(floatStats(FLOAT_IEEE754_TYPE, 3.0f, 4.0f)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.lt(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1, 2), 
toList(ci.visit(FilterApi.gt(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(FLOAT_COL, 1.5f)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT_COL, new HashSet<>(List.of(1.5f)))))); + + assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(FLOAT_COL, Float.NaN)))); + assertEquals(List.of(1), toList(ci.visit(FilterApi.in(FLOAT_COL, new HashSet<>(List.of(Float.NaN)))))); + } + + // Column index filtering for double + + @Test + public void testNaNDoubleZeroNaN() { + // Pages: [1.0, 2.0], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(DOUBLE_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 1.0, 2.0)); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 3.0, 4.0)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.notEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.lt(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(DOUBLE_COL, new HashSet<>(List.of(1.5)))))); + + assertEquals(List.of(), toList(ci.visit(FilterApi.eq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1), 
toList(ci.visit(FilterApi.notEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.lt(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.ltEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(), toList(ci.visit(FilterApi.in(DOUBLE_COL, new HashSet<>(List.of(Double.NaN)))))); + } + + @Test + public void testNaNDoubleMixed() { + // Pages: [1.0, 2.0], [NaN, NaN], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(DOUBLE_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 1.0, 2.0)); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, Double.NaN, Double.NaN)); + builder.add(doubleStats(DOUBLE_IEEE754_TYPE, 3.0, 4.0)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.lt(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(DOUBLE_COL, 1.5)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(DOUBLE_COL, new HashSet<>(List.of(1.5)))))); + + assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(DOUBLE_COL, 
Double.NaN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(DOUBLE_COL, Double.NaN)))); + assertEquals(List.of(1), toList(ci.visit(FilterApi.in(DOUBLE_COL, new HashSet<>(List.of(Double.NaN)))))); + } + + // Column index filtering for float16 + + @Test + public void testNaNFloat16ZeroNaN() { + // Pages: [1.0, 2.0], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT16_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_ONE, FLOAT16_TWO)); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_THREE, FLOAT16_FOUR)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(), toList(ci.visit(FilterApi.lt(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT16_COL, new HashSet<>(List.of(FLOAT16_ONE)))))); + + assertEquals(List.of(), toList(ci.visit(FilterApi.eq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.lt(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.ltEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gt(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1), toList(ci.visit(FilterApi.gtEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(), toList(ci.visit(FilterApi.in(FLOAT16_COL, new HashSet<>(List.of(FLOAT16_NAN)))))); + } + + @Test + public void 
testNaNFloat16Mixed() { + // Pages: [1.0, 2.0], [NaN, NaN], [3.0, 4.0] + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(FLOAT16_IEEE754_TYPE, Integer.MAX_VALUE); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_ONE, FLOAT16_TWO)); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_NAN, FLOAT16_NAN)); + builder.add(binaryStats(FLOAT16_IEEE754_TYPE, FLOAT16_THREE, FLOAT16_FOUR)); + ColumnIndex ci = builder.build(); + assertNotNull(ci); + + assertEquals(List.of(0), toList(ci.visit(FilterApi.eq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(), toList(ci.visit(FilterApi.lt(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.ltEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(FLOAT16_COL, FLOAT16_ONE)))); + assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT16_COL, new HashSet<>(List.of(FLOAT16_ONE)))))); + + assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gtEq(FLOAT16_COL, FLOAT16_NAN)))); + assertEquals(List.of(1), toList(ci.visit(FilterApi.in(FLOAT16_COL, new HashSet<>(List.of(FLOAT16_NAN)))))); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 8fb53aca0f..ec7425141e 
100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -19,10 +19,13 @@ package org.apache.parquet.schema; import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_FLOAT16_IEEE_754_TOTAL_ORDER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_IEEE_754_TOTAL_ORDER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_IEEE_754_TOTAL_ORDER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT32_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT64_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; @@ -167,6 +170,22 @@ private void testInt64Comparator(PrimitiveComparator comparator, Long... v checkThrowingUnsupportedException(comparator, Long.TYPE); } + private void testFloatComparator(PrimitiveComparator comparator, Float... 
valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Float vi = valuesInAscendingOrder[i]; + Float vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, comparator.compare(vi.floatValue(), vj.floatValue())); + } + } + } + + checkThrowingUnsupportedException(comparator, Float.TYPE); + } + @Test public void testFloatComparator() { Float[] valuesInAscendingOrder = { @@ -182,19 +201,46 @@ public void testFloatComparator() { Float.POSITIVE_INFINITY }; + testFloatComparator(FLOAT_COMPARATOR, valuesInAscendingOrder); + } + + @Test + public void testFloatIEEE754TotalOrderComparator() { + Float[] valuesInAscendingOrder = { + null, + Float.intBitsToFloat(0xFFFFFFFF), // -NaN (smallest) + Float.intBitsToFloat(0xFFF00001), // -NaN (greater than 0xFFFFFFFF in total order; the largest -NaN is 0xFF800001) + Float.NEGATIVE_INFINITY, + -Float.MAX_VALUE, + -1234.5678F, + -Float.MIN_VALUE, + -0.0F, + 0.0F, + Float.MIN_VALUE, + 1234.5678F, + Float.MAX_VALUE, + Float.POSITIVE_INFINITY, + Float.intBitsToFloat(0x7FF00001), // +NaN (less than 0x7FFFFFFF in total order; the smallest +NaN is 0x7F800001) + Float.intBitsToFloat(0x7FFFFFFF), // +NaN (largest) + }; + + testFloatComparator(FLOAT_IEEE_754_TOTAL_ORDER_COMPARATOR, valuesInAscendingOrder); + } + + private void testDoubleComparator(PrimitiveComparator comparator, Double...
valuesInAscendingOrder) { for (int i = 0; i < valuesInAscendingOrder.length; ++i) { for (int j = 0; j < valuesInAscendingOrder.length; ++j) { - Float vi = valuesInAscendingOrder[i]; - Float vj = valuesInAscendingOrder[j]; + Double vi = valuesInAscendingOrder[i]; + Double vj = valuesInAscendingOrder[j]; int exp = i - j; - assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); if (vi != null && vj != null) { - assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi.floatValue(), vj.floatValue())); + assertSignumEquals(vi, vj, exp, comparator.compare(vi.doubleValue(), vj.doubleValue())); } } } - checkThrowingUnsupportedException(FLOAT_COMPARATOR, Float.TYPE); + checkThrowingUnsupportedException(comparator, Double.TYPE); } @Test @@ -212,19 +258,30 @@ public void testDoubleComparator() { Double.POSITIVE_INFINITY }; - for (int i = 0; i < valuesInAscendingOrder.length; ++i) { - for (int j = 0; j < valuesInAscendingOrder.length; ++j) { - Double vi = valuesInAscendingOrder[i]; - Double vj = valuesInAscendingOrder[j]; - int exp = i - j; - assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi, vj)); - if (vi != null && vj != null) { - assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi.doubleValue(), vj.doubleValue())); - } - } - } + testDoubleComparator(DOUBLE_COMPARATOR, valuesInAscendingOrder); + } - checkThrowingUnsupportedException(DOUBLE_COMPARATOR, Double.TYPE); + @Test + public void testDoubleIEEE754TotalOrderComparator() { + Double[] valuesInAscendingOrder = { + null, + Double.longBitsToDouble(0xFFFFFFFFFFFFFFFFL), // -NaN (smallest) + Double.longBitsToDouble(0xFFF0000000000001L), // -NaN (largest) + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -123456.7890123456789, + -Double.MIN_VALUE, + -0.0, + +0.0, + Double.MIN_VALUE, + 123456.7890123456789, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY, + Double.longBitsToDouble(0x7FF0000000000001L), // +NaN (smallest) + 
Double.longBitsToDouble(0x7FFFFFFFFFFFFFFFL), // +NaN (largest) + }; + + testDoubleComparator(DOUBLE_IEEE_754_TOTAL_ORDER_COMPARATOR, valuesInAscendingOrder); } @Test @@ -324,6 +381,34 @@ public void testFloat16Comparator() { } } + @Test + public void testBinaryAsFloat16IEEE754TotalOrderComparator() { + Binary[] valuesInAscendingOrder = { + null, + Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0xff}), // -NaN (smallest) + Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0xfc}), // -NaN (largest) + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0xfc}), // -Infinity + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0xc0}), // -2.0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5 + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}), // -0 + Binary.fromConstantByteArray(new byte[] {0x00, 0x00}), // +0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8 + Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}), // Infinity + Binary.fromConstantByteArray(new byte[] {0x01, 0x7c}), // +NaN (smallest) + Binary.fromConstantByteArray(new byte[] {(byte) 0xff, 0x7f}) // +NaN (largest) + }; + + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Binary vi = valuesInAscendingOrder[i]; + Binary vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, BINARY_AS_FLOAT16_IEEE_754_TOTAL_ORDER_COMPARATOR.compare(vi, vj)); + } + } + } + private void testObjectComparator(PrimitiveComparator comparator, T... 
valuesInAscendingOrder) { for (int i = 0; i < valuesInAscendingOrder.length; ++i) { for (int j = 0; j < valuesInAscendingOrder.length; ++j) { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index fb7e0badec..41a10609ce 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -44,6 +44,11 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.Float16; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; /** * Applies a {@link org.apache.parquet.filter2.predicate.FilterPredicate} to statistics about a group of @@ -99,6 +104,102 @@ private boolean hasNulls(ColumnChunkMetaData column) { return column.getStatistics().getNumNulls() > 0; } + private static boolean isFloatingPointColumn(ColumnChunkMetaData column) { + PrimitiveType type = column.getPrimitiveType(); + PrimitiveTypeName typeName = type.getPrimitiveTypeName(); + return typeName == PrimitiveTypeName.FLOAT + || typeName == PrimitiveTypeName.DOUBLE + || (typeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + && type.getLogicalTypeAnnotation() + instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation); + } + + // check if all non-null values are NaN (nan_count + null_count == value_count) + private boolean isAllNaNs(ColumnChunkMetaData column) { + if (!isFloatingPointColumn(column)) { + return false; + } + Statistics stats = column.getStatistics(); + return stats.isNanCountSet() + && 
stats.isNumNullsSet() + && stats.getNanCount() > 0 + && stats.getNanCount() + stats.getNumNulls() == column.getValueCount(); + } + + private static boolean isNaNLiteral(ColumnChunkMetaData column, Object value) { + if (!isFloatingPointColumn(column)) { + return false; + } + PrimitiveTypeName typeName = column.getPrimitiveType().getPrimitiveTypeName(); + if (typeName == PrimitiveTypeName.FLOAT) { + return Float.isNaN((Float) value); + } + if (typeName == PrimitiveTypeName.DOUBLE) { + return Double.isNaN((Double) value); + } + // Float16: value is Binary, must be exactly 2 bytes + if (!(value instanceof Binary)) { + return false; + } + Binary b = (Binary) value; + if (b.length() != 2) { + return false; + } + return Float16.isNaN(b.get2BytesLittleEndian()); + } + + // check if any of min or max value is NaN + private static boolean hasNaNMinMax(ColumnChunkMetaData column, Statistics stats) { + if (!isFloatingPointColumn(column) || !stats.hasNonNullValue()) { + return false; + } + PrimitiveTypeName typeName = column.getPrimitiveType().getPrimitiveTypeName(); + if (typeName == PrimitiveTypeName.FLOAT) { + return Float.isNaN((Float) stats.genericGetMin()) || Float.isNaN((Float) stats.genericGetMax()); + } + if (typeName == PrimitiveTypeName.DOUBLE) { + return Double.isNaN((Double) stats.genericGetMin()) || Double.isNaN((Double) stats.genericGetMax()); + } + // Float16 + Object minVal = stats.genericGetMin(); + Object maxVal = stats.genericGetMax(); + if (minVal instanceof Binary && maxVal instanceof Binary) { + Binary bMin = (Binary) minVal; + Binary bMax = (Binary) maxVal; + return (bMin.length() == 2 && Float16.isNaN(bMin.get2BytesLittleEndian())) + || (bMax.length() == 2 && Float16.isNaN(bMax.get2BytesLittleEndian())); + } + return false; + } + + private static boolean hasNaNLiteral(ColumnChunkMetaData column, Set values) { + PrimitiveTypeName typeName = column.getPrimitiveType().getPrimitiveTypeName(); + boolean isFloat = typeName == PrimitiveTypeName.FLOAT; + 
boolean isDouble = typeName == PrimitiveTypeName.DOUBLE; + boolean isFloat16 = typeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + && column.getPrimitiveType().getLogicalTypeAnnotation() + instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation; + if (!isFloat && !isDouble && !isFloat16) { + return false; + } + for (T v : values) { + if (v == null) { + continue; + } + if (isFloat) { + if (Float.isNaN((Float) v)) return true; + } else if (isDouble) { + if (Double.isNaN((Double) v)) return true; + } else if (v instanceof Binary) { + Binary b = (Binary) v; + if (b.length() == 2 && Float16.isNaN(b.get2BytesLittleEndian())) { + return true; + } + } + } + return false; + } + @Override @SuppressWarnings("unchecked") public > Boolean visit(Eq eq) { @@ -139,7 +240,15 @@ public > Boolean visit(Eq eq) { return BLOCK_CANNOT_MATCH; } - if (!stats.hasNonNullValue()) { + if (isNaNLiteral(meta, value)) { + return (!stats.isNanCountSet() || stats.getNanCount() > 0) ? BLOCK_MIGHT_MATCH : BLOCK_CANNOT_MATCH; + } + + if (isAllNaNs(meta)) { + return BLOCK_CANNOT_MATCH; + } + + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } @@ -182,11 +291,6 @@ public > Boolean visit(In in) { } } - if (!stats.hasNonNullValue()) { - // stats does not contain min/max values, we cannot drop any chunks - return BLOCK_MIGHT_MATCH; - } - if (stats.isNumNullsSet()) { if (stats.getNumNulls() == 0) { if (values.contains(null) && values.size() == 1) return BLOCK_CANNOT_MATCH; @@ -195,6 +299,20 @@ public > Boolean visit(In in) { } } + // If any value in the IN set is NaN, be conservative + if (hasNaNLiteral(meta, values)) { + return BLOCK_MIGHT_MATCH; + } + + if (isAllNaNs(meta)) { + return (values.contains(null) && hasNulls(meta)) ? 
BLOCK_MIGHT_MATCH : BLOCK_CANNOT_MATCH; + } + + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + MinMax minMax = new MinMax(meta.getPrimitiveType().comparator(), values); T min = minMax.getMin(); T max = minMax.getMax(); @@ -252,7 +370,15 @@ public > Boolean visit(NotEq notEq) { return BLOCK_MIGHT_MATCH; } - if (!stats.hasNonNullValue()) { + if (isNaNLiteral(meta, value)) { + return (stats.isNanCountSet() && stats.getNanCount() == 0) ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH; + } + + if (isAllNaNs(meta)) { + return BLOCK_MIGHT_MATCH; + } + + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } @@ -286,13 +412,17 @@ public > Boolean visit(Lt lt) { return BLOCK_CANNOT_MATCH; } - if (!stats.hasNonNullValue()) { + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = lt.getValue(); + if (isNaNLiteral(meta, value)) { + return BLOCK_MIGHT_MATCH; + } + // drop if value <= min return stats.compareMinToValue(value) >= 0; } @@ -322,13 +452,17 @@ public > Boolean visit(LtEq ltEq) { return BLOCK_CANNOT_MATCH; } - if (!stats.hasNonNullValue()) { + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = ltEq.getValue(); + if (isNaNLiteral(meta, value)) { + return BLOCK_MIGHT_MATCH; + } + // drop if value < min return stats.compareMinToValue(value) > 0; } @@ -358,13 +492,17 @@ public > Boolean visit(Gt gt) { return BLOCK_CANNOT_MATCH; } - if (!stats.hasNonNullValue()) { + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = 
gt.getValue(); + if (isNaNLiteral(meta, value)) { + return BLOCK_MIGHT_MATCH; + } + // drop if value >= max return stats.compareMaxToValue(value) <= 0; } @@ -394,13 +532,17 @@ public > Boolean visit(GtEq gtEq) { return BLOCK_CANNOT_MATCH; } - if (!stats.hasNonNullValue()) { + if (!stats.hasNonNullValue() || hasNaNMinMax(meta, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gtEq.getValue(); + if (isNaNLiteral(meta, value)) { + return BLOCK_MIGHT_MATCH; + } + // drop if value > max return stats.compareMaxToValue(value) < 0; } @@ -462,7 +604,7 @@ private , U extends UserDefinedPredicate> Boolean vis } } - if (!stats.hasNonNullValue()) { + if (!stats.hasNonNullValue() || hasNaNMinMax(columnChunk, stats)) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 60150439a6..03909a7ef5 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -47,6 +47,7 @@ import org.apache.parquet.CorruptStatistics; import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.Preconditions; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.statistics.BinaryStatistics; @@ -86,6 +87,7 @@ import org.apache.parquet.format.GeographyType; import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.GeospatialStatistics; +import org.apache.parquet.format.IEEE754TotalOrder; import org.apache.parquet.format.IntType; import org.apache.parquet.format.KeyValue; import 
org.apache.parquet.format.LogicalType; @@ -143,6 +145,7 @@ public class ParquetMetadataConverter { private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); + private static final IEEE754TotalOrder IEEE_754_TOTAL_ORDER = new IEEE754TotalOrder(); public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k @@ -278,11 +281,23 @@ public FileMetaData toParquetMetadata( private List getColumnOrders(MessageType schema) { List columnOrders = new ArrayList<>(); - // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns with - // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. - for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { + for (ColumnDescriptor column : schema.getColumns()) { ColumnOrder columnOrder = new ColumnOrder(); - columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); + switch (column.getPrimitiveType().columnOrder().getColumnOrderName()) { + case TYPE_DEFINED_ORDER: + columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); + break; + case IEEE_754_TOTAL_ORDER: + columnOrder.setIEEE_754_TOTAL_ORDER(IEEE_754_TOTAL_ORDER); + break; + case UNDEFINED: + // Use TypeDefinedOrder if some types (e.g. INT96) have undefined column orders. + columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); + break; + default: + throw new IllegalArgumentException( + "Unknown column order: " + column.getPrimitiveType().columnOrder()); + } columnOrders.add(columnOrder); } return columnOrders; @@ -804,6 +819,9 @@ public static Statistics toParquetStatistics( // value has been truncated and is a lower bound and not in the page. 
if (!stats.isEmpty() && withinLimit(stats, truncateLength)) { formatStats.setNull_count(stats.getNumNulls()); + if (stats.isNanCountSet()) { + formatStats.setNan_count(stats.getNanCount()); + } if (stats.hasNonNullValue()) { byte[] min; byte[] max; @@ -889,7 +907,8 @@ private static byte[] tuncateMax(BinaryTruncator truncator, int truncateLength, } private static boolean isMinMaxStatsSupported(PrimitiveType type) { - return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER; + return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER + || type.columnOrder().getColumnOrderName() == ColumnOrderName.IEEE_754_TOTAL_ORDER; } /** @@ -958,6 +977,9 @@ static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInte if (formatStats.isSetNull_count()) { statsBuilder.withNumNulls(formatStats.null_count); } + if (formatStats.isSetNan_count()) { + statsBuilder.withNanCount(formatStats.getNan_count()); + } } return statsBuilder.build(); } @@ -2088,6 +2110,9 @@ private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(Colu if (columnOrder.isSetTYPE_ORDER()) { return org.apache.parquet.schema.ColumnOrder.typeDefined(); } + if (columnOrder.isSetIEEE_754_TOTAL_ORDER()) { + return org.apache.parquet.schema.ColumnOrder.ieee754TotalOrder(); + } // The column order is not yet supported by this API return org.apache.parquet.schema.ColumnOrder.undefined(); } @@ -2547,6 +2572,10 @@ public static ColumnIndex toParquetColumnIndex( columnIndex.getMaxValues(), toParquetBoundaryOrder(columnIndex.getBoundaryOrder())); parquetColumnIndex.setNull_counts(columnIndex.getNullCounts()); + List nanCounts = columnIndex.getNanCounts(); + if (nanCounts != null && !nanCounts.isEmpty()) { + parquetColumnIndex.setNan_counts(nanCounts); + } List repLevelHistogram = columnIndex.getRepetitionLevelHistogram(); if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) { 
parquetColumnIndex.setRepetition_level_histograms(repLevelHistogram); @@ -2568,6 +2597,7 @@ public static org.apache.parquet.internal.column.columnindex.ColumnIndex fromPar fromParquetBoundaryOrder(parquetColumnIndex.getBoundary_order()), parquetColumnIndex.getNull_pages(), parquetColumnIndex.getNull_counts(), + parquetColumnIndex.getNan_counts(), parquetColumnIndex.getMin_values(), parquetColumnIndex.getMax_values(), parquetColumnIndex.getRepetition_level_histograms(), diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index 6e1f267e8c..714507a248 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -23,6 +23,7 @@ import static org.apache.parquet.filter2.predicate.FilterApi.contains; import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.floatColumn; import static org.apache.parquet.filter2.predicate.FilterApi.gt; import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; import static org.apache.parquet.filter2.predicate.FilterApi.in; @@ -46,12 +47,14 @@ import java.util.Set; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; import org.apache.parquet.column.statistics.IntStatistics; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.filter2.predicate.LogicalInverseRewriter; import org.apache.parquet.filter2.predicate.Operators; import org.apache.parquet.filter2.predicate.Operators.BinaryColumn; import 
org.apache.parquet.filter2.predicate.Operators.DoubleColumn; +import org.apache.parquet.filter2.predicate.Operators.FloatColumn; import org.apache.parquet.filter2.predicate.Operators.IntColumn; import org.apache.parquet.filter2.predicate.Statistics; import org.apache.parquet.filter2.predicate.UserDefinedPredicate; @@ -59,6 +62,9 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Types; import org.junit.Test; @@ -589,4 +595,363 @@ public void testClearExceptionForNots() { e.getMessage()); } } + + private static final FloatColumn floatCol = floatColumn("float.column"); + + private static ColumnChunkMetaData getFloatColumnMeta( + org.apache.parquet.column.statistics.Statistics stats, long valueCount) { + return ColumnChunkMetaData.get( + ColumnPath.get("float", "column"), + PrimitiveTypeName.FLOAT, + CompressionCodecName.GZIP, + new HashSet<>(List.of(Encoding.PLAIN)), + stats, + 0L, + 0L, + valueCount, + 0L, + 0L); + } + + private static ColumnChunkMetaData getDoubleColumnMetaWithType( + PrimitiveType type, org.apache.parquet.column.statistics.Statistics stats, long valueCount) { + return ColumnChunkMetaData.get( + ColumnPath.get("double", "column"), + type, + CompressionCodecName.GZIP, + null, + new HashSet<>(List.of(Encoding.PLAIN)), + stats, + 0L, + 0L, + valueCount, + 0L, + 0L); + } + + // ========================= Double NaN Tests ========================= + + @Test + public void testNaNDoubleAllNaN() { + // All non-null values are NaN, TYPE_DEFINED_ORDER (no min/max set) + DoubleStatistics allNanStats = new DoubleStatistics(); + allNanStats.setNumNulls(0); + allNanStats.incrementNanCount(177); + + List 
metas = + List.of(getIntColumnMeta(intStats, 177L), getDoubleColumnMeta(allNanStats, 177L)); + + assertTrue(canDrop(eq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(notEq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(lt(doubleColumn, 5.0), metas)); + assertFalse(canDrop(ltEq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(gt(doubleColumn, 5.0), metas)); + assertFalse(canDrop(gtEq(doubleColumn, 5.0), metas)); + assertTrue(canDrop(in(doubleColumn, new HashSet<>(List.of(5.0))), metas)); + + assertFalse(canDrop(eq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(notEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(lt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(ltEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gtEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(Double.NaN))), metas)); + } + + @Test + public void testNaNDoubleMixed() { + // Mixed: nan_count=50, null_count=0, min=10, max=100, valueCount=177 + DoubleStatistics mixedStats = new DoubleStatistics(); + mixedStats.setMinMax(10, 100); + mixedStats.setNumNulls(0); + mixedStats.incrementNanCount(50); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getDoubleColumnMeta(mixedStats, 177L)); + + // Non-NaN literal within range: cannot drop + assertFalse(canDrop(eq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(notEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(lt(doubleColumn, 50.0), metas)); + assertFalse(canDrop(ltEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(gt(doubleColumn, 50.0), metas)); + assertFalse(canDrop(gtEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(50.0))), metas)); + + // NaN literal: NaN values are present so cannot drop + assertFalse(canDrop(eq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(notEq(doubleColumn, Double.NaN), metas)); + 
assertFalse(canDrop(lt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(ltEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gtEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(Double.NaN))), metas)); + } + + @Test + public void testNaNDoubleZeroNaNCount() { + // Explicit zero NaN count with valid min/max + DoubleStatistics zeroNanStats = new DoubleStatistics(); + zeroNanStats.setMinMax(10, 100); + zeroNanStats.setNumNulls(0); + zeroNanStats.incrementNanCount(0); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getDoubleColumnMeta(zeroNanStats, 177L)); + + assertFalse(canDrop(eq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(notEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(lt(doubleColumn, 50.0), metas)); + assertFalse(canDrop(ltEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(gt(doubleColumn, 50.0), metas)); + assertFalse(canDrop(gtEq(doubleColumn, 50.0), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(50.0))), metas)); + + assertTrue(canDrop(eq(doubleColumn, Double.NaN), metas)); + assertTrue(canDrop(notEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(lt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(ltEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gtEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(Double.NaN))), metas)); + } + + @Test + public void testNaNDoubleIeee754TotalOrder() { + // All-NaN with IEEE_754_TOTAL_ORDER — stats built via builder (no min/max) + PrimitiveType ieee754Type = Types.required(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("test_double"); + org.apache.parquet.column.statistics.Statistics allNanStats = + 
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(ieee754Type) + .withNumNulls(0) + .withNanCount(177) + .build(); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getDoubleColumnMetaWithType(ieee754Type, allNanStats, 177L)); + + assertTrue(canDrop(eq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(notEq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(lt(doubleColumn, 5.0), metas)); + assertFalse(canDrop(ltEq(doubleColumn, 5.0), metas)); + assertFalse(canDrop(gt(doubleColumn, 5.0), metas)); + assertFalse(canDrop(gtEq(doubleColumn, 5.0), metas)); + assertTrue(canDrop(in(doubleColumn, new HashSet<>(List.of(5.0))), metas)); + + assertFalse(canDrop(eq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(notEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(lt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(ltEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gt(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(gtEq(doubleColumn, Double.NaN), metas)); + assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(Double.NaN))), metas)); + } + + // ========================= Float NaN Tests ========================= + + @Test + public void testNaNFloatAllNaN() { + // All non-null values are NaN + FloatStatistics allNanStats = new FloatStatistics(); + allNanStats.setNumNulls(0); + allNanStats.incrementNanCount(177); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getFloatColumnMeta(allNanStats, 177L)); + + assertTrue(canDrop(eq(floatCol, 5.0f), metas)); + assertFalse(canDrop(notEq(floatCol, 5.0f), metas)); + assertFalse(canDrop(lt(floatCol, 5.0f), metas)); + assertFalse(canDrop(ltEq(floatCol, 5.0f), metas)); + assertFalse(canDrop(gt(floatCol, 5.0f), metas)); + assertFalse(canDrop(gtEq(floatCol, 5.0f), metas)); + assertTrue(canDrop(in(floatCol, new HashSet<>(List.of(5.0f))), metas)); + + assertFalse(canDrop(eq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(notEq(floatCol, 
Float.NaN), metas)); + assertFalse(canDrop(lt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(ltEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gtEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(Float.NaN))), metas)); + } + + @Test + public void testNaNFloatMixed() { + // Mixed: nan_count=50, min=10, max=100 + FloatStatistics mixedStats = new FloatStatistics(); + mixedStats.setMinMax(10.0f, 100.0f); + mixedStats.setNumNulls(0); + mixedStats.incrementNanCount(50); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getFloatColumnMeta(mixedStats, 177L)); + + assertFalse(canDrop(eq(floatCol, 50.0f), metas)); + assertFalse(canDrop(notEq(floatCol, 50.0f), metas)); + assertFalse(canDrop(lt(floatCol, 50.0f), metas)); + assertFalse(canDrop(ltEq(floatCol, 50.0f), metas)); + assertFalse(canDrop(gt(floatCol, 50.0f), metas)); + assertFalse(canDrop(gtEq(floatCol, 50.0f), metas)); + assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(50.0f))), metas)); + + assertFalse(canDrop(eq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(notEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(lt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(ltEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gtEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(Float.NaN))), metas)); + } + + @Test + public void testNaNFloatZeroNaNCount() { + // Zero NaN count with valid min/max + FloatStatistics zeroNanStats = new FloatStatistics(); + zeroNanStats.setMinMax(10.0f, 100.0f); + zeroNanStats.setNumNulls(0); + zeroNanStats.incrementNanCount(0); + + List metas = + List.of(getIntColumnMeta(intStats, 177L), getFloatColumnMeta(zeroNanStats, 177L)); + + assertFalse(canDrop(eq(floatCol, 50.0f), metas)); + assertFalse(canDrop(notEq(floatCol, 50.0f), metas)); + 
assertFalse(canDrop(lt(floatCol, 50.0f), metas)); + assertFalse(canDrop(ltEq(floatCol, 50.0f), metas)); + assertFalse(canDrop(gt(floatCol, 50.0f), metas)); + assertFalse(canDrop(gtEq(floatCol, 50.0f), metas)); + assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(50.0f))), metas)); + + assertTrue(canDrop(eq(floatCol, Float.NaN), metas)); + assertTrue(canDrop(notEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(lt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(ltEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gt(floatCol, Float.NaN), metas)); + assertFalse(canDrop(gtEq(floatCol, Float.NaN), metas)); + assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(Float.NaN))), metas)); + } + + // ========================= Float16 NaN Tests ========================= + + private static final PrimitiveType FLOAT16_TYPE = Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .named("test_float16"); + + private static final Binary FLOAT16_NAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x7e}); + private static final Binary FLOAT16_ONE = Binary.fromConstantByteArray(new byte[] {0x00, 0x3c}); + private static final Binary FLOAT16_TEN = Binary.fromConstantByteArray(new byte[] {0x00, 0x49}); + private static final Binary FLOAT16_HUNDRED = Binary.fromConstantByteArray(new byte[] {0x40, 0x56}); + private static final Binary FLOAT16_FIFTY = Binary.fromConstantByteArray(new byte[] {0x40, 0x52}); + + private static final Operators.BinaryColumn float16Column = binaryColumn("float16.column"); + + private static ColumnChunkMetaData getFloat16ColumnMeta( + org.apache.parquet.column.statistics.Statistics stats, long valueCount) { + return ColumnChunkMetaData.get( + ColumnPath.get("float16", "column"), + FLOAT16_TYPE, + CompressionCodecName.GZIP, + null, + new HashSet<>(List.of(Encoding.PLAIN)), + stats, + 0L, + 0L, + valueCount, + 0L, + 0L); + } + + @Test + public void testNaNFloat16AllNaN() { + // All 
non-null values are NaN + org.apache.parquet.column.statistics.Statistics allNanStats = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(FLOAT16_TYPE) + .withNumNulls(0) + .withNanCount(177) + .build(); + + ColumnChunkMetaData float16Meta = getFloat16ColumnMeta(allNanStats, 177L); + List metas = List.of(getIntColumnMeta(intStats, 177L), float16Meta); + + assertTrue(canDrop(eq(float16Column, FLOAT16_ONE), metas)); + assertFalse(canDrop(notEq(float16Column, FLOAT16_ONE), metas)); + assertFalse(canDrop(lt(float16Column, FLOAT16_ONE), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_ONE), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_ONE), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_ONE), metas)); + assertTrue(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_ONE))), metas)); + + assertFalse(canDrop(eq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(notEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(lt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_NAN))), metas)); + } + + @Test + public void testNaNFloat16Mixed() { + // Mixed: nan_count=50, min=10, max=100 + org.apache.parquet.column.statistics.Statistics mixedStats = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(FLOAT16_TYPE) + .withMin(FLOAT16_TEN.getBytes()) + .withMax(FLOAT16_HUNDRED.getBytes()) + .withNumNulls(0) + .withNanCount(50) + .build(); + + ColumnChunkMetaData float16Meta = getFloat16ColumnMeta(mixedStats, 177L); + List metas = List.of(getIntColumnMeta(intStats, 177L), float16Meta); + + assertFalse(canDrop(eq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(notEq(float16Column, FLOAT16_FIFTY), metas)); + 
assertFalse(canDrop(lt(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_FIFTY))), metas)); + + assertFalse(canDrop(eq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(notEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(lt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_NAN))), metas)); + } + + @Test + public void testNaNFloat16ZeroNaNCount() { + // Zero NaN count with valid min/max + org.apache.parquet.column.statistics.Statistics zeroNanStats = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(FLOAT16_TYPE) + .withMin(FLOAT16_TEN.getBytes()) + .withMax(FLOAT16_HUNDRED.getBytes()) + .withNumNulls(0) + .withNanCount(0) + .build(); + + ColumnChunkMetaData float16Meta = getFloat16ColumnMeta(zeroNanStats, 177L); + List metas = List.of(getIntColumnMeta(intStats, 177L), float16Meta); + + assertFalse(canDrop(eq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(notEq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(lt(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_FIFTY), metas)); + assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_FIFTY))), metas)); + + assertTrue(canDrop(eq(float16Column, FLOAT16_NAN), metas)); + assertTrue(canDrop(notEq(float16Column, FLOAT16_NAN), metas)); + 
assertFalse(canDrop(lt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(ltEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gt(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(gtEq(float16Column, FLOAT16_NAN), metas)); + assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_NAN))), metas)); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 264017a1f0..d57caa2dd1 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1961,4 +1961,150 @@ public void testEdgeInterpolationAlgorithmConversion() { assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null)); assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null)); } + + @Test + public void testIEEE754TotalOrderColumnOrder() throws IOException { + MessageType schema = Types.buildMessage() + .required(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("float_ieee754") + .required(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("double_ieee754") + .named("Message"); + + org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = + new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null); + ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList()); + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata); + + List columnOrders = formatMetadata.getColumn_orders(); + assertEquals(2, columnOrders.size()); + for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) { + 
assertTrue(columnOrder.isSetIEEE_754_TOTAL_ORDER()); + } + + MessageType resultSchema = + converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema(); + assertEquals( + ColumnOrder.ieee754TotalOrder(), + resultSchema.getType("float_ieee754").asPrimitiveType().columnOrder()); + assertEquals( + ColumnOrder.ieee754TotalOrder(), + resultSchema.getType("double_ieee754").asPrimitiveType().columnOrder()); + } + + @Test + public void testStatisticsNanCountRoundTripFloat() { + PrimitiveType type = Types.required(PrimitiveTypeName.FLOAT).named("test_float"); + FloatStatistics stats = (FloatStatistics) Statistics.createStats(type); + stats.updateStats(1.0f); + stats.updateStats(Float.NaN); + stats.updateStats(3.0f); + stats.updateStats(Float.NaN); + + org.apache.parquet.format.Statistics formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + assertTrue("nan_count should be set", formatStats.isSetNan_count()); + assertEquals(2, formatStats.getNan_count()); + + Statistics roundTrip = ParquetMetadataConverter.fromParquetStatisticsInternal( + Version.FULL_VERSION, formatStats, type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue(roundTrip.isNanCountSet()); + assertEquals(2, roundTrip.getNanCount()); + } + + @Test + public void testStatisticsNanCountRoundTripDouble() { + PrimitiveType type = Types.required(PrimitiveTypeName.DOUBLE).named("test_double"); + DoubleStatistics stats = (DoubleStatistics) Statistics.createStats(type); + stats.updateStats(1.0); + stats.updateStats(Double.NaN); + stats.updateStats(3.0); + + org.apache.parquet.format.Statistics formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + assertTrue("nan_count should be set", formatStats.isSetNan_count()); + assertEquals(1, formatStats.getNan_count()); + + Statistics roundTrip = ParquetMetadataConverter.fromParquetStatisticsInternal( + Version.FULL_VERSION, formatStats, type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue(roundTrip.isNanCountSet()); + 
assertEquals(1, roundTrip.getNanCount()); + } + + @Test + public void testStatisticsNanCountRoundTripFloat16() { + PrimitiveType type = Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(LogicalTypeAnnotation.float16Type()) + .named("test_float16"); + BinaryStatistics stats = (BinaryStatistics) Statistics.createStats(type); + // FLOAT16 1.0 = 0x3C00 + stats.updateStats(Binary.fromConstantByteArray(new byte[] {0x00, 0x3C})); + // FLOAT16 NaN = 0x7E00 + stats.updateStats(Binary.fromConstantByteArray(new byte[] {0x00, 0x7E})); + + org.apache.parquet.format.Statistics formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + assertTrue("nan_count should be set", formatStats.isSetNan_count()); + assertEquals(1, formatStats.getNan_count()); + + Statistics roundTrip = ParquetMetadataConverter.fromParquetStatisticsInternal( + Version.FULL_VERSION, formatStats, type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue(roundTrip.isNanCountSet()); + assertEquals(1, roundTrip.getNanCount()); + } + + @Test + public void testStatisticsNanCountZeroRoundTrip() { + PrimitiveType type = Types.required(PrimitiveTypeName.FLOAT).named("test_float"); + FloatStatistics stats = (FloatStatistics) Statistics.createStats(type); + stats.updateStats(1.0f); + stats.updateStats(2.0f); + + org.apache.parquet.format.Statistics formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + assertTrue("nan_count should be set even when zero", formatStats.isSetNan_count()); + assertEquals(0, formatStats.getNan_count()); + + Statistics roundTrip = ParquetMetadataConverter.fromParquetStatisticsInternal( + Version.FULL_VERSION, formatStats, type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue(roundTrip.isNanCountSet()); + assertEquals(0, roundTrip.getNanCount()); + } + + @Test + public void testColumnIndexNanCountsRoundTrip() { + PrimitiveType type = Types.required(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + 
.named("test_float"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + + // Page 1: mixed NaN and non-NaN + FloatStatistics stats1 = (FloatStatistics) Statistics.createStats(type); + stats1.updateStats(1.0f); + stats1.updateStats(Float.NaN); + stats1.updateStats(3.0f); + builder.add(stats1); + + // Page 2: all nulls + FloatStatistics stats2 = (FloatStatistics) Statistics.createStats(type); + stats2.incrementNumNulls(10); + builder.add(stats2); + + // Page 3: no NaN + FloatStatistics stats3 = (FloatStatistics) Statistics.createStats(type); + stats3.updateStats(5.0f); + stats3.updateStats(10.0f); + builder.add(stats3); + + ColumnIndex columnIndex = builder.build(); + org.apache.parquet.format.ColumnIndex parquetColumnIndex = + ParquetMetadataConverter.toParquetColumnIndex(type, columnIndex); + assertNotNull(parquetColumnIndex); + assertNotNull("nan_counts should be set", parquetColumnIndex.getNan_counts()); + assertEquals(List.of(1L, 0L, 0L), parquetColumnIndex.getNan_counts()); + + ColumnIndex roundTrip = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex); + assertNotNull(roundTrip); + assertEquals(List.of(1L, 0L, 0L), roundTrip.getNanCounts()); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestIeee754TotalOrderE2E.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestIeee754TotalOrderE2E.java new file mode 100644 index 0000000000..66cfbe74ee --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestIeee754TotalOrderE2E.java @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.statistics; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.Preconditions; +import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.GroupFactory; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.schema.ColumnOrder; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import 
org.apache.parquet.schema.Types; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class TestIeee754TotalOrderE2E { + + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + private static final MessageType FLOAT_SCHEMA = Types.buildMessage() + .required(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("float_col") + .named("msg"); + + private static final MessageType DOUBLE_SCHEMA = Types.buildMessage() + .required(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("double_col") + .named("msg"); + + private static final MessageType FLOAT_DOUBLE_SCHEMA = Types.buildMessage() + .required(PrimitiveTypeName.FLOAT) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("float_col") + .required(PrimitiveTypeName.DOUBLE) + .columnOrder(ColumnOrder.ieee754TotalOrder()) + .named("double_col") + .named("msg"); + + private Path newTempPath() throws IOException { + File file = temp.newFile(); + Preconditions.checkArgument(file.delete(), "Could not remove temp file"); + return new Path(file.getAbsolutePath()); + } + + private Path writeFloatFile(float... values) throws IOException { + Path path = newTempPath(); + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(FLOAT_SCHEMA) + .withDictionaryEncoding(false) + .build()) { + GroupFactory factory = new SimpleGroupFactory(FLOAT_SCHEMA); + for (float v : values) { + writer.write(factory.newGroup().append("float_col", v)); + } + } + return path; + } + + private Path writeDoubleFile(double... 
values) throws IOException { + Path path = newTempPath(); + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(DOUBLE_SCHEMA) + .withDictionaryEncoding(false) + .build()) { + GroupFactory factory = new SimpleGroupFactory(DOUBLE_SCHEMA); + for (double v : values) { + writer.write(factory.newGroup().append("double_col", v)); + } + } + return path; + } + + private Path writeFloatDoubleFile(float[] floatValues, double[] doubleValues) throws IOException { + Preconditions.checkArgument(floatValues.length == doubleValues.length, "Arrays must have same length"); + Path path = newTempPath(); + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(FLOAT_DOUBLE_SCHEMA) + .withDictionaryEncoding(false) + .build()) { + GroupFactory factory = new SimpleGroupFactory(FLOAT_DOUBLE_SCHEMA); + for (int i = 0; i < floatValues.length; i++) { + writer.write( + factory.newGroup().append("float_col", floatValues[i]).append("double_col", doubleValues[i])); + } + } + return path; + } + + private List readFloatValues(Path path) throws IOException { + return readFloatValues(path, null); + } + + private List readFloatValues(Path path, FilterCompat.Filter filter) throws IOException { + List result = new ArrayList<>(); + ParquetReader.Builder builder = ParquetReader.builder(new GroupReadSupport(), path); + if (filter != null) { + builder.withFilter(filter); + } + try (ParquetReader reader = builder.build()) { + Group group; + while ((group = reader.read()) != null) { + result.add(group.getFloat("float_col", 0)); + } + } + return result; + } + + private List readDoubleValues(Path path) throws IOException { + return readDoubleValues(path, null); + } + + private List readDoubleValues(Path path, FilterCompat.Filter filter) throws IOException { + List result = new ArrayList<>(); + ParquetReader.Builder builder = ParquetReader.builder(new GroupReadSupport(), path); + if (filter != null) { + builder.withFilter(filter); + } + try (ParquetReader reader = 
builder.build()) { + Group group; + while ((group = reader.read()) != null) { + result.add(group.getDouble("double_col", 0)); + } + } + return result; + } + + @Test + public void testFloatStatisticsWithNaN() throws IOException { + Path path = writeFloatFile(1.0f, Float.NaN, 3.0f, Float.NaN, 5.0f); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + BlockMetaData block = reader.getFooter().getBlocks().get(0); + ColumnChunkMetaData column = block.getColumns().get(0); + + // Verify column order is IEEE 754 + assertEquals( + ColumnOrder.ieee754TotalOrder(), column.getPrimitiveType().columnOrder()); + + // Verify statistics + Statistics stats = column.getStatistics(); + assertNotNull(stats); + assertTrue("nan_count should be set", stats.isNanCountSet()); + assertEquals("nan_count should be 2", 2, stats.getNanCount()); + + // Min/max should exclude NaN + FloatStatistics floatStats = (FloatStatistics) stats; + assertEquals(1.0f, floatStats.getMin(), 0.0f); + assertEquals(5.0f, floatStats.getMax(), 0.0f); + + // Verify column index + ColumnIndex columnIndex = reader.readColumnIndex(column); + assertNotNull("ColumnIndex should not be null for IEEE 754 with NaN", columnIndex); + List nanCounts = columnIndex.getNanCounts(); + assertNotNull("nan_counts should be set in column index", nanCounts); + // All values in one page, so one entry with nan_count = 2 + assertEquals(1, nanCounts.size()); + assertEquals(Long.valueOf(2), nanCounts.get(0)); + + // Verify min/max in column index exclude NaN + List minValues = columnIndex.getMinValues(); + List maxValues = columnIndex.getMaxValues(); + assertEquals(1, minValues.size()); + float ciMin = minValues.get(0).order(ByteOrder.LITTLE_ENDIAN).getFloat(0); + float ciMax = maxValues.get(0).order(ByteOrder.LITTLE_ENDIAN).getFloat(0); + assertEquals(1.0f, ciMin, 0.0f); + assertEquals(5.0f, ciMax, 0.0f); + } + } + + @Test + public void testDoubleStatisticsWithNaN() throws 
IOException { + // Write: -10.0, NaN, 20.0, NaN, NaN + Path path = writeDoubleFile(-10.0, Double.NaN, 20.0, Double.NaN, Double.NaN); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + BlockMetaData block = reader.getFooter().getBlocks().get(0); + ColumnChunkMetaData column = block.getColumns().get(0); + + assertEquals( + ColumnOrder.ieee754TotalOrder(), column.getPrimitiveType().columnOrder()); + + Statistics stats = column.getStatistics(); + assertTrue(stats.isNanCountSet()); + assertEquals(3, stats.getNanCount()); + + DoubleStatistics doubleStats = (DoubleStatistics) stats; + assertEquals(-10.0, doubleStats.getMin(), 0.0); + assertEquals(20.0, doubleStats.getMax(), 0.0); + + ColumnIndex columnIndex = reader.readColumnIndex(column); + assertNotNull(columnIndex); + List nanCounts = columnIndex.getNanCounts(); + assertNotNull(nanCounts); + assertEquals(1, nanCounts.size()); + assertEquals(Long.valueOf(3), nanCounts.get(0)); + } + } + + @Test + public void testFloatStatisticsNoNaN() throws IOException { + Path path = writeFloatFile(1.0f, 2.0f, 3.0f); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + ColumnChunkMetaData column = + reader.getFooter().getBlocks().get(0).getColumns().get(0); + Statistics stats = column.getStatistics(); + + assertTrue(stats.isNanCountSet()); + assertEquals(0, stats.getNanCount()); + + FloatStatistics floatStats = (FloatStatistics) stats; + assertEquals(1.0f, floatStats.getMin(), 0.0f); + assertEquals(3.0f, floatStats.getMax(), 0.0f); + } + } + + @Test + public void testFloatStatisticsAllNaN() throws IOException { + Path path = writeFloatFile(Float.NaN, Float.NaN, Float.NaN); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + ColumnChunkMetaData column = + reader.getFooter().getBlocks().get(0).getColumns().get(0); + Statistics stats = 
column.getStatistics(); + + assertTrue(stats.isNanCountSet()); + assertEquals(3, stats.getNanCount()); + + assertTrue("All-NaN column should have non-null min/max values", stats.hasNonNullValue()); + assertTrue("All-NaN min should be NaN", Float.isNaN(((FloatStatistics) stats).getMin())); + assertTrue("All-NaN max should be NaN", Float.isNaN(((FloatStatistics) stats).getMax())); + } + } + + @Test + public void testFloatDoubleColumnsWithNaN() throws IOException { + float[] floatValues = {1.0f, Float.NaN, 3.0f}; + double[] doubleValues = {-5.0, Double.NaN, 10.0}; + Path path = writeFloatDoubleFile(floatValues, doubleValues); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + BlockMetaData block = reader.getFooter().getBlocks().get(0); + + // Verify float column + ColumnChunkMetaData floatCol = block.getColumns().get(0); + assertEquals( + ColumnOrder.ieee754TotalOrder(), floatCol.getPrimitiveType().columnOrder()); + Statistics floatStats = floatCol.getStatistics(); + assertTrue(floatStats.isNanCountSet()); + assertEquals(1, floatStats.getNanCount()); + assertEquals(1.0f, ((FloatStatistics) floatStats).getMin(), 0.0f); + assertEquals(3.0f, ((FloatStatistics) floatStats).getMax(), 0.0f); + + // Verify double column + ColumnChunkMetaData doubleCol = block.getColumns().get(1); + assertEquals( + ColumnOrder.ieee754TotalOrder(), + doubleCol.getPrimitiveType().columnOrder()); + Statistics doubleStats = doubleCol.getStatistics(); + assertTrue(doubleStats.isNanCountSet()); + assertEquals(1, doubleStats.getNanCount()); + assertEquals(-5.0, ((DoubleStatistics) doubleStats).getMin(), 0.0); + assertEquals(10.0, ((DoubleStatistics) doubleStats).getMax(), 0.0); + + // Verify column indexes for both + ColumnIndex floatCI = reader.readColumnIndex(floatCol); + assertNotNull(floatCI); + assertNotNull(floatCI.getNanCounts()); + assertEquals(Long.valueOf(1), floatCI.getNanCounts().get(0)); + + ColumnIndex doubleCI = 
reader.readColumnIndex(doubleCol); + assertNotNull(doubleCI); + assertNotNull(doubleCI.getNanCounts()); + assertEquals(Long.valueOf(1), doubleCI.getNanCounts().get(0)); + } + } +} diff --git a/pom.xml b/pom.xml index d27788932c..149bc9f9c3 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,7 @@ shaded.parquet 3.3.0 - 2.12.0 + 2.13.0-SNAPSHOT 1.17.0 thrift ${thrift.executable}