Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
package org.apache.parquet.column.statistics;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.ColumnOrder;
import org.apache.parquet.schema.Float16;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

Expand All @@ -28,6 +31,7 @@ public class BinaryStatistics extends Statistics<Binary> {
private static final PrimitiveType DEFAULT_FAKE_TYPE =
Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("fake_binary_type");

private final boolean isFloat16;
private Binary max;
private Binary min;

Expand All @@ -41,26 +45,51 @@ public BinaryStatistics() {

/**
 * Creates statistics for a binary column of the given primitive type.
 *
 * @param type the column's primitive type; its logical type annotation is inspected to decide
 *     whether the FLOAT16 NaN handling is enabled
 */
BinaryStatistics(PrimitiveType type) {
super(type);
// Remember whether the column carries the Float16 logical type annotation;
// updateStats(Binary) consults this flag before applying any NaN checks.
this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
if (isFloat16) {
// NOTE(review): presumably moves the NaN counter from "unset" to an explicit zero so that
// a count is reported for FLOAT16 columns — confirm against Statistics.incrementNanCount.
incrementNanCount(0);
}
}

/**
 * Copy constructor: takes the type and FLOAT16 flag from {@code other} and carries over its
 * min/max (when present), null count and NaN count.
 *
 * @param other the statistics to copy from
 */
private BinaryStatistics(BinaryStatistics other) {
super(other.type());
this.isFloat16 = other.isFloat16;
// min/max are only handed to initializeStats when the source has recorded a non-null value.
if (other.hasNonNullValue()) {
initializeStats(other.min, other.max);
}
setNumNulls(other.getNumNulls());
// NOTE(review): this runs even when isFloat16 is false, whereas the type-based constructor
// only touches the NaN count for FLOAT16 columns — confirm that passing getNanCount() is
// safe when the source never initialised its NaN count.
incrementNanCount(other.getNanCount());
}

/**
 * Folds a single value into the running statistics.
 *
 * <p>For FLOAT16 columns every NaN value is counted. The first value seen seeds both bounds,
 * even when it is NaN. Afterwards, for FLOAT16 columns using IEEE 754 total order, a NaN value
 * never replaces an existing bound, while a non-NaN value replaces a bound that is still NaN or
 * that it exceeds. All other columns use a plain comparator-based min/max update.
 *
 * @param value the non-null value to include in the statistics
 */
@Override
public void updateStats(Binary value) {
  if (isFloat16 && Float16.isNaN(value.get2BytesLittleEndian())) {
    incrementNanCount();
  }

  if (!this.hasNonNullValue()) {
    // First value: it becomes both bounds, NaN or not.
    min = value.copy();
    max = value.copy();
    this.markAsNotEmpty();
    return;
  }

  // The NaN-aware path must be checked before any plain comparison; otherwise a NaN that the
  // comparator orders outside the current range would overwrite a real bound.
  if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
    if (!Float16.isNaN(value.get2BytesLittleEndian())) {
      // A bound that is still NaN (seeded by an earlier NaN) is replaced by the first real value.
      if (Float16.isNaN(min.get2BytesLittleEndian())
          || comparator().compare(min, value) > 0) {
        min = value.copy();
      }
      if (Float16.isNaN(max.get2BytesLittleEndian())
          || comparator().compare(max, value) < 0) {
        max = value.copy();
      }
    }
    return;
  }

  if (comparator().compare(min, value) > 0) {
    min = value.copy();
  } else if (comparator().compare(max, value) < 0) {
    max = value.copy();
  }
}

Expand Down Expand Up @@ -126,6 +155,20 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
*/
@Deprecated
public void updateStats(Binary min_value, Binary max_value) {
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
if (!Float16.isNaN(min_value.get2BytesLittleEndian())) {
if (Float16.isNaN(min.get2BytesLittleEndian()) || comparator().compare(min, min_value) > 0) {
min = min_value.copy();
}
}
if (!Float16.isNaN(max_value.get2BytesLittleEndian())) {
if (Float16.isNaN(max.get2BytesLittleEndian()) || comparator().compare(max, max_value) < 0) {
max = max_value.copy();
}
}
return;
}

if (comparator().compare(min, min_value) > 0) {
min = min_value.copy();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.parquet.column.statistics;

import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.schema.ColumnOrder;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

Expand All @@ -41,6 +42,7 @@ public DoubleStatistics() {

/**
 * Creates statistics for a double column of the given primitive type.
 *
 * @param type the column's primitive type, forwarded to the superclass
 */
DoubleStatistics(PrimitiveType type) {
super(type);
// NOTE(review): presumably moves the NaN counter from "unset" to an explicit zero so that
// a count is always reported for double columns — confirm against Statistics.incrementNanCount.
incrementNanCount(0);
}

private DoubleStatistics(DoubleStatistics other) {
Expand All @@ -49,10 +51,14 @@ private DoubleStatistics(DoubleStatistics other) {
initializeStats(other.min, other.max);
}
setNumNulls(other.getNumNulls());
incrementNanCount(other.getNanCount());
}

@Override
public void updateStats(double value) {
if (Double.isNaN(value)) {
incrementNanCount();
}
if (!this.hasNonNullValue()) {
initializeStats(value, value);
} else {
Expand Down Expand Up @@ -98,6 +104,20 @@ public boolean isSmallerThan(long size) {
}

public void updateStats(double min_value, double max_value) {
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
if (!Double.isNaN(min_value)) {
if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
min = min_value;
}
}
if (!Double.isNaN(max_value)) {
if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
max = max_value;
}
}
return;
}

if (comparator().compare(min, min_value) > 0) {
min = min_value;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.parquet.column.statistics;

import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.schema.ColumnOrder;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

Expand All @@ -42,6 +43,7 @@ public FloatStatistics() {

/**
 * Creates statistics for a float column of the given primitive type.
 *
 * @param type the column's primitive type, forwarded to the superclass
 */
FloatStatistics(PrimitiveType type) {
super(type);
// NOTE(review): presumably moves the NaN counter from "unset" to an explicit zero so that
// a count is always reported for float columns — confirm against Statistics.incrementNanCount.
incrementNanCount(0);
}

private FloatStatistics(FloatStatistics other) {
Expand All @@ -50,10 +52,14 @@ private FloatStatistics(FloatStatistics other) {
initializeStats(other.min, other.max);
}
setNumNulls(other.getNumNulls());
incrementNanCount(other.getNanCount());
}

@Override
public void updateStats(float value) {
if (Float.isNaN(value)) {
incrementNanCount();
}
if (!this.hasNonNullValue()) {
initializeStats(value, value);
} else {
Expand Down Expand Up @@ -99,6 +105,20 @@ public boolean isSmallerThan(long size) {
}

public void updateStats(float min_value, float max_value) {
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
if (!Float.isNaN(min_value)) {
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
min = min_value;
}
}
Comment on lines +109 to +113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!Float.isNaN(min_value)) {
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
min = min_value;
}
}
if (!Float.isNaN(min_value)) {
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
min = min_value;
}
} else if (Float.isNaN(min) && comparator().compare(min, min_value) > 0) {
min = min_value;
}

IIUC the wording of the most recent proposal is that the statistics must contain the min and max NaN values for an all-NaN page/chunk. I think you need to still update the stats for an incoming NaN if the current min/max is NaN.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a comment from @rdblue to simply use a standard NaN: apache/parquet-format#514 (comment)

IMO, NaN values are currently just sentinels for min/max that indicate an all-NaN page/chunk. We should be conservative and not depend on the order of NaN values for filtering.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there's the comment, but the wording hasn't yet changed. And given the other comments in that thread I'm not sure it will be changed. But at this stage in the process I guess it doesn't matter.

if (!Float.isNaN(max_value)) {
if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
max = max_value;
}
}
return;
}

if (comparator().compare(min, min_value) > 0) {
min = min_value;
}
Expand Down
Loading
Loading