diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java index e78b2ceae1..bd9073e262 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java @@ -616,6 +616,9 @@ public static ColumnIndex build( builder.fill(nullPages, nullCounts, minValues, maxValues, repLevelHistogram, defLevelHistogram); ColumnIndexBase columnIndex = builder.build(type); + if (columnIndex == null) { + return null; + } columnIndex.boundaryOrder = requireNonNull(boundaryOrder); return columnIndex; } @@ -750,6 +753,19 @@ private ColumnIndexBase build(PrimitiveType type) { if (!nullCounts.isEmpty()) { columnIndex.nullCounts = nullCounts.toLongArray(); } + // A null_pages entry of true indicates the page consists entirely of null values. + // When null_counts is present, a null_counts entry of zero means the page has no + // null values at all. These two fields directly contradict each other: a page + // cannot both consist entirely of nulls and contain zero nulls. Since null_pages + // is the field that causes the reader to skip pages during column index filtering, + // the safe response is to discard the column index for this column entirely. + if (columnIndex.nullCounts != null) { + for (int i = 0; i < columnIndex.nullPages.length; i++) { + if (columnIndex.nullPages[i] && columnIndex.nullCounts[i] <= 0L) { + return null; + } + } + } columnIndex.pageIndexes = pageIndexes.toIntArray(); // Repetition and definition level histograms are optional so keep them null if the builder has no values if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) { diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java index 415a21e335..658db8593f 100644 --- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java @@ -250,9 +250,11 @@ public String toString() { RAND_DESCENDING = (ColumnIndexBase) builder.build(); builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); - // Add a couple of empty stats so column index will contain null pages only + // Add a couple of all-null stats so column index will contain null pages only for (int i = 0; i < 10; ++i) { - builder.add(Statistics.createStats(TYPE)); + Statistics nullStats = Statistics.createStats(TYPE); + nullStats.incrementNumNulls(10); + builder.add(nullStats); } ALL_NULL_PAGES = (ColumnIndexBase) builder.build(); } diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java index 6274f36263..96c66ea52e 100644 --- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java @@ -904,11 +904,11 @@ public void testStaticBuildBoolean() { Types.required(BOOLEAN).named("test_boolean"), BoundaryOrder.DESCENDING, List.of(false, true, false, true, false, true), - List.of(9l, 8l, 7l, 6l, 5l, 0l), + List.of(9l, 8l, 7l, 6l, 5l, 4l), toBBList(false, null, false, null, true, null), toBBList(true, null, false, null, true, null)); assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); - assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0); + assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 4); assertCorrectNullPages(columnIndex, false, true, false, true, false, true); assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null, true, null); assertCorrectValues(columnIndex.getMinValues(), false, null, false, null, true, null); @@ -1865,4 +1865,152 @@ long getMinMaxSize() { private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate predicate, int... expectedIndexes) { TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes); } + + @Test + public void testBuildReturnsNullForNullPageCountContradiction() { + // null_pages[i]=true indicates the page is entirely null, but null_counts[i]=0 + // says there are zero null values. This contradiction indicates invalid metadata. + // build() should return null to prevent incorrect page skipping. + PrimitiveType type = Types.required(INT32).named("test_col"); + + // Pages 1-3 have null_pages=true with null_counts=0 — contradictory + assertNull( + "Column index with null_pages=true and null_counts=0 should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, true, true, true), + List.of(0L, 0L, 0L, 0L), + toBBList(Integer.valueOf(-99), null, null, null), + toBBList(Integer.valueOf(5), null, null, null))); + + // Contradiction on a single page (last page) should also be rejected + assertNull( + "Single contradictory page should cause rejection", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, false, true), + List.of(0L, 5L, 0L), + toBBList(Integer.valueOf(1), Integer.valueOf(50), null), + toBBList(Integer.valueOf(49), Integer.valueOf(99), null))); + + // Contradiction on the first page + assertNull( + "Contradictory first page should cause rejection", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, false, false), + List.of(0L, 0L, 5L), + toBBList(null, Integer.valueOf(1), Integer.valueOf(50)), + toBBList(null, Integer.valueOf(49), Integer.valueOf(99)))); + + // Single page with contradiction + assertNull( + "Single-page column index with contradiction should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true), + List.of(0L), + toBBList((Integer) null), + toBBList((Integer) null))); + + // All pages are contradictory null pages + assertNull( + "All-contradictory column index should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, true, true), + List.of(0L, 0L, 0L), + toBBList((Integer) null, null, null), + toBBList((Integer) null, null, null))); + } + + @Test + public void testBuildPreservesValidColumnIndex() { + PrimitiveType type = Types.required(INT32).named("test_col"); + + // Legitimate null page: null_pages=true with null_counts > 0 — valid + ColumnIndex ci = ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, true, false), + List.of(0L, 100L, 0L), + toBBList(Integer.valueOf(1), null, Integer.valueOf(50)), + toBBList(Integer.valueOf(49), null, Integer.valueOf(99))); + assertCorrectNullPages(ci, false, true, false); + assertCorrectNullCounts(ci, 0, 100, 0); + assertCorrectValues(ci.getMinValues(), 1, null, 50); + assertCorrectValues(ci.getMaxValues(), 49, null, 99); + + // All non-null pages — valid + ColumnIndex ci2 = ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, false, false), + List.of(0L, 5L, 10L), + toBBList(Integer.valueOf(1), Integer.valueOf(50), Integer.valueOf(100)), + toBBList(Integer.valueOf(49), Integer.valueOf(99), Integer.valueOf(150))); + assertCorrectNullPages(ci2, false, false, false); + assertCorrectNullCounts(ci2, 0, 5, 10); + + // Single non-null page + ColumnIndex ci3 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false), + List.of(0L), + toBBList(Integer.valueOf(42)), + toBBList(Integer.valueOf(42))); + assertCorrectNullPages(ci3, false); + assertCorrectNullCounts(ci3, 0); + + // Single legitimate all-null page (null_pages=true, null_counts > 0) + ColumnIndex ci4 = ColumnIndexBuilder.build( + type, BoundaryOrder.UNORDERED, List.of(true), List.of(50L), toBBList((Integer) null), toBBList((Integer) + null)); + assertCorrectNullPages(ci4, true); + assertCorrectNullCounts(ci4, 50); + + // All pages legitimately null + ColumnIndex ci5 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, true, true), + List.of(10L, 20L, 30L), + toBBList((Integer) null, null, null), + toBBList((Integer) null, null, null)); + assertCorrectNullPages(ci5, true, true, true); + assertCorrectNullCounts(ci5, 10, 20, 30); + + // Boundary: null_counts=1 on a null page (minimum valid value) — should NOT be rejected + ColumnIndex ci6 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, true), + List.of(0L, 1L), + toBBList(Integer.valueOf(1), null), + toBBList(Integer.valueOf(99), null)); + assertCorrectNullPages(ci6, false, true); + assertCorrectNullCounts(ci6, 0, 1); + } + + @Test + public void testBuildWithoutNullCountsIsNotRejected() { + PrimitiveType type = Types.required(INT32).named("test_col"); + + // null_counts absent (optional field) — cannot detect contradiction, should build normally + ColumnIndex ci = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, true, true), + null, + toBBList(Integer.valueOf(1), null, null), + toBBList(Integer.valueOf(99), null, null)); + assertCorrectNullPages(ci, false, true, true); + assertNull("null_counts should be null when not provided", ci.getNullCounts()); + } }