From ac476d6406b30973b6ff7f0e5700485973263348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Tue, 21 Apr 2026 16:41:13 +0000 Subject: [PATCH] GH-3516: Optimize DeltaByteArrayWriter / DeltaLengthByteArrayValuesWriter Two related changes in the DELTA_BYTE_ARRAY write path: 1. DeltaLengthByteArrayValuesWriter: drop the unnecessary LittleEndianDataOutputStream wrapper. Binary.writeTo(arrayOut) works directly with the underlying CapacityByteArrayOutputStream; the LE wrapper added an extra layer of dispatch on every value but never used any LE functionality (writeInt/writeLong/etc.). Add a new writeBytes(byte[], int, int) overload so callers that already have the raw bytes can avoid allocating a Binary wrapper. 2. DeltaByteArrayWriter: tighten suffixWriter field type to DeltaLengthByteArrayValuesWriter (it's always constructed as one) so the new writeBytes(byte[], int, int) overload is callable. Replace the suffix call with the raw-bytes overload, eliminating the per-value Binary.slice() allocation. Benchmark results (BinaryEncodingBenchmark.encodeDeltaByteArray and encodeDeltaLengthByteArray, added in #3512): - encodeDeltaByteArray (LOW cardinality, len=10): +33% to +55% - encodeDeltaLengthByteArray (LOW cardinality, len=10): +18% to +21% - long-string cases: flat (per-value alloc amortized away) No breaking public API change: the only API difference is the additive public overload DeltaLengthByteArrayValuesWriter.writeBytes(byte[], int, int). No file format change. Validation: parquet-column 573 tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true. 
--- .../DeltaLengthByteArrayValuesWriter.java | 20 ++++++++++--------- .../deltastrings/DeltaByteArrayWriter.java | 6 ++++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java index ac63ff52ef..f3c33dc417 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java @@ -22,7 +22,6 @@ import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; -import org.apache.parquet.bytes.LittleEndianDataOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter; @@ -46,11 +45,9 @@ public class DeltaLengthByteArrayValuesWriter extends ValuesWriter { private ValuesWriter lengthWriter; private CapacityByteArrayOutputStream arrayOut; - private LittleEndianDataOutputStream out; public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBufferAllocator allocator) { arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, allocator); - out = new LittleEndianDataOutputStream(arrayOut); lengthWriter = new DeltaBinaryPackingValuesWriterForInteger( DeltaBinaryPackingValuesWriter.DEFAULT_NUM_BLOCK_VALUES, DeltaBinaryPackingValuesWriter.DEFAULT_NUM_MINIBLOCKS, @@ -63,12 +60,22 @@ public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBuffe public void writeBytes(Binary v) { try { lengthWriter.writeInteger(v.length()); - v.writeTo(out); + v.writeTo(arrayOut); } catch 
(IOException e) { throw new ParquetEncodingException("could not write bytes", e); } } + /** + * Writes raw bytes directly, avoiding Binary object creation overhead. + * Used by {@link org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter} + * to write suffix bytes without creating an intermediate Binary.slice(). + */ + public void writeBytes(byte[] data, int offset, int length) { + lengthWriter.writeInteger(length); + arrayOut.write(data, offset, length); + } + @Override public long getBufferedSize() { return lengthWriter.getBufferedSize() + arrayOut.size(); @@ -76,11 +83,6 @@ public long getBufferedSize() { @Override public BytesInput getBytes() { - try { - out.flush(); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page", e); - } LOG.debug("writing a buffer of size {}", arrayOut.size()); return BytesInput.concat(lengthWriter.getBytes(), BytesInput.from(arrayOut)); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java index c234108613..18b01bdf4f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java @@ -37,7 +37,7 @@ public class DeltaByteArrayWriter extends ValuesWriter { private ValuesWriter prefixLengthWriter; - private ValuesWriter suffixWriter; + private DeltaLengthByteArrayValuesWriter suffixWriter; private byte[] previous; public DeltaByteArrayWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -95,7 +95,9 @@ public void writeBytes(Binary v) { for (i = 0; (i < length) && (previous[i] == vb[i]); i++) ; prefixLengthWriter.writeInteger(i); - suffixWriter.writeBytes(v.slice(i, vb.length - i)); + // Write suffix bytes directly from the byte array, avoiding 
Binary.slice() allocation + // and the virtual dispatch chain through Binary.writeTo() + suffixWriter.writeBytes(vb, i, vb.length - i); previous = vb; } }