diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java index 53526ae8d0..99bb13c20f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java @@ -79,6 +79,10 @@ public abstract class DictionaryValuesWriter extends ValuesWriter implements Req /* will become true if the dictionary becomes too big */ protected boolean dictionaryTooBig; + /* set to true when the dictionary exceeds maxDictionaryByteSize or MAX_DICTIONARY_ENTRIES, + * checked by shouldFallBack() to avoid repeated virtual dispatch to getDictionarySize() on every write */ + private boolean dictionarySizeExceeded; + /* current size in bytes the dictionary will take once serialized */ protected long dictionaryByteSize; @@ -121,8 +125,20 @@ protected DictionaryPage dictPage(ValuesWriter dictPageWriter) { @Override public boolean shouldFallBack() { - // if the dictionary reaches the max byte size or the values can not be encoded on 4 bytes anymore. - return dictionaryByteSize > maxDictionaryByteSize || getDictionarySize() > MAX_DICTIONARY_ENTRIES; + return dictionarySizeExceeded; + } + + /** + * Called by subclass write methods after adding a new dictionary entry to check if the dictionary + * has exceeded its size limits. This avoids the per-value virtual dispatch overhead of calling + * getDictionarySize() on every write -- the check only runs when a new entry is actually added. + * + * @param newDictionarySize the current dictionary size after adding the new entry + */ + protected void checkDictionarySizeLimit(int newDictionarySize) { + if (dictionaryByteSize > maxDictionaryByteSize || newDictionarySize > MAX_DICTIONARY_ENTRIES) { + dictionarySizeExceeded = true; + } } @Override @@ -208,6 +224,7 @@ public void resetDictionary() { lastUsedDictionaryByteSize = 0; lastUsedDictionarySize = 0; dictionaryTooBig = false; + dictionarySizeExceeded = false; clearDictionaryContent(); } @@ -250,6 +267,7 @@ public void writeBytes(Binary v) { binaryDictionaryContent.put(v.copy(), id); // length as int (4 bytes) + actual bytes dictionaryByteSize += 4L + v.length(); + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } @@ -320,6 +338,7 @@ public void writeBytes(Binary value) { id = binaryDictionaryContent.size(); binaryDictionaryContent.put(value.copy(), id); dictionaryByteSize += length; + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } @@ -364,6 +383,7 @@ public void writeLong(long v) { id = longDictionaryContent.size(); longDictionaryContent.put(v, id); dictionaryByteSize += 8; + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } @@ -435,6 +455,7 @@ public void writeDouble(double v) { id = doubleDictionaryContent.size(); doubleDictionaryContent.put(v, id); dictionaryByteSize += 8; + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } @@ -506,6 +527,7 @@ public void writeInteger(int v) { id = intDictionaryContent.size(); intDictionaryContent.put(v, id); dictionaryByteSize += 4; + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } @@ -578,6 +600,7 @@ public void writeFloat(float v) { id = floatDictionaryContent.size(); floatDictionaryContent.put(v, id); dictionaryByteSize += 4; + checkDictionarySizeLimit(id + 1); } encodedValues.add(id); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index a91f807e73..63b7d185c7 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -681,6 +681,29 @@ public void testZeroValues() throws IOException { } } + @Test + public void testCheckDictionarySizeLimitExceedsByEntryCount() { + // Use a large maxDictionaryByteSize so only the entry-count limit can trigger + int maxDictionaryByteSize = Integer.MAX_VALUE; + try (PlainIntegerDictionaryValuesWriter writer = new PlainIntegerDictionaryValuesWriter( + maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY, allocator)) { + + assertFalse("should not fall back initially", writer.shouldFallBack()); + + // At the limit (Integer.MAX_VALUE - 1 entries): should NOT trigger + writer.checkDictionarySizeLimit(Integer.MAX_VALUE - 1); + assertFalse("should not fall back when entry count equals MAX_DICTIONARY_ENTRIES", writer.shouldFallBack()); + + // Exceeding the limit (Integer.MAX_VALUE entries): should trigger + writer.checkDictionarySizeLimit(Integer.MAX_VALUE); + assertTrue("should fall back when entry count exceeds MAX_DICTIONARY_ENTRIES", writer.shouldFallBack()); + + // resetDictionary clears the flag + writer.resetDictionary(); + assertFalse("should not fall back after resetDictionary", writer.shouldFallBack()); + } + } + @Test public void testBooleanDictionary() throws IOException { // Create a dictionary page with boolean values (false, true)