apache · andygrove · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
@@ -408,6 +408,7 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometRegExpJvmSuite
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite

diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
@@ -248,6 +248,7 @@ jobs:
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite
+              org.apache.comet.CometRegExpJvmSuite
               org.apache.comet.CometCodegenSuite
               org.apache.comet.CometCodegenSourceSuite
               org.apache.comet.CometCodegenHOFSuite

diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,4 @@ output
 docs/comet-*/
 docs/build/
 docs/temp/
+docs/superpowers/
diff --git a/docs/source/user-guide/latest/compatibility/regex.md b/docs/source/user-guide/latest/compatibility/regex.md
@@ -19,6 +19,117 @@ under the License.
 
 # Regular Expressions
 
-Comet uses the Rust regexp crate for evaluating regular expressions, and this has different behavior from Java's
-regular expression engine. Comet will fall back to Spark for patterns that are known to produce different results, but
-this can be overridden by setting `spark.comet.expression.regexp.allowIncompatible=true`.
+Comet provides two regexp engines for evaluating regular expressions: a **Rust engine** that uses the Rust
+[`regex`] crate natively, and a **Java engine** that runs Spark's own `doGenCode` for the
+expression inside Comet's Arrow-direct codegen dispatcher (the same dispatcher used by Comet's
+`ScalaUDF` codegen path). The engine is selected with `spark.comet.exec.regexp.engine`, which accepts:
+
+- `java` (default) — route through the Java engine for full Spark compatibility. Requires
+  `spark.comet.exec.scalaUDF.codegen.enabled=true`; otherwise regex expressions fall back to Spark with
+  an explanatory message.
+- `rust` — run the Rust engine when an expression has a native implementation. Setting this is itself
+  the opt-in for the semantic differences between Java and Rust regex (no separate `allowIncompatible`
+  flag needed). Expressions without a native Rust implementation (`regexp_extract`,
+  `regexp_extract_all`, `regexp_instr`) fall through to the Java engine so users still get Comet
+  acceleration with full Spark semantics.
+
+With `engine=java` and `scalaUDF.codegen.enabled=true`, all regex expressions run on the Comet
+path with full Spark compatibility.
+
+## Disabling Comet for individual regex expressions
+
+Each regex expression has a per-class `spark.comet.expression.<ClassName>.enabled` flag (default
+`true`) that disables Comet's serde for that expression and forces a Spark fallback. This is
+useful for narrowing a regression or comparing performance on a single operator without changing
+the engine selector:
+
+| Expression           | Config                                                  |
+| -------------------- | ------------------------------------------------------- |
+| `rlike`              | `spark.comet.expression.RLike.enabled=false`            |
+| `regexp_extract`     | `spark.comet.expression.RegExpExtract.enabled=false`    |
+| `regexp_extract_all` | `spark.comet.expression.RegExpExtractAll.enabled=false` |
+| `regexp_instr`       | `spark.comet.expression.RegExpInStr.enabled=false`      |
+| `regexp_replace`     | `spark.comet.expression.RegExpReplace.enabled=false`    |
+| `split`              | `spark.comet.expression.StringSplit.enabled=false`      |
+
+## Choosing an engine
+
+|                      | Rust engine                                                                                                         | Java engine (default)                                                                                               |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| **Compatibility**    | Differs from Java regex (see below)                                                                                 | 100% compatible with Spark                                                                                          |
+| **Feature coverage** | `rlike`, `regexp_replace`, `split` natively; `regexp_extract`, `regexp_extract_all`, `regexp_instr` via fallthrough | All regexp expressions (`rlike`, `regexp_extract`, `regexp_extract_all`, `regexp_instr`, `regexp_replace`, `split`) |
+| **Performance**      | Fully native, no JNI overhead                                                                                       | One JNI round-trip per batch (Arrow vectors stay columnar)                                                          |
+| **Pattern support**  | Linear-time subset only                                                                                             | All Java regex features (backreferences, lookaround, etc.)                                                          |
+
+The **Rust engine** is faster but cannot match Java regex semantics for every pattern. Because the engine
+choice is itself the opt-in, setting `spark.comet.exec.regexp.engine=rust` declares acceptance of those
+differences without a separate per-expression flag.
+
+The **Java engine** is the default and is gated behind `spark.comet.exec.scalaUDF.codegen.enabled`
+so the codegen dispatcher can be disabled globally without changing the regex engine selector.
+
+## Why the engines differ
+
+Java's [`java.util.regex`] is a backtracking engine in the Perl/PCRE family. It supports the full range of
+features that style of engine provides, including some whose worst-case running time grows exponentially with
+the input.
+
+Rust's [`regex`] crate is a finite-automaton engine in the [RE2] family. It deliberately omits features that
+cannot be implemented with a guarantee of linear-time matching. In exchange, every pattern it does accept runs
+in time linear in the size of the input. This is the same trade-off RE2, Go's `regexp`, and several other
+engines make.
+
+The practical consequence is that Java accepts a strictly larger set of patterns than the Rust engine, and
+several constructs that look the same in source have different semantics on the two sides.
+
+## Features supported by Java but not by the Rust engine
+
+Patterns that use any of the following will not compile in Comet's Rust engine and must run on Spark (or use
+the Java engine):
+
+- **Backreferences** such as `\1`, `\2`, or `\k<name>`. The Rust engine has no backtracking and cannot match
+  a previously captured group.
+- **Lookaround**, including lookahead (`(?=...)`, `(?!...)`) and lookbehind (`(?<=...)`, `(?<!...)`).
+- **Atomic groups** (`(?>...)`).
+- **Possessive quantifiers** (`*+`, `++`, `?+`, `{n,m}+`). Rust supports greedy and lazy quantifiers but not
+  possessive.
+- **Embedded code, conditionals, and recursion** such as `(?(cond)yes|no)` or `(?R)`. Rust accepts none of
+  these.
+
+## Features that exist on both sides but behave differently
+
+Even where both engines accept a construct, the matching behavior is not always the same.
+
+- **Unicode-aware character classes.** In the Rust engine, `\d`, `\w`, `\s`, and `.` are Unicode-aware by
+  default, so `\d` matches every digit codepoint defined by Unicode rather than only `0`-`9`. Java's defaults
+  match ASCII only and require the `UNICODE_CHARACTER_CLASS` flag (or `(?U)` inline) to switch to Unicode
+  semantics. The same pattern can therefore match a different set of characters on each side.
+- **Line terminators.** In multiline mode, Java treats `\r`, `\n`, `\r\n`, and a few additional Unicode line
+  separators as line boundaries by default. The Rust engine treats only `\n` as a line boundary unless CRLF
+  mode is enabled. `^`, `$`, and `.` (with `(?s)` off) all depend on this definition.
+- **Case-insensitive matching.** Both engines support `(?i)`, but Java's default is ASCII case folding while
+  the Rust engine uses full Unicode simple case folding when Unicode mode is on. Patterns that match characters
+  outside ASCII can produce different results.
+- **POSIX character classes.** The Rust engine supports `[[:alpha:]]` style POSIX classes inside bracket
+  expressions but not Java's `\p{Alpha}` shorthand. Java supports `\p{Alpha}` but reads `[[:alpha:]]` as a set
+  of literal characters. Unicode property escapes (`\p{L}`, `\p{Greek}`, etc.) differ slightly in coverage.
+- **Octal and Unicode escapes.** Java accepts `\0nnn` for octal and `\uXXXX` for a BMP codepoint. Rust uses
+  `\x{...}` for arbitrary codepoints and does not accept Java's bare `\uXXXX` form.
+- **Empty matches in `split`.** Spark's `StringSplit`, which is built on Java's regex, includes leading empty
+  strings produced by zero-width matches at the start of the input. The Rust engine's `split` follows different
+  rules, so split results can differ in edge cases involving empty matches even when the pattern itself is
+  identical on both sides.
+
+## When the Rust engine is safe
+
+For most ASCII-only, non-anchored patterns that use only literal characters, simple character classes, and
+ordinary quantifiers, the two engines produce the same results. If you are confident your patterns fit this
+shape and want to avoid the JNI overhead of the Java engine, switching to the Rust engine with
+`spark.comet.exec.regexp.engine=rust` is generally safe.
+
+For anything that uses backreferences, lookaround, or relies on Java's specific Unicode or line-handling
+defaults, use the Java engine.
+
+[`java.util.regex`]: https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
+[`regex`]: https://docs.rs/regex/latest/regex/
+[RE2]: https://github.com/google/re2/wiki/Syntax
diff --git a/pom.xml b/pom.xml
@@ -1170,6 +1170,7 @@ under the License.
             <exclude>native/proto/src/generated/**</exclude>
             <exclude>benchmarks/tpc/queries/**</exclude>
             <exclude>.claude/**</exclude>
+            <exclude>docs/superpowers/**</exclude>
           </excludes>
         </configuration>
       </plugin>

diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala
@@ -369,10 +369,34 @@ object CometConf extends ShimCometConf {
         "Arrow-direct codegen dispatcher. When enabled, a supported ScalaUDF is compiled into " +
         "a per-batch kernel that reads and writes Arrow vectors directly from native " +
         "execution. When disabled, plans containing a ScalaUDF fall back to Spark for the " +
-        "enclosing operator.")
+        "enclosing operator. The same dispatcher backs `spark.comet.exec.regexp.engine=java` " +
+        "so the regex family routes through it as well.")
       .booleanConf
       .createWithDefault(false)
 
+  val REGEXP_ENGINE_RUST = "rust"
+  val REGEXP_ENGINE_JAVA = "java"
+
+  val COMET_REGEXP_ENGINE: ConfigEntry[String] =
+    conf("spark.comet.exec.regexp.engine")
+      .category(CATEGORY_EXEC)
+      .doc(
+        "Selects the engine used to evaluate Spark regular-expression expressions. " +
+          s"`$REGEXP_ENGINE_JAVA` (default) routes through the Arrow-direct codegen dispatcher " +
+          "so Spark's own `doGenCode` (backed by `java.util.regex.Pattern`) runs inside the " +
+          s"Comet pipeline; this requires ${COMET_SCALA_UDF_CODEGEN_ENABLED.key}=true and " +
+          s"falls back to Spark otherwise. `$REGEXP_ENGINE_RUST` runs the " +
+          "native DataFusion regexp engine when an implementation exists; setting this is " +
+          "itself the opt-in for the semantic differences between Java and Rust regex. " +
+          "Expressions without a native Rust implementation (`regexp_extract`, " +
+          "`regexp_extract_all`, `regexp_instr`) fall through to the JVM codegen dispatcher " +
+          s"under `$REGEXP_ENGINE_RUST` so users still get Comet acceleration with full " +
+          "Spark semantics.")
+      .stringConf
+      .transform(_.toLowerCase(Locale.ROOT)) // lower-case the value, locale-independently
+      .checkValues(Set(REGEXP_ENGINE_RUST, REGEXP_ENGINE_JAVA)) // only the two engine names
+      .createWithDefault(REGEXP_ENGINE_JAVA) // the Java engine is the default
+
   val COMET_EXEC_SHUFFLE_WITH_HASH_PARTITIONING_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.native.shuffle.partitioning.hash.enabled")
       .category(CATEGORY_SHUFFLE)

diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
@@ -313,7 +313,7 @@ object GenerateDocs {
                 annotations += ((fromTypeName, toTypeName, note.trim.replace("(10,2)", "")))
               }
               "C"
-            case Incompatible(notes) =>
+            case Incompatible(notes, _) =>
               notes.filter(_.trim.nonEmpty).foreach { note =>
                 annotations += ((fromTypeName, toTypeName, note.trim.replace("(10,2)", "")))
               }

diff --git a/spark/src/main/scala/org/apache/comet/expressions/RegExp.scala b/spark/src/main/scala/org/apache/comet/expressions/RegExp.scala
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
@@ -723,7 +723,7 @@ case class CometExecRule(session: SparkSession)
         case Unsupported(notes) =>
           withInfo(op, notes.getOrElse(""))
           false
-        case Incompatible(notes) =>
+        case Incompatible(notes, _) =>
           val allowIncompat = CometConf.isOperatorAllowIncompat(opName)
           val incompatConf = CometConf.getOperatorAllowIncompatConfigKey(opName)
           if (allowIncompat) {

diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -186,6 +186,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
       classOf[Like] -> CometLike,
       classOf[Lower] -> CometLower,
       classOf[OctetLength] -> CometScalarFunction("octet_length"),
+      classOf[RegExpExtract] -> CometRegExpExtract,
+      classOf[RegExpExtractAll] -> CometRegExpExtractAll,
+      classOf[RegExpInStr] -> CometRegExpInStr,
       classOf[RegExpReplace] -> CometRegExpReplace,
       classOf[Reverse] -> CometReverse,
       classOf[RLike] -> CometRLike,
@@ -580,23 +583,29 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
           case Unsupported(notes) =>
             withInfo(fn, notes.getOrElse(""))
             None
-          case Incompatible(notes) =>
+          case Incompatible(notes, optedInBy) =>
             val exprAllowIncompat = CometConf.isExprAllowIncompat(exprConfName)
-            if (exprAllowIncompat) {
+            val namedConfOptIn = optedInBy.exists(isOptedInVia)
+            if (exprAllowIncompat || namedConfOptIn) {
               if (notes.isDefined) {
-                logWarning(
-                  s"Comet supports $fn when " +
-                    s"${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true " +
-                    s"but has notes: ${notes.get}")
+                val optInDesc = if (namedConfOptIn) {
+                  optedInBy.get
+                } else {
+                  s"${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true"
+                }
+                logWarning(s"Comet supports $fn when $optInDesc but has notes: ${notes.get}")
               }
               aggHandler.convert(aggExpr, fn, inputs, binding, conf)
             } else {
               val optionalNotes = notes.map(str => s" ($str)").getOrElse("")
+              val extraOptIn = optedInBy
+                .map(kv => s" or by setting $kv")
+                .getOrElse("")
               withInfo(
                 fn,
                 s"$fn is not fully compatible with Spark$optionalNotes. To enable it anyway, " +
-                  s"set ${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true. " +
-                  s"${CometConf.COMPAT_GUIDE}.")
+                  s"set ${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true" +
+                  s"$extraOptIn. ${CometConf.COMPAT_GUIDE}.")
               None
             }
           case Compatible(notes) =>
@@ -672,6 +681,21 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
     exprToProtoInternal(newExpr, inputs, binding)
   }
 
+  /**
+   * Checks whether a `key=value` opt-in, as carried by `Incompatible.optedInBy`, is satisfied by
+   * the current SQLConf. The argument is split on its first `=`; the string stored under the
+   * key must equal the expected value, ignoring case. Yields false when the argument has no
+   * `=` or when `getConfString` returns null for the key.
+   */
+  private def isOptedInVia(keyEqualsValue: String): Boolean = {
+    val parts = keyEqualsValue.split("=", 2)
+    // A limit of 2 yields one element when there is no `=`, otherwise exactly two.
+    parts.length == 2 && {
+      val configured = Option(SQLConf.get.getConfString(parts(0), null))
+      configured.exists(_.equalsIgnoreCase(parts(1)))
+    }
+  }
+
   /**
    * Convert a Spark expression to a protocol-buffer representation of a native Comet/DataFusion
    * expression.
@@ -705,23 +729,29 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
         case Unsupported(notes) =>
           withInfo(expr, notes.getOrElse(""))
           None
-        case Incompatible(notes) =>
+        case Incompatible(notes, optedInBy) =>
           val exprAllowIncompat = CometConf.isExprAllowIncompat(exprConfName)
-          if (exprAllowIncompat) {
+          val namedConfOptIn = optedInBy.exists(isOptedInVia)
+          if (exprAllowIncompat || namedConfOptIn) {
             if (notes.isDefined) {
-              logWarning(
-                s"Comet supports $expr when " +
-                  s"${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true " +
-                  s"but has notes: ${notes.get}")
+              val optInDesc = if (namedConfOptIn) {
+                optedInBy.get
+              } else {
+                s"${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true"
+              }
+              logWarning(s"Comet supports $expr when $optInDesc but has notes: ${notes.get}")
             }
             handler.convert(expr, inputs, binding)
           } else {
             val optionalNotes = notes.map(str => s" ($str)").getOrElse("")
+            val extraOptIn = optedInBy
+              .map(kv => s" or by setting $kv")
+              .getOrElse("")
             withInfo(
               expr,
               s"$expr is not fully compatible with Spark$optionalNotes. To enable it anyway, " +
-                s"set ${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true. " +
-                s"${CometConf.COMPAT_GUIDE}.")
+                s"set ${CometConf.getExprAllowIncompatConfigKey(exprConfName)}=true" +
+                s"$extraOptIn. ${CometConf.COMPAT_GUIDE}.")
             None
           }
         case Compatible(notes) =>