From 7dcfac5787fa25db02a2b90bd32016a7431ad63f Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 1 Jun 2026 09:23:47 +0800 Subject: [PATCH 1/3] feat(lua): lighten keep_origin provenance --- README.md | 23 ++++++++ lua/qjson/table.lua | 76 +++++++++++++-------------- tests/lua/origin_materialize_spec.lua | 56 +++++++++++++++----- 3 files changed, 104 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 964e2ab..ff2a966 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,29 @@ lazy proxy directly to `cjson.encode` (cjson bypasses metamethods in C); use `qjson.encode` instead, or call `qjson.materialize(t)` to get a plain Lua table that any third-party encoder can handle. +`qjson.materialize(t, { keep_origin = true })` keeps lightweight provenance on +the returned plain Lua tables so `qjson.encode` can preserve key order and +reuse selected original tokens. Recording is intentionally threshold-based: + +- String children are recorded only when their raw JSON token (including + quotes) is longer than 24 bytes. +- Table children are recorded in the parent only when the child origin is + complete and its raw subtree span is longer than 64 bytes. +- Numbers, booleans, null, and short strings are not recorded. + +Each recorded container tracks whether its provenance is complete: + +- `complete = true`: every child needed to prove byte-for-byte identity is + recorded, so an unchanged container can be emitted as the original slice. +- `complete = false`: provenance is partial. Objects still preserve original + key order for existing keys and can reuse recorded large children, but arrays + fall back to normal array/object encoding. + +Because materialized tables are ordinary Lua tables (no dirty-tracking +metatable), `keep_origin` with partial provenance preserves JSON-equivalent +output rather than guaranteeing byte-identical re-emission of every unchanged +small token. + **Native `next` caveat.** `next(t)` is not proxy-aware: it bypasses the `__pairs` / `__ipairs` hooks and may see qjson implementation fields instead of JSON fields. Do not use native `next` to iterate a lazy proxy or test whether it diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index e85dc93..2a23df0 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -30,6 +30,8 @@ local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" }) -- Weak side-table for keep_origin materialization metadata. -- Maps materialized table -> provenance record used by qjson.encode. local TABLE_ORIGIN = setmetatable({}, { __mode = "k" }) +local ORIGIN_STRING_MIN_RAW = 24 +local ORIGIN_TABLE_MIN_RAW = 64 -- Box scratch used for one-shot FFI returns. Reused across calls to avoid -- per-call allocation; safe because the parent Doc / lazy view holds the @@ -556,21 +558,25 @@ local function cursor_raw_token(ctx, cursor) return ctx._doc._hold:sub(bs + 1, be), bs, be end -local function scalar_origin_record(v, raw_token) - if rawequal(v, _M.null) then - return { tag = "null", raw = raw_token } - end +local function origin_child_record(v, raw_token) local tv = type(v) if tv == "string" then - return { tag = "string", value = v, raw = raw_token } - end - if tv == "number" then - return { tag = "number", value = v, raw = raw_token } + if #raw_token > ORIGIN_STRING_MIN_RAW then + return { tag = "string", value = v, raw = raw_token }, true + end + return nil, false end - if tv == "boolean" then - return { tag = "boolean", value = v, raw = raw_token } + if tv == "table" then + local child_origin = TABLE_ORIGIN[v] + if child_origin ~= nil + and child_origin.complete == true + and (child_origin.be - child_origin.bs) > ORIGIN_TABLE_MIN_RAW + then + return { tag = "table", origin = child_origin }, true + end + return nil, false end - return nil + return nil, false end local materialize_with_origin @@ -581,6 +587,7 @@ local function materialize_object_with_origin(view) local records = {} local seen = {} local had_duplicates = false + local complete = true local it = new_object_iter(view) while true do @@ -614,16 +621,16 @@ local function materialize_object_with_origin(view) local materialized_child = materialize_with_origin(child) out[key] = materialized_child - local record = scalar_origin_record(materialized_child, raw_token) - local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil - if record == nil and child_origin ~= nil then - record = { tag = "table", origin = child_origin } + local record, captured = origin_child_record(materialized_child, raw_token) + if not captured then + complete = false end records[key] = record end TABLE_ORIGIN[out] = { kind = "object", + complete = complete, source = view._doc._hold, bs = view._bs, be = view._be, @@ -638,6 +645,7 @@ end local function materialize_array_with_origin(view) local out = {} local records = {} + local complete = true local i = 0 while true do local rc = C.qjson_cursor_index(view._cur, i, child_box) @@ -650,10 +658,9 @@ local function materialize_array_with_origin(view) local materialized_child = materialize_with_origin(child) out[idx] = materialized_child - local record = scalar_origin_record(materialized_child, raw_token) - local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil - if record == nil and child_origin ~= nil then - record = { tag = "table", origin = child_origin } + local record, captured = origin_child_record(materialized_child, raw_token) + if not captured then + complete = false end records[idx] = record i = idx @@ -663,6 +670,7 @@ local function materialize_array_with_origin(view) end TABLE_ORIGIN[out] = { kind = "array", + complete = complete, source = view._doc._hold, bs = view._bs, be = view._be, @@ -973,13 +981,7 @@ local function origin_record_matches(record, value, depth, active) return false end local tag = record.tag - if tag == "null" then - return rawequal(value, _M.null) - elseif tag == "boolean" then - return type(value) == "boolean" and value == record.value - elseif tag == "number" then - return type(value) == "number" and value == record.value - elseif tag == "string" then + if tag == "string" then return type(value) == "string" and value == record.value elseif tag == "table" then if type(value) ~= "table" then @@ -1015,6 +1017,9 @@ local function origin_table_slice(origin) end origin_object_fully_matches = function(t, origin, depth, active) + if origin.complete ~= true then + return false + end if origin.had_duplicates then return false end @@ -1036,6 +1041,9 @@ origin_object_fully_matches = function(t, origin, depth, active) end origin_array_fully_matches = function(t, origin, depth, active) + if origin.complete ~= true then + return false + end if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end @@ -1067,18 +1075,8 @@ local function encode_origin_child(value, depth, active, record) then return record.raw end - if record.tag == "null" and rawequal(value, _M.null) then - return record.raw - end - if record.tag == "boolean" - and type(value) == "boolean" - and value == record.value - then - return record.raw - end end - -- Numeric scalars intentionally do not reuse raw lexical form when a - -- parent container is being walked; use the normal number encoder. + -- Small scalars and incomplete child tables are re-encoded. return encode(value, depth + 1, active) end @@ -1086,7 +1084,7 @@ local function encode_object_with_origin(t, depth, active, origin) if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end - if origin_object_fully_matches(t, origin, depth, active) then + if origin.complete == true and origin_object_fully_matches(t, origin, depth, active) then return origin_table_slice(origin) end @@ -1124,7 +1122,7 @@ local function encode_array_with_origin(t, depth, active, origin) if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end - if origin_array_fully_matches(t, origin, depth, active) then + if origin.complete == true and origin_array_fully_matches(t, origin, depth, active) then return origin_table_slice(origin) end local kind, max = classify_plain_table(t) diff --git a/tests/lua/origin_materialize_spec.lua b/tests/lua/origin_materialize_spec.lua index cd6c9b4..7a11b55 100644 --- a/tests/lua/origin_materialize_spec.lua +++ b/tests/lua/origin_materialize_spec.lua @@ -1,5 +1,7 @@ local qjson = require("qjson") local cjson = require("cjson") +local LONG_ESC_A = "\\u0061\\u0062\\u0063\\u0064\\u0065" +local LONG_ESC_B = "\\u0066\\u0067\\u0068\\u0069\\u006A" describe("qjson.materialize keep_origin", function() it("keeps default materialize semantics when keep_origin is not set", function() @@ -25,11 +27,19 @@ describe("qjson.materialize keep_origin", function() end, "qjson.materialize: opts.keep_origin must be a boolean") end) - it("reuses unchanged escaped string token when parent is changed", function() + it("does not guarantee reuse for short escaped strings when parent is changed", function() local t = qjson.materialize(qjson.decode('{"blob":"\\u0061","x":1}'), { keep_origin = true }) t.x = 2 - assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + assert.are.equal('{"blob":"a","x":2}', qjson.encode(t)) + end) + + it("reuses unchanged escaped string token when raw token is above threshold", function() + local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"blob":"' .. LONG_ESC_A .. '","x":2}', qjson.encode(t)) end) it("falls back to normal escaping for changed string children", function() @@ -39,15 +49,20 @@ describe("qjson.materialize keep_origin", function() assert.are.equal('{"blob":"line1\\nline2","x":1}', qjson.encode(t)) end) - it("reuses unchanged nested object and array siblings when parent is changed", function() - local src = '{"x":0,"obj":{"k":"\\u0061"},"arr":[1, 2 ,3]}' + it("re-emits small-scalar containers field-by-field when unmodified", function() + local src = '{ "n":1.0, "s":"\\u0061", "b":true, "u":null }' local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) - t.x = 9 - local out = qjson.encode(t) - assert.is_truthy(string.find(out, '"obj":{"k":"\\u0061"}', 1, true)) - assert.is_truthy(string.find(out, '"arr":[1, 2 ,3]', 1, true)) - assert.are.equal(9, cjson.decode(out).x) + + assert.are.equal('{"n":1,"s":"a","b":true,"u":null}', out) + assert.are_not.equal(src, out) + end) + + it("returns original slice for unmodified containers with complete large children", function() + local src = '{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal(src, qjson.encode(t)) end) it("does not reintroduce duplicate keys after materialization", function() @@ -67,13 +82,20 @@ describe("qjson.materialize keep_origin", function() assert.are.equal('{"n":1,"e":1000,"z":0,"x":2}', qjson.encode(t)) end) - it("does not hide nested table mutations behind a parent raw slice", function() + it("partial origins do not hide nested table mutations behind a parent raw slice", function() local t = qjson.materialize(qjson.decode('{"a":{"x":1},"b":2}'), { keep_origin = true }) t.a.x = 9 assert.are.equal('{"a":{"x":9},"b":2}', qjson.encode(t)) end) + it("falls back to normal array/object classification for incomplete arrays", function() + local src = '[ 1 , 2 , 3 ]' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal("[1,2,3]", qjson.encode(t)) + end) + it("still reports circular references after materialization", function() local t = qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = true }) t.self = t @@ -128,13 +150,23 @@ describe("qjson.materialize keep_origin", function() it("keeps source bytes alive for provenance-backed reuse", function() local function materialized() - local src = '{"blob":"\\u0061","x":1}' + local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}' return qjson.materialize(qjson.decode(src), { keep_origin = true }) end local t = materialized() collectgarbage("collect") t.x = 2 - assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + assert.are.equal('{"blob":"' .. LONG_ESC_A .. '","x":2}', qjson.encode(t)) + end) + + it("reuses large complete child subtrees when parent is modified", function() + local src = '{"x":0,"big": { "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }}' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + t.x = 9 + + local out = qjson.encode(t) + assert.are.equal(9, cjson.decode(out).x) + assert.is_truthy(string.find(out, '"big":{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }', 1, true)) end) end) From baf513384af9b7536d506ae98fd460a2c89beffe Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 1 Jun 2026 15:04:10 +0800 Subject: [PATCH 2/3] fix(lua): avoid eager origin token slicing --- lua/qjson/table.lua | 19 +++++------ tests/lua/origin_materialize_spec.lua | 45 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index 2a23df0..381e683 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -550,19 +550,20 @@ local function materialize_plain(v) return v end -local function cursor_raw_token(ctx, cursor) +local function cursor_token_span(ctx, cursor) local rc = C.qjson_cursor_bytes(cursor, sz_a, sz_b) check(ctx, rc) local bs = tonumber(sz_a[0]) local be = tonumber(sz_b[0]) - return ctx._doc._hold:sub(bs + 1, be), bs, be + return bs, be end -local function origin_child_record(v, raw_token) +local function origin_child_record(v, source, bs, be) + local raw_len = be - bs local tv = type(v) if tv == "string" then - if #raw_token > ORIGIN_STRING_MIN_RAW then - return { tag = "string", value = v, raw = raw_token }, true + if raw_len > ORIGIN_STRING_MIN_RAW then + return { tag = "string", value = v, raw = source:sub(bs + 1, be) }, true end return nil, false end @@ -603,7 +604,7 @@ local function materialize_object_with_origin(view) had_duplicates = true end - local raw_token = cursor_raw_token(view, child_box[0]) + local bs, be = cursor_token_span(view, child_box[0]) local child if count == 1 then local cached = cached_child(view, key) @@ -621,7 +622,7 @@ local function materialize_object_with_origin(view) local materialized_child = materialize_with_origin(child) out[key] = materialized_child - local record, captured = origin_child_record(materialized_child, raw_token) + local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be) if not captured then complete = false end @@ -651,14 +652,14 @@ local function materialize_array_with_origin(view) local rc = C.qjson_cursor_index(view._cur, i, child_box) if rc == QJSON_NOT_FOUND then break end check(view, rc, T_ARR) - local raw_token = cursor_raw_token(view, child_box[0]) + local bs, be = cursor_token_span(view, child_box[0]) local idx = i + 1 local cached = rawget(view, idx) local child = cached or decode_cursor(view, child_box) local materialized_child = materialize_with_origin(child) out[idx] = materialized_child - local record, captured = origin_child_record(materialized_child, raw_token) + local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be) if not captured then complete = false end diff --git a/tests/lua/origin_materialize_spec.lua b/tests/lua/origin_materialize_spec.lua index 7a11b55..d2f6744 100644 --- a/tests/lua/origin_materialize_spec.lua +++ b/tests/lua/origin_materialize_spec.lua @@ -2,6 +2,23 @@ local qjson = require("qjson") local cjson = require("cjson") local LONG_ESC_A = "\\u0061\\u0062\\u0063\\u0064\\u0065" local LONG_ESC_B = "\\u0066\\u0067\\u0068\\u0069\\u006A" +local EXACT_24_ESC = "\\u0061\\u0062abcdefghij" +local EXACT_64_CHILD_VALUE = string.rep("a", 56) + +local function count_string_sub_calls(fn) + local original = string.sub + local calls = 0 + string.sub = function(...) + calls = calls + 1 + return original(...) + end + local ok, err = pcall(fn) + string.sub = original + if not ok then + error(err, 0) + end + return calls +end describe("qjson.materialize keep_origin", function() it("keeps default materialize semantics when keep_origin is not set", function() @@ -34,6 +51,24 @@ describe("qjson.materialize keep_origin", function() assert.are.equal('{"blob":"a","x":2}', qjson.encode(t)) end) + it("does not slice raw tokens for dropped provenance records", function() + local doc = qjson.decode('{"n":1,"b":true,"u":null,"s":"x","arr":[1,2],"obj":{"x":1}}') + local sub_calls = count_string_sub_calls(function() + qjson.materialize(doc, { keep_origin = true }) + end) + + assert.are.equal(0, sub_calls) + end) + + it("does not treat an exact 24-byte string token as above threshold", function() + assert.are.equal(24, #('"' .. EXACT_24_ESC .. '"')) + + local t = qjson.materialize(qjson.decode('{"blob":"' .. EXACT_24_ESC .. '","x":1}'), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"blob":"ababcdefghij","x":2}', qjson.encode(t)) + end) + it("reuses unchanged escaped string token when raw token is above threshold", function() local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}' local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) @@ -65,6 +100,16 @@ describe("qjson.materialize keep_origin", function() assert.are.equal(src, qjson.encode(t)) end) + it("does not treat an exact 64-byte child container as above threshold", function() + local child = '{"a":"' .. EXACT_64_CHILD_VALUE .. '"}' + assert.are.equal(64, #child) + + local src = '{ "child" : ' .. child .. ' }' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal('{"child":' .. child .. '}', qjson.encode(t)) + end) + it("does not reintroduce duplicate keys after materialization", function() local t = qjson.materialize(qjson.decode('{"a":1,"a":2}'), { keep_origin = true }) t.b = 3 From 10453197bab8057587455726efcceec30d4e94a3 Mon Sep 17 00:00:00 2001 From: Yuansheng Wang Date: Mon, 1 Jun 2026 15:08:53 +0800 Subject: [PATCH 3/3] test(lua): keep origin allocation check lint-clean --- tests/lua/origin_materialize_spec.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/lua/origin_materialize_spec.lua b/tests/lua/origin_materialize_spec.lua index d2f6744..536d935 100644 --- a/tests/lua/origin_materialize_spec.lua +++ b/tests/lua/origin_materialize_spec.lua @@ -8,12 +8,12 @@ local EXACT_64_CHILD_VALUE = string.rep("a", 56) local function count_string_sub_calls(fn) local original = string.sub local calls = 0 - string.sub = function(...) + rawset(string, "sub", function(...) calls = calls + 1 return original(...) - end + end) local ok, err = pcall(fn) - string.sub = original + rawset(string, "sub", original) if not ok then error(err, 0) end