From a31ad93ffa89785660fdb63f986f37a37416c2d5 Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Tue, 26 May 2026 15:56:49 +0200 Subject: [PATCH] Invalidate KGClient.cache after writes to prevent stale-data bugs KGClient.cache[uri] was populated on first fetch but never refreshed or invalidated by subsequent writes. After save() PATCHed the server, a following from_id(use_cache=True) would return the original cached document, hiding the change. update_instance, replace_instance, and delete_instance now drop the URI's cache entry on success, so the next fetch goes to the KG. We deliberately do not refresh the cache from the response, because depending on the server's `returnPayload` default the response may be a minimal document rather than the full updated instance. Adds regression tests in test/test_client.py, including an end-to-end test that reproduces the original bug report scenario. Fixes #110. --- fairgraph/client.py | 16 +++++- fairgraph/kgobject.py | 1 - test/test_client.py | 112 ++++++++++++++++++++++++++++++++++++++++++ test/utils.py | 4 ++ 4 files changed, 130 insertions(+), 3 deletions(-) diff --git a/fairgraph/client.py b/fairgraph/client.py index 57cf18ac..59036d0c 100644 --- a/fairgraph/client.py +++ b/fairgraph/client.py @@ -528,7 +528,14 @@ def update_instance(self, instance_id: str, data: JsonLdDocument) -> JsonLdDocum extended_response_configuration=default_response_configuration, ) error_context = f"update_instance(data={data}, instance_id={instance_id})" - return self._check_response(response, error_context=error_context).data + response_data = self._check_response(response, error_context=error_context).data + # The cached document for this URI is now stale. Drop it so the next + # fetch goes to the KG. We deliberately do not refresh the cache from + # the response, because depending on the server's `returnPayload` + # default the response may be a minimal document (e.g. just `@id`) + # rather than the full updated instance. + self.cache.pop(self.uri_from_uuid(instance_id), None) + return response_data def replace_instance(self, instance_id: str, data: JsonLdDocument) -> JsonLdDocument: """ @@ -548,7 +555,10 @@ def replace_instance(self, instance_id: str, data: JsonLdDocument) -> JsonLdDocu extended_response_configuration=default_response_configuration, ) error_context = f"replace_instance(data={data}, instance_id={instance_id})" - return self._check_response(response, error_context=error_context).data + response_data = self._check_response(response, error_context=error_context).data + # See update_instance for rationale: invalidate, don't refresh. + self.cache.pop(self.uri_from_uuid(instance_id), None) + return response_data def delete_instance(self, instance_id: str, ignore_not_found: bool = True, ignore_errors: bool = True): """ @@ -563,6 +573,8 @@ def delete_instance(self, instance_id: str, ignore_not_found: bool = True, ignor if response: # error if not ignore_errors: raise Exception(response.message) + else: + self.cache.pop(self.uri_from_uuid(instance_id), None) return response def uri_from_uuid(self, uuid: str) -> str: diff --git a/fairgraph/kgobject.py b/fairgraph/kgobject.py index 164ba5b5..135fd113 100644 --- a/fairgraph/kgobject.py +++ b/fairgraph/kgobject.py @@ -756,7 +756,6 @@ def save( try: assert self.uuid is not None client.replace_instance(self.uuid, local_data) - # what does this return? Can we use it to update `remote_data`? except AuthorizationError as err: if ignore_auth_errors: logger.error(str(err)) diff --git a/test/test_client.py b/test/test_client.py index ea9894b4..bbc42d19 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -234,3 +234,115 @@ def test_delete_instance(kg_client, mocker): fake_id = "00000000-0000-0000-0000-000000000000" response = kg_client.delete_instance(fake_id) kg_client._kg_client.instances.delete.assert_called_once_with(fake_id) + + +@pytest.fixture +def offline_kg_client(mocker): + """A KGClient that can be constructed without network access, for testing + behaviour that doesn't require a real KG. The underlying kg-core SDK methods + must be patched per-test.""" + from fairgraph.client import KGClient + + client = KGClient(token="fake-token", allow_interactive=False) + # Skip the feature-detection fetch that the `migrated` property triggers. + client._migrated = True + # `instance_from_full_uri` uses this to build the cache key after writes + mocker.patch.object( + client._kg_client.instances._kg_config, + "id_namespace", + "https://kg.ebrains.eu/api/instances/", + create=True, + ) + return client + + +class TestCacheInvalidationOnWrite: + """Regression tests for the bug where writes left stale entries in + `client.cache`, causing subsequent `from_id(use_cache=True)` calls to + return out-of-date data and `save()` to no-op on what looked like a + legitimate modification. See issue #110.""" + + uuid = "00000000-0000-0000-0000-000000000000" + uri = "https://kg.ebrains.eu/api/instances/00000000-0000-0000-0000-000000000000" + + def test_update_instance_invalidates_cache(self, offline_kg_client, mocker): + offline_kg_client.cache[self.uri] = {"@id": self.uri, "stale": True} + mocker.patch.object( + offline_kg_client._kg_client.instances, + "contribute_to_partial_replacement", + lambda **kw: MockKGResponse({"@id": self.uri}), + ) + offline_kg_client.update_instance(self.uuid, {"some": "patch"}) + assert self.uri not in offline_kg_client.cache + + def test_replace_instance_invalidates_cache(self, offline_kg_client, mocker): + offline_kg_client.cache[self.uri] = {"@id": self.uri, "stale": True} + mocker.patch.object( + offline_kg_client._kg_client.instances, + "contribute_to_full_replacement", + lambda **kw: MockKGResponse({"@id": self.uri}), + ) + offline_kg_client.replace_instance(self.uuid, {"some": "data"}) + assert self.uri not in offline_kg_client.cache + + def test_delete_instance_invalidates_cache(self, offline_kg_client, mocker): + offline_kg_client.cache[self.uri] = {"@id": self.uri} + mocker.patch.object(offline_kg_client._kg_client.instances, "delete", return_value=None) + offline_kg_client.delete_instance(self.uuid) + assert self.uri not in offline_kg_client.cache + + def test_unlink_after_refetch_sends_patch(self, offline_kg_client, mocker): + """End-to-end: this is the user-visible bug. Load a DatasetVersion, + link a subject, save; re-load it via `from_id`, set the link back to + `None`, save again — the second save must PATCH studiedSpecimen=None, + not be a silent no-op.""" + from fairgraph.openminds.core import DatasetVersion, Subject + + sub_uri = "https://kg.ebrains.eu/api/instances/00000000-0000-0000-0000-000000000abc" + studied_specimen_path = "https://openminds.om-i.org/props/studiedSpecimen" + # Server-side state of the DSV, mutated by each PATCH so subsequent + # `instance_from_full_uri` calls see fresh data. + server_state = { + "@id": self.uri, + "@type": ["https://openminds.om-i.org/types/DatasetVersion"], + "http://schema.org/identifier": [self.uri], + "https://core.kg.ebrains.eu/vocab/meta/space": "myspace", + } + + def get_by_id(stage, instance_id, extended_response_configuration): + return MockKGResponse(dict(server_state)) + + def contribute_to_partial_replacement(instance_id, payload, extended_response_configuration): + for key, value in payload.items(): + if value is None: + server_state.pop(key, None) + else: + server_state[key] = value + return MockKGResponse(dict(server_state)) + + mocker.patch.object(offline_kg_client._kg_client.instances, "get_by_id", get_by_id) + mocker.patch.object( + offline_kg_client._kg_client.instances, + "contribute_to_partial_replacement", + contribute_to_partial_replacement, + ) + + # 1. Load fresh, link a subject, save. + dsv = DatasetVersion.from_id(self.uuid, offline_kg_client, scope="any") + dsv.studied_specimens = [Subject(id=sub_uri)] + dsv.save(offline_kg_client, space="myspace", recursive=False) + assert studied_specimen_path in server_state, "first save should have linked the subject" + + # 2. Re-fetch via from_id. Before the fix, this would have returned + # stale cached data with no studiedSpecimen. + dsv2 = DatasetVersion.from_id(self.uuid, offline_kg_client, scope="any") + assert dsv2.studied_specimens is not None, ( + "re-fetched object must see the link added by the prior save" + ) + + # 3. Unlink and save. The PATCH must clear studiedSpecimen on the server. + dsv2.studied_specimens = None + dsv2.save(offline_kg_client, space="myspace", recursive=False) + assert studied_specimen_path not in server_state, ( + "second save should have sent a PATCH that cleared studiedSpecimen" + ) diff --git a/test/utils.py b/test/utils.py index d6271547..c8c83f08 100644 --- a/test/utils.py +++ b/test/utils.py @@ -63,6 +63,7 @@ class MockKGClient: def __init__(self): self.instances = {} + self.cache = {} def retrieve_query(self, query_label): return {"@id": f"mock-query-{query_label}"} @@ -175,6 +176,9 @@ def replace_instance(self, instance_id, data): assert instance_id is not None assert data is not None + def uri_from_uuid(self, uuid): + return f"https://kg.ebrains.eu/api/instances/{uuid}" + @pytest.fixture def mock_client():