From 239ed323ccc92dcd1713a5c0b372b7036dd70a79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 14 Apr 2026 21:03:13 +0100 Subject: [PATCH 1/8] feat: add tokenizer module with sync and async support, including integration tests --- integration/test_tokenize.py | 355 ++++++++++++++++++++++++++++++++++ weaviate/__init__.py | 2 + weaviate/client.py | 3 + weaviate/client.pyi | 3 + weaviate/tokenize/__init__.py | 7 + weaviate/tokenize/async_.py | 8 + weaviate/tokenize/executor.py | 166 ++++++++++++++++ weaviate/tokenize/sync.py | 8 + weaviate/tokenize/types.py | 25 +++ 9 files changed, 577 insertions(+) create mode 100644 integration/test_tokenize.py create mode 100644 weaviate/tokenize/__init__.py create mode 100644 weaviate/tokenize/async_.py create mode 100644 weaviate/tokenize/executor.py create mode 100644 weaviate/tokenize/sync.py create mode 100644 weaviate/tokenize/types.py diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py new file mode 100644 index 000000000..e54f9d49d --- /dev/null +++ b/integration/test_tokenize.py @@ -0,0 +1,355 @@ +"""Integration tests for the tokenize module. 
+ +These tests cover the client's responsibilities: +- Correct serialization of inputs (enums, _TextAnalyzerConfigCreate, kwargs) +- Correct deserialization of responses into typed objects +- Client-side validation (_TextAnalyzerConfigCreate rejects invalid input) +- Both sync and async client paths +""" + +from typing import AsyncGenerator, Generator + +import pytest +import pytest_asyncio + +import weaviate +from weaviate.collections.classes.config import ( + StopwordsConfig, + StopwordsPreset, + TextAnalyzerConfig, + Tokenization, + _StopwordsCreate, + _TextAnalyzerConfigCreate, +) +from weaviate.config import AdditionalConfig +from weaviate.tokenize.types import TokenizeResult + + +@pytest.fixture(scope="module") +def client() -> Generator[weaviate.WeaviateClient, None, None]: + c = weaviate.connect_to_local( + additional_config=AdditionalConfig(timeout=(60, 120)), + ) + yield c + c.close() + + +@pytest_asyncio.fixture +async def async_client() -> AsyncGenerator[weaviate.WeaviateAsyncClient, None]: + c = weaviate.use_async_with_local( + additional_config=AdditionalConfig(timeout=(60, 120)), + ) + await c.connect() + yield c + await c.close() + + +# --------------------------------------------------------------------------- +# Serialization: enums, strings, kwargs, _TextAnalyzerConfigCreate +# --------------------------------------------------------------------------- + + +class TestSerialization: + """Verify the client correctly serializes different input forms.""" + + @pytest.mark.parametrize( + "tokenization,text,expected_tokens", + [ + (Tokenization.WORD, "The quick brown fox", ["the", "quick", "brown", "fox"]), + (Tokenization.LOWERCASE, "Hello World Test", ["hello", "world", "test"]), + (Tokenization.WHITESPACE, "Hello World Test", ["Hello", "World", "Test"]), + (Tokenization.FIELD, " Hello World ", ["Hello World"]), + (Tokenization.TRIGRAM, "Hello", ["hel", "ell", "llo"]), + ], + ) + def test_tokenization_enum( + self, + client: weaviate.WeaviateClient, + 
tokenization: Tokenization, + text: str, + expected_tokens: list, + ) -> None: + result = client.tokenize.text(text=text, tokenization=tokenization) + assert isinstance(result, TokenizeResult) + assert result.tokenization == tokenization.value + assert result.indexed == expected_tokens + assert result.query == expected_tokens + + def test_tokenization_string(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text(text="hello world", tokenization="word") + assert result.tokenization == "word" + assert result.indexed == ["hello", "world"] + + def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + text="The quick brown fox", + tokenization=Tokenization.WORD, + stopword_preset=StopwordsPreset.EN, + ) + assert "the" not in result.query + assert "quick" in result.query + + def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + text="The quick brown fox", + tokenization=Tokenization.WORD, + stopword_preset="en", + ) + assert "the" not in result.query + + def test_ascii_fold_via_kwargs(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + text="L'école est fermée", + tokenization=Tokenization.WORD, + ascii_fold=True, + ) + assert result.indexed == ["l", "ecole", "est", "fermee"] + + def test_ascii_fold_via_analyzer_config(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True) + result = client.tokenize.text( + text="L'école est fermée", + tokenization=Tokenization.WORD, + analyzer_config=cfg, + ) + assert result.indexed == ["l", "ecole", "est", "fermee"] + + def test_analyzer_config_and_kwargs_produce_same_result( + self, client: weaviate.WeaviateClient + ) -> None: + """analyzer_config object and equivalent kwargs must produce identical output.""" + cfg = _TextAnalyzerConfigCreate( + ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN + 
) + via_config = client.tokenize.text( + text="L'école est fermée", + tokenization=Tokenization.WORD, + analyzer_config=cfg, + ) + via_kwargs = client.tokenize.text( + text="L'école est fermée", + tokenization=Tokenization.WORD, + ascii_fold=True, + ascii_fold_ignore=["é"], + stopword_preset=StopwordsPreset.EN, + ) + assert via_config.indexed == via_kwargs.indexed + assert via_config.query == via_kwargs.query + + def test_stopword_presets_serialization(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + text="hello world test", + tokenization=Tokenization.WORD, + stopword_preset="custom", + stopword_presets={ + "custom": _StopwordsCreate(preset=None, additions=["test"], removals=None), + }, + ) + assert result.indexed == ["hello", "world", "test"] + assert result.query == ["hello", "world"] + + def test_stopword_presets_with_base_and_removals(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + text="the quick", + tokenization=Tokenization.WORD, + stopword_preset="en-no-the", + stopword_presets={ + "en-no-the": _StopwordsCreate( + preset=StopwordsPreset.EN, additions=None, removals=["the"] + ), + }, + ) + assert result.indexed == ["the", "quick"] + assert result.query == ["the", "quick"] + + +# --------------------------------------------------------------------------- +# Deserialization: typed response fields +# --------------------------------------------------------------------------- + + +class TestDeserialization: + """Verify the client correctly deserializes response fields into typed objects.""" + + def test_result_type(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text(text="hello", tokenization=Tokenization.WORD) + assert isinstance(result, TokenizeResult) + assert isinstance(result.indexed, list) + assert isinstance(result.query, list) + + def test_analyzer_config_deserialized(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text( + 
text="L'école", + tokenization=Tokenization.WORD, + ascii_fold=True, + ascii_fold_ignore=["é"], + stopword_preset=StopwordsPreset.EN, + ) + assert isinstance(result.analyzer_config, TextAnalyzerConfig) + assert result.analyzer_config.ascii_fold is True + assert result.analyzer_config.ascii_fold_ignore == ["é"] + assert result.analyzer_config.stopword_preset == "en" + + def test_no_analyzer_config_returns_none(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenize.text(text="hello", tokenization=Tokenization.WORD) + assert result.analyzer_config is None + + def test_stopword_config_deserialized_on_property( + self, client: weaviate.WeaviateClient + ) -> None: + """Property endpoint returns stopwordConfig; verify it deserializes to StopwordsConfig.""" + client.collections.delete("TestDeserStopword") + try: + client.collections.create_from_dict( + { + "class": "TestDeserStopword", + "vectorizer": "none", + "properties": [ + { + "name": "title", + "dataType": ["text"], + "tokenization": "word", + "textAnalyzer": {"stopwordPreset": "en"}, + }, + ], + } + ) + result = client.tokenize.property( + collection_name="TestDeserStopword", + property_name="title", + text="the quick", + ) + assert isinstance(result, TokenizeResult) + assert result.tokenization == "word" + # Stopword config should be deserialized when present + if result.stopword_config is not None: + assert isinstance(result.stopword_config, StopwordsConfig) + finally: + client.collections.delete("TestDeserStopword") + + def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: + client.collections.delete("TestDeserPropTypes") + try: + client.collections.create_from_dict( + { + "class": "TestDeserPropTypes", + "vectorizer": "none", + "properties": [ + { + "name": "tag", + "dataType": ["text"], + "tokenization": "field", + }, + ], + } + ) + result = client.tokenize.property( + collection_name="TestDeserPropTypes", + property_name="tag", + text=" Hello World ", + ) + assert 
isinstance(result, TokenizeResult) + assert result.tokenization == "field" + assert result.indexed == ["Hello World"] + finally: + client.collections.delete("TestDeserPropTypes") + + +# --------------------------------------------------------------------------- +# Client-side validation (_TextAnalyzerConfigCreate) +# --------------------------------------------------------------------------- + + +class TestClientSideValidation: + """Verify that _TextAnalyzerConfigCreate rejects invalid input before hitting the server.""" + + def test_ascii_fold_ignore_without_fold_raises(self) -> None: + with pytest.raises(ValueError, match="asciiFoldIgnore"): + _TextAnalyzerConfigCreate(ascii_fold=False, ascii_fold_ignore=["é"]) + + def test_ascii_fold_ignore_without_fold_default_raises(self) -> None: + with pytest.raises(ValueError, match="asciiFoldIgnore"): + _TextAnalyzerConfigCreate(ascii_fold_ignore=["é"]) + + def test_valid_config_does_not_raise(self) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é", "ñ"]) + assert cfg.asciiFold is True + assert cfg.asciiFoldIgnore == ["é", "ñ"] + + def test_fold_without_ignore_is_valid(self) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True) + assert cfg.asciiFold is True + assert cfg.asciiFoldIgnore is None + + def test_stopword_preset_only_is_valid(self) -> None: + cfg = _TextAnalyzerConfigCreate(stopword_preset="en") + assert cfg.stopwordPreset == "en" + + def test_empty_config_is_valid(self) -> None: + cfg = _TextAnalyzerConfigCreate() + assert cfg.asciiFold is None + assert cfg.asciiFoldIgnore is None + assert cfg.stopwordPreset is None + + +# --------------------------------------------------------------------------- +# Async client +# --------------------------------------------------------------------------- + + +class TestAsyncClient: + """Verify both text() and property() work through the async client.""" + + @pytest.mark.asyncio + async def test_text_tokenize(self, async_client: 
weaviate.WeaviateAsyncClient) -> None: + result = await async_client.tokenize.text( + text="The quick brown fox", + tokenization=Tokenization.WORD, + ) + assert isinstance(result, TokenizeResult) + assert result.indexed == ["the", "quick", "brown", "fox"] + + @pytest.mark.asyncio + async def test_text_with_analyzer_config( + self, async_client: weaviate.WeaviateAsyncClient + ) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True, stopword_preset=StopwordsPreset.EN) + result = await async_client.tokenize.text( + text="L'école est fermée", + tokenization=Tokenization.WORD, + analyzer_config=cfg, + ) + assert result.indexed == ["l", "ecole", "est", "fermee"] + assert isinstance(result.analyzer_config, TextAnalyzerConfig) + assert result.analyzer_config.ascii_fold is True + + @pytest.mark.asyncio + async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: + await async_client.collections.delete("TestAsyncPropTokenize") + try: + await async_client.collections.create_from_dict( + { + "class": "TestAsyncPropTokenize", + "vectorizer": "none", + "properties": [ + { + "name": "title", + "dataType": ["text"], + "tokenization": "word", + "textAnalyzer": {"stopwordPreset": "en"}, + }, + ], + } + ) + result = await async_client.tokenize.property( + collection_name="TestAsyncPropTokenize", + property_name="title", + text="The quick brown fox", + ) + assert isinstance(result, TokenizeResult) + assert result.tokenization == "word" + assert result.indexed == ["the", "quick", "brown", "fox"] + assert "the" not in result.query + assert "quick" in result.query + finally: + await async_client.collections.delete("TestAsyncPropTokenize") diff --git a/weaviate/__init__.py b/weaviate/__init__.py index 562b142bc..6fd9368ea 100644 --- a/weaviate/__init__.py +++ b/weaviate/__init__.py @@ -21,6 +21,7 @@ embedded, exceptions, outputs, + tokenize, types, ) from .client import Client, WeaviateAsyncClient, WeaviateClient @@ -67,6 +68,7 @@ "embedded", 
"exceptions", "outputs", + "tokenize", "types", "use_async_with_custom", "use_async_with_embedded", diff --git a/weaviate/client.py b/weaviate/client.py index d7f9080f4..cbd12be9a 100644 --- a/weaviate/client.py +++ b/weaviate/client.py @@ -22,6 +22,7 @@ from .embedded import EmbeddedOptions from .groups import _Groups, _GroupsAsync from .rbac import _Roles, _RolesAsync +from .tokenize import _Tokenize, _TokenizeAsync from .types import NUMBER from .users import _Users, _UsersAsync @@ -82,6 +83,7 @@ def __init__( self.debug = _DebugAsync(self._connection) self.groups = _GroupsAsync(self._connection) self.roles = _RolesAsync(self._connection) + self.tokenize = _TokenizeAsync(self._connection) self.users = _UsersAsync(self._connection) async def __aenter__(self) -> "WeaviateAsyncClient": @@ -157,6 +159,7 @@ def __init__( self.debug = _Debug(self._connection) self.groups = _Groups(self._connection) self.roles = _Roles(self._connection) + self.tokenize = _Tokenize(self._connection) self.users = _Users(self._connection) def __enter__(self) -> "WeaviateClient": diff --git a/weaviate/client.pyi b/weaviate/client.pyi index 9b32af15f..a6a44f8f7 100644 --- a/weaviate/client.pyi +++ b/weaviate/client.pyi @@ -21,6 +21,7 @@ from .cluster import _Cluster, _ClusterAsync from .collections.batch.client import _BatchClientWrapper, _BatchClientWrapperAsync from .debug import _Debug, _DebugAsync from .rbac import _Roles, _RolesAsync +from .tokenize import _Tokenize, _TokenizeAsync from .types import NUMBER TIMEOUT_TYPE = Union[Tuple[NUMBER, NUMBER], NUMBER] @@ -35,6 +36,7 @@ class WeaviateAsyncClient(_WeaviateClientExecutor[ConnectionAsync]): debug: _DebugAsync groups: _GroupsAsync roles: _RolesAsync + tokenize: _TokenizeAsync users: _UsersAsync async def close(self) -> None: ... @@ -58,6 +60,7 @@ class WeaviateClient(_WeaviateClientExecutor[ConnectionSync]): debug: _Debug groups: _Groups roles: _Roles + tokenize: _Tokenize users: _Users def close(self) -> None: ... 
def _as_token_list(tokens: Optional[List[str]]) -> List[str]:
    """Return ``tokens`` unchanged, or an empty list when it is ``None``.

    ``TokenizeResult.indexed`` and ``TokenizeResult.query`` are typed as
    ``List[str]``; this keeps a JSON ``null`` from reaching them as ``None``.
    """
    return [] if tokens is None else tokens


def _parse_analyzer_config(body: Dict[str, Any]) -> Optional[TextAnalyzerConfig]:
    """Build a ``TextAnalyzerConfig`` from the ``analyzerConfig`` entry of a response body.

    Args:
        body: Decoded JSON body of a tokenize response.

    Returns:
        ``None`` when ``analyzerConfig`` is absent or null, or when it carries
        neither ``asciiFold`` nor ``stopwordPreset``. Otherwise a
        ``TextAnalyzerConfig`` in which a missing ``asciiFold`` reads as ``False``.
    """
    raw = body.get("analyzerConfig")
    if raw is None:
        return None
    # An entry holding only ``asciiFoldIgnore`` is not treated as a configuration.
    if "asciiFold" not in raw and "stopwordPreset" not in raw:
        return None
    return TextAnalyzerConfig(
        ascii_fold=raw.get("asciiFold", False),
        ascii_fold_ignore=raw.get("asciiFoldIgnore"),
        stopword_preset=raw.get("stopwordPreset"),
    )


def _parse_stopword_config(body: Dict[str, Any]) -> Optional[StopwordsConfig]:
    """Build a ``StopwordsConfig`` from the ``stopwordConfig`` entry of a response body.

    Args:
        body: Decoded JSON body of a tokenize response.

    Returns:
        ``None`` when ``stopwordConfig`` is absent or null. Otherwise a
        ``StopwordsConfig`` whose preset falls back to ``StopwordsPreset.NONE``
        when the response carries no (or an empty) ``preset``.
    """
    raw = body.get("stopwordConfig")
    if raw is None:
        return None
    preset_name = raw.get("preset")
    # NOTE(review): presumably ``StopwordsPreset(...)`` rejects names it does not
    # define; confirm how a server-side custom preset name is reported here.
    return StopwordsConfig(
        preset=StopwordsPreset(preset_name) if preset_name else StopwordsPreset.NONE,
        additions=raw.get("additions"),
        removals=raw.get("removals"),
    )


def _parse_tokenize_result(body: Dict[str, Any]) -> TokenizeResult:
    """Convert the decoded JSON body of a tokenize response into a ``TokenizeResult``.

    Args:
        body: Decoded JSON body; must contain ``tokenization``, ``indexed`` and ``query``.

    Returns:
        The typed result, with the optional analyzer and stopword configurations
        parsed when present.

    Raises:
        KeyError: If ``tokenization``, ``indexed`` or ``query`` is missing.
    """
    # NOTE(review): assumes the server may encode an empty token list as JSON
    # null - confirm against the server; the normalisation is harmless otherwise.
    return TokenizeResult(
        tokenization=body["tokenization"],
        indexed=_as_token_list(body["indexed"]),
        query=_as_token_list(body["query"]),
        analyzer_config=_parse_analyzer_config(body),
        stopword_config=_parse_stopword_config(body),
    )
class _TokenizeExecutor(Generic[ConnectionType]):
    """Executor behind the client's ``tokenize`` namespace.

    Each method builds the request for one tokenize endpoint and hands it to
    ``executor.execute`` together with the connection's ``post`` method.
    """

    def __init__(self, connection: ConnectionType) -> None:
        self._connection = connection

    def text(
        self,
        text: str,
        tokenization: Union[Tokenization, str],
        *,
        analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
        ascii_fold: Optional[bool] = None,
        ascii_fold_ignore: Optional[List[str]] = None,
        stopword_preset: Optional[Union[StopwordsPreset, str]] = None,
        stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
    ) -> executor.Result[TokenizeResult]:
        """Tokenize text using the generic /v1/tokenize endpoint.

        Analyzer settings can be provided either via a ``_TextAnalyzerConfigCreate``
        object **or** via the individual keyword arguments (``ascii_fold``,
        ``ascii_fold_ignore``, ``stopword_preset``). If ``analyzer_config`` is
        given the individual keyword arguments are ignored.

        Args:
            text: The text to tokenize.
            tokenization: The tokenization method to use (e.g. Tokenization.WORD),
                given either as the enum member or as its string value.
            analyzer_config: A ``_TextAnalyzerConfigCreate`` instance that bundles
                ascii_fold, ascii_fold_ignore, and stopword_preset settings.
            ascii_fold: Whether to fold accented characters to ASCII equivalents.
            ascii_fold_ignore: Characters to exclude from ASCII folding.
            stopword_preset: Stopword preset name to apply for query-time filtering.
            stopword_presets: Custom stopword preset definitions, keyed by name.
                Each value is a ``_StopwordsCreate`` with optional preset, additions,
                and removals fields.

        Returns:
            A TokenizeResult with indexed and query token lists.
        """
        tokenization_str = (
            tokenization.value if isinstance(tokenization, Tokenization) else tokenization
        )

        payload: Dict[str, Any] = {
            "text": text,
            "tokenization": tokenization_str,
        }

        if analyzer_config is not None:
            # The config object wins; the individual keyword arguments are not consulted.
            ac_dict = analyzer_config._to_dict()
            if ac_dict:
                payload["analyzerConfig"] = ac_dict
        else:
            # Only settings the caller actually supplied are sent.
            ac: Dict[str, Any] = {}
            if ascii_fold is not None:
                ac["asciiFold"] = ascii_fold
            if ascii_fold_ignore is not None:
                ac["asciiFoldIgnore"] = ascii_fold_ignore
            if stopword_preset is not None:
                ac["stopwordPreset"] = (
                    stopword_preset.value
                    if isinstance(stopword_preset, StopwordsPreset)
                    else stopword_preset
                )
            if ac:
                payload["analyzerConfig"] = ac

        if stopword_presets is not None:
            payload["stopwordPresets"] = {
                name: cfg._to_dict() for name, cfg in stopword_presets.items()
            }

        def resp(response: Response) -> TokenizeResult:
            return _parse_tokenize_result(response.json())

        return executor.execute(
            response_callback=resp,
            method=self._connection.post,
            path="/tokenize",
            weaviate_object=payload,
            error_msg="Tokenization failed",
            status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize text"),
        )

    def property(
        self,
        collection_name: str,
        property_name: str,
        text: str,
    ) -> executor.Result[TokenizeResult]:
        """Tokenize text using a property's configured tokenization settings.

        Args:
            collection_name: The collection (class) name.
            property_name: The property name whose tokenization config to use.
            text: The text to tokenize.

        Returns:
            A TokenizeResult with indexed and query token lists.
        """
        # Function-scope import so the module's import block stays untouched.
        from urllib.parse import quote

        # Both names come from the caller and end up inside the URL path.
        # Escaping them keeps characters such as "/", "?" or "#" from changing
        # which route is hit; names made of letters, digits and "_" are unchanged.
        path = (
            f"/schema/{quote(collection_name, safe='')}"
            f"/properties/{quote(property_name, safe='')}/tokenize"
        )

        payload: Dict[str, Any] = {"text": text}

        def resp(response: Response) -> TokenizeResult:
            return _parse_tokenize_result(response.json())

        return executor.execute(
            response_callback=resp,
            method=self._connection.post,
            path=path,
            weaviate_object=payload,
            error_msg="Property tokenization failed",
            status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize property text"),
        )
+ """ + + tokenization: str + indexed: List[str] + query: List[str] + analyzer_config: Optional[TextAnalyzerConfig] = field(default=None) + stopword_config: Optional[StopwordsConfig] = field(default=None) From 8b2caaf7356223370b3eba7665d7c5e59c685be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Wed, 15 Apr 2026 10:39:17 +0100 Subject: [PATCH 2/8] refactor: names don't shadow existing --- integration/test_tokenize.py | 38 +++++++++---------- weaviate/__init__.py | 4 +- weaviate/client.py | 6 +-- weaviate/client.pyi | 6 +-- weaviate/tokenization/__init__.py | 7 ++++ weaviate/tokenization/async_.py | 8 ++++ .../{tokenize => tokenization}/executor.py | 6 +-- .../types.py => tokenization/models.py} | 0 weaviate/tokenization/sync.py | 8 ++++ weaviate/tokenize/__init__.py | 7 ---- weaviate/tokenize/async_.py | 8 ---- weaviate/tokenize/sync.py | 8 ---- 12 files changed, 53 insertions(+), 53 deletions(-) create mode 100644 weaviate/tokenization/__init__.py create mode 100644 weaviate/tokenization/async_.py rename weaviate/{tokenize => tokenization}/executor.py (97%) rename weaviate/{tokenize/types.py => tokenization/models.py} (100%) create mode 100644 weaviate/tokenization/sync.py delete mode 100644 weaviate/tokenize/__init__.py delete mode 100644 weaviate/tokenize/async_.py delete mode 100644 weaviate/tokenize/sync.py diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index e54f9d49d..b3ecff875 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -22,7 +22,7 @@ _TextAnalyzerConfigCreate, ) from weaviate.config import AdditionalConfig -from weaviate.tokenize.types import TokenizeResult +from weaviate.tokenization.models import TokenizeResult @pytest.fixture(scope="module") @@ -69,19 +69,19 @@ def test_tokenization_enum( text: str, expected_tokens: list, ) -> None: - result = client.tokenize.text(text=text, tokenization=tokenization) + result = client.tokenization.text(text=text, 
tokenization=tokenization) assert isinstance(result, TokenizeResult) assert result.tokenization == tokenization.value assert result.indexed == expected_tokens assert result.query == expected_tokens def test_tokenization_string(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text(text="hello world", tokenization="word") + result = client.tokenization.text(text="hello world", tokenization="word") assert result.tokenization == "word" assert result.indexed == ["hello", "world"] def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="The quick brown fox", tokenization=Tokenization.WORD, stopword_preset=StopwordsPreset.EN, @@ -90,7 +90,7 @@ def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: assert "quick" in result.query def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="The quick brown fox", tokenization=Tokenization.WORD, stopword_preset="en", @@ -98,7 +98,7 @@ def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: assert "the" not in result.query def test_ascii_fold_via_kwargs(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="L'école est fermée", tokenization=Tokenization.WORD, ascii_fold=True, @@ -107,7 +107,7 @@ def test_ascii_fold_via_kwargs(self, client: weaviate.WeaviateClient) -> None: def test_ascii_fold_via_analyzer_config(self, client: weaviate.WeaviateClient) -> None: cfg = _TextAnalyzerConfigCreate(ascii_fold=True) - result = client.tokenize.text( + result = client.tokenization.text( text="L'école est fermée", tokenization=Tokenization.WORD, analyzer_config=cfg, @@ -121,12 +121,12 @@ def test_analyzer_config_and_kwargs_produce_same_result( cfg = _TextAnalyzerConfigCreate( ascii_fold=True, 
ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN ) - via_config = client.tokenize.text( + via_config = client.tokenization.text( text="L'école est fermée", tokenization=Tokenization.WORD, analyzer_config=cfg, ) - via_kwargs = client.tokenize.text( + via_kwargs = client.tokenization.text( text="L'école est fermée", tokenization=Tokenization.WORD, ascii_fold=True, @@ -137,7 +137,7 @@ def test_analyzer_config_and_kwargs_produce_same_result( assert via_config.query == via_kwargs.query def test_stopword_presets_serialization(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="hello world test", tokenization=Tokenization.WORD, stopword_preset="custom", @@ -149,7 +149,7 @@ def test_stopword_presets_serialization(self, client: weaviate.WeaviateClient) - assert result.query == ["hello", "world"] def test_stopword_presets_with_base_and_removals(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="the quick", tokenization=Tokenization.WORD, stopword_preset="en-no-the", @@ -172,13 +172,13 @@ class TestDeserialization: """Verify the client correctly deserializes response fields into typed objects.""" def test_result_type(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text(text="hello", tokenization=Tokenization.WORD) + result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) assert isinstance(result, TokenizeResult) assert isinstance(result.indexed, list) assert isinstance(result.query, list) def test_analyzer_config_deserialized(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text( + result = client.tokenization.text( text="L'école", tokenization=Tokenization.WORD, ascii_fold=True, @@ -191,7 +191,7 @@ def test_analyzer_config_deserialized(self, client: weaviate.WeaviateClient) -> assert result.analyzer_config.stopword_preset == "en" def 
test_no_analyzer_config_returns_none(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenize.text(text="hello", tokenization=Tokenization.WORD) + result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) assert result.analyzer_config is None def test_stopword_config_deserialized_on_property( @@ -214,7 +214,7 @@ def test_stopword_config_deserialized_on_property( ], } ) - result = client.tokenize.property( + result = client.tokenization.for_property( collection_name="TestDeserStopword", property_name="title", text="the quick", @@ -243,7 +243,7 @@ def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: ], } ) - result = client.tokenize.property( + result = client.tokenization.for_property( collection_name="TestDeserPropTypes", property_name="tag", text=" Hello World ", @@ -302,7 +302,7 @@ class TestAsyncClient: @pytest.mark.asyncio async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: - result = await async_client.tokenize.text( + result = await async_client.tokenization.text( text="The quick brown fox", tokenization=Tokenization.WORD, ) @@ -314,7 +314,7 @@ async def test_text_with_analyzer_config( self, async_client: weaviate.WeaviateAsyncClient ) -> None: cfg = _TextAnalyzerConfigCreate(ascii_fold=True, stopword_preset=StopwordsPreset.EN) - result = await async_client.tokenize.text( + result = await async_client.tokenization.text( text="L'école est fermée", tokenization=Tokenization.WORD, analyzer_config=cfg, @@ -341,7 +341,7 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien ], } ) - result = await async_client.tokenize.property( + result = await async_client.tokenization.for_property( collection_name="TestAsyncPropTokenize", property_name="title", text="The quick brown fox", diff --git a/weaviate/__init__.py b/weaviate/__init__.py index 6fd9368ea..2e7e5e58b 100644 --- a/weaviate/__init__.py +++ b/weaviate/__init__.py @@ -21,7 +21,7 @@ 
embedded, exceptions, outputs, - tokenize, + tokenization, types, ) from .client import Client, WeaviateAsyncClient, WeaviateClient @@ -68,7 +68,7 @@ "embedded", "exceptions", "outputs", - "tokenize", + "tokenization", "types", "use_async_with_custom", "use_async_with_embedded", diff --git a/weaviate/client.py b/weaviate/client.py index cbd12be9a..10ce4c77b 100644 --- a/weaviate/client.py +++ b/weaviate/client.py @@ -22,7 +22,7 @@ from .embedded import EmbeddedOptions from .groups import _Groups, _GroupsAsync from .rbac import _Roles, _RolesAsync -from .tokenize import _Tokenize, _TokenizeAsync +from .tokenization import _Tokenization, _TokenizationAsync from .types import NUMBER from .users import _Users, _UsersAsync @@ -83,7 +83,7 @@ def __init__( self.debug = _DebugAsync(self._connection) self.groups = _GroupsAsync(self._connection) self.roles = _RolesAsync(self._connection) - self.tokenize = _TokenizeAsync(self._connection) + self.tokenization = _TokenizationAsync(self._connection) self.users = _UsersAsync(self._connection) async def __aenter__(self) -> "WeaviateAsyncClient": @@ -159,7 +159,7 @@ def __init__( self.debug = _Debug(self._connection) self.groups = _Groups(self._connection) self.roles = _Roles(self._connection) - self.tokenize = _Tokenize(self._connection) + self.tokenization = _Tokenization(self._connection) self.users = _Users(self._connection) def __enter__(self) -> "WeaviateClient": diff --git a/weaviate/client.pyi b/weaviate/client.pyi index a6a44f8f7..8fafdc3d1 100644 --- a/weaviate/client.pyi +++ b/weaviate/client.pyi @@ -21,7 +21,7 @@ from .cluster import _Cluster, _ClusterAsync from .collections.batch.client import _BatchClientWrapper, _BatchClientWrapperAsync from .debug import _Debug, _DebugAsync from .rbac import _Roles, _RolesAsync -from .tokenize import _Tokenize, _TokenizeAsync +from .tokenization import _Tokenization, _TokenizationAsync from .types import NUMBER TIMEOUT_TYPE = Union[Tuple[NUMBER, NUMBER], NUMBER] @@ -36,7 +36,7 @@ 
class WeaviateAsyncClient(_WeaviateClientExecutor[ConnectionAsync]): debug: _DebugAsync groups: _GroupsAsync roles: _RolesAsync - tokenize: _TokenizeAsync + tokenization: _TokenizationAsync users: _UsersAsync async def close(self) -> None: ... @@ -60,7 +60,7 @@ class WeaviateClient(_WeaviateClientExecutor[ConnectionSync]): debug: _Debug groups: _Groups roles: _Roles - tokenize: _Tokenize + tokenization: _Tokenization users: _Users def close(self) -> None: ... diff --git a/weaviate/tokenization/__init__.py b/weaviate/tokenization/__init__.py new file mode 100644 index 000000000..2437f7745 --- /dev/null +++ b/weaviate/tokenization/__init__.py @@ -0,0 +1,7 @@ +"""Module for tokenization operations.""" + +from .async_ import _TokenizationAsync +from .sync import _Tokenization +from .models import TokenizeResult + +__all__ = ["_Tokenization", "_TokenizationAsync", "TokenizeResult"] diff --git a/weaviate/tokenization/async_.py b/weaviate/tokenization/async_.py new file mode 100644 index 000000000..5406a39dd --- /dev/null +++ b/weaviate/tokenization/async_.py @@ -0,0 +1,8 @@ +from weaviate.connect import executor +from weaviate.connect.v4 import ConnectionAsync +from weaviate.tokenization.executor import _TokenizationExecutor + + +@executor.wrap("async") +class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]): + pass diff --git a/weaviate/tokenize/executor.py b/weaviate/tokenization/executor.py similarity index 97% rename from weaviate/tokenize/executor.py rename to weaviate/tokenization/executor.py index bd2c24dc1..9ddf5f7ed 100644 --- a/weaviate/tokenize/executor.py +++ b/weaviate/tokenization/executor.py @@ -14,7 +14,7 @@ ) from weaviate.connect import executor from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes -from weaviate.tokenize.types import TokenizeResult +from weaviate.tokenization.models import TokenizeResult def _parse_analyzer_config(body: Dict[str, Any]) -> Optional[TextAnalyzerConfig]: @@ -51,7 +51,7 @@ def 
_parse_tokenize_result(body: Dict[str, Any]) -> TokenizeResult: ) -class _TokenizeExecutor(Generic[ConnectionType]): +class _TokenizationExecutor(Generic[ConnectionType]): def __init__(self, connection: ConnectionType): self._connection = connection @@ -133,7 +133,7 @@ def resp(response: Response) -> TokenizeResult: status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize text"), ) - def property( + def for_property( self, collection_name: str, property_name: str, diff --git a/weaviate/tokenize/types.py b/weaviate/tokenization/models.py similarity index 100% rename from weaviate/tokenize/types.py rename to weaviate/tokenization/models.py diff --git a/weaviate/tokenization/sync.py b/weaviate/tokenization/sync.py new file mode 100644 index 000000000..ab28cc98e --- /dev/null +++ b/weaviate/tokenization/sync.py @@ -0,0 +1,8 @@ +from weaviate.connect import executor +from weaviate.connect.v4 import ConnectionSync +from weaviate.tokenization.executor import _TokenizationExecutor + + +@executor.wrap("sync") +class _Tokenization(_TokenizationExecutor[ConnectionSync]): + pass diff --git a/weaviate/tokenize/__init__.py b/weaviate/tokenize/__init__.py deleted file mode 100644 index d0c2883c5..000000000 --- a/weaviate/tokenize/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Module for tokenize operations.""" - -from .async_ import _TokenizeAsync -from .sync import _Tokenize -from .types import TokenizeResult - -__all__ = ["_Tokenize", "_TokenizeAsync", "TokenizeResult"] diff --git a/weaviate/tokenize/async_.py b/weaviate/tokenize/async_.py deleted file mode 100644 index a59c392ea..000000000 --- a/weaviate/tokenize/async_.py +++ /dev/null @@ -1,8 +0,0 @@ -from weaviate.connect import executor -from weaviate.connect.v4 import ConnectionAsync -from weaviate.tokenize.executor import _TokenizeExecutor - - -@executor.wrap("async") -class _TokenizeAsync(_TokenizeExecutor[ConnectionAsync]): - pass diff --git a/weaviate/tokenize/sync.py b/weaviate/tokenize/sync.py deleted file mode 
100644 index 755c42559..000000000 --- a/weaviate/tokenize/sync.py +++ /dev/null @@ -1,8 +0,0 @@ -from weaviate.connect import executor -from weaviate.connect.v4 import ConnectionSync -from weaviate.tokenize.executor import _TokenizeExecutor - - -@executor.wrap("sync") -class _Tokenize(_TokenizeExecutor[ConnectionSync]): - pass From ede0b96477aa44f5de9fb964e8a48a84e126b408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Wed, 15 Apr 2026 10:47:01 +0100 Subject: [PATCH 3/8] fix: add version gate --- integration/test_tokenize.py | 24 ++++++++++++++++++++++++ weaviate/tokenization/executor.py | 17 +++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index b3ecff875..47321aaf5 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -22,6 +22,7 @@ _TextAnalyzerConfigCreate, ) from weaviate.config import AdditionalConfig +from weaviate.exceptions import WeaviateUnsupportedFeatureError from weaviate.tokenization.models import TokenizeResult @@ -292,6 +293,29 @@ def test_empty_config_is_valid(self) -> None: assert cfg.stopwordPreset is None +# --------------------------------------------------------------------------- +# Version gate +# --------------------------------------------------------------------------- + + +class TestVersionGate: + """On Weaviate < 1.37 the client must raise before sending the request.""" + + def test_text_raises_on_old_server(self, client: weaviate.WeaviateClient) -> None: + if client._connection._weaviate_version.is_at_least(1, 37, 0): + pytest.skip("Version gate only applies to Weaviate < 1.37.0") + with pytest.raises(WeaviateUnsupportedFeatureError): + client.tokenization.text(text="hello", tokenization=Tokenization.WORD) + + def test_for_property_raises_on_old_server(self, client: weaviate.WeaviateClient) -> None: + if client._connection._weaviate_version.is_at_least(1, 37, 0): + pytest.skip("Version gate only applies to 
Weaviate < 1.37.0") + with pytest.raises(WeaviateUnsupportedFeatureError): + client.tokenization.for_property( + collection_name="Any", property_name="title", text="hello" + ) + + # --------------------------------------------------------------------------- # Async client # --------------------------------------------------------------------------- diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 9ddf5f7ed..de3f68061 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -14,6 +14,7 @@ ) from weaviate.connect import executor from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes +from weaviate.exceptions import WeaviateUnsupportedFeatureError from weaviate.tokenization.models import TokenizeResult @@ -55,6 +56,14 @@ class _TokenizationExecutor(Generic[ConnectionType]): def __init__(self, connection: ConnectionType): self._connection = connection + def _check_version(self) -> None: + if self._connection._weaviate_version.is_lower_than(1, 37, 0): + raise WeaviateUnsupportedFeatureError( + "Tokenization", + str(self._connection._weaviate_version), + "1.37.0", + ) + def text( self, text: str, @@ -87,7 +96,11 @@ def text( Returns: A TokenizeResult with indexed and query token lists. + + Raises: + WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. """ + self._check_version() tokenization_str = ( tokenization.value if isinstance(tokenization, Tokenization) else tokenization ) @@ -148,7 +161,11 @@ def for_property( Returns: A TokenizeResult with indexed and query token lists. + + Raises: + WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. 
""" + self._check_version() path = f"/schema/{collection_name}/properties/{property_name}/tokenize" payload: Dict[str, Any] = {"text": text} From 8d379f4142222f7483d29042a9ec75f92c2e4cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Wed, 15 Apr 2026 10:50:22 +0100 Subject: [PATCH 4/8] refactor: update tokenization type to use Tokenization enum in TokenizeResult and related tests --- integration/test_tokenize.py | 10 +++++----- weaviate/tokenization/executor.py | 2 +- weaviate/tokenization/models.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 47321aaf5..24b515f2c 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -72,13 +72,13 @@ def test_tokenization_enum( ) -> None: result = client.tokenization.text(text=text, tokenization=tokenization) assert isinstance(result, TokenizeResult) - assert result.tokenization == tokenization.value + assert result.tokenization == tokenization assert result.indexed == expected_tokens assert result.query == expected_tokens def test_tokenization_string(self, client: weaviate.WeaviateClient) -> None: result = client.tokenization.text(text="hello world", tokenization="word") - assert result.tokenization == "word" + assert result.tokenization == Tokenization.WORD assert result.indexed == ["hello", "world"] def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: @@ -221,7 +221,7 @@ def test_stopword_config_deserialized_on_property( text="the quick", ) assert isinstance(result, TokenizeResult) - assert result.tokenization == "word" + assert result.tokenization == Tokenization.WORD # Stopword config should be deserialized when present if result.stopword_config is not None: assert isinstance(result.stopword_config, StopwordsConfig) @@ -250,7 +250,7 @@ def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: text=" Hello World ", ) assert 
isinstance(result, TokenizeResult) - assert result.tokenization == "field" + assert result.tokenization == Tokenization.FIELD assert result.indexed == ["Hello World"] finally: client.collections.delete("TestDeserPropTypes") @@ -371,7 +371,7 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien text="The quick brown fox", ) assert isinstance(result, TokenizeResult) - assert result.tokenization == "word" + assert result.tokenization == Tokenization.WORD assert result.indexed == ["the", "quick", "brown", "fox"] assert "the" not in result.query assert "quick" in result.query diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index de3f68061..6228d7350 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -44,7 +44,7 @@ def _parse_stopword_config(body: Dict[str, Any]) -> Optional[StopwordsConfig]: def _parse_tokenize_result(body: Dict[str, Any]) -> TokenizeResult: return TokenizeResult( - tokenization=body["tokenization"], + tokenization=Tokenization(body["tokenization"]), indexed=body["indexed"], query=body["query"], analyzer_config=_parse_analyzer_config(body), diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index ba4009b2d..ecb01f695 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from typing import List, Optional -from weaviate.collections.classes.config import StopwordsConfig, TextAnalyzerConfig +from weaviate.collections.classes.config import StopwordsConfig, TextAnalyzerConfig, Tokenization @dataclass @@ -18,7 +18,7 @@ class TokenizeResult: stopword_config: The stopword configuration that was used, if any. 
""" - tokenization: str + tokenization: Tokenization indexed: List[str] query: List[str] analyzer_config: Optional[TextAnalyzerConfig] = field(default=None) From 91a359a38a56b7b812997b3e8280be6ae1d7b71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Wed, 15 Apr 2026 11:04:13 +0100 Subject: [PATCH 5/8] refactor: models --- integration/test_tokenize.py | 91 +++++++++++++++---------------- weaviate/outputs/__init__.py | 15 ++++- weaviate/outputs/tokenization.py | 5 ++ weaviate/tokenization/executor.py | 78 ++------------------------ weaviate/tokenization/models.py | 47 +++++++++++++--- 5 files changed, 108 insertions(+), 128 deletions(-) create mode 100644 weaviate/outputs/tokenization.py diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 24b515f2c..ddd67b656 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -1,9 +1,10 @@ -"""Integration tests for the tokenize module. +"""Integration tests for the tokenization module. 
These tests cover the client's responsibilities: -- Correct serialization of inputs (enums, _TextAnalyzerConfigCreate, kwargs) +- Correct serialization of inputs (enums, _TextAnalyzerConfigCreate, _StopwordsCreate) - Correct deserialization of responses into typed objects - Client-side validation (_TextAnalyzerConfigCreate rejects invalid input) +- Version gate (>= 1.37.0) - Both sync and async client paths """ @@ -46,7 +47,7 @@ async def async_client() -> AsyncGenerator[weaviate.WeaviateAsyncClient, None]: # --------------------------------------------------------------------------- -# Serialization: enums, strings, kwargs, _TextAnalyzerConfigCreate +# Serialization # --------------------------------------------------------------------------- @@ -76,72 +77,68 @@ def test_tokenization_enum( assert result.indexed == expected_tokens assert result.query == expected_tokens - def test_tokenization_string(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenization.text(text="hello world", tokenization="word") + def test_no_analyzer_config(self, client: weaviate.WeaviateClient) -> None: + result = client.tokenization.text(text="hello world", tokenization=Tokenization.WORD) assert result.tokenization == Tokenization.WORD assert result.indexed == ["hello", "world"] + assert result.analyzer_config is None - def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: + def test_ascii_fold(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True) result = client.tokenization.text( - text="The quick brown fox", + text="L'école est fermée", tokenization=Tokenization.WORD, - stopword_preset=StopwordsPreset.EN, + analyzer_config=cfg, ) - assert "the" not in result.query - assert "quick" in result.query + assert result.indexed == ["l", "ecole", "est", "fermee"] - def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: + def test_ascii_fold_with_ignore(self, client: 
weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é"]) result = client.tokenization.text( - text="The quick brown fox", + text="L'école est fermée", tokenization=Tokenization.WORD, - stopword_preset="en", + analyzer_config=cfg, ) - assert "the" not in result.query + assert result.indexed == ["l", "école", "est", "fermée"] - def test_ascii_fold_via_kwargs(self, client: weaviate.WeaviateClient) -> None: + def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.EN) result = client.tokenization.text( - text="L'école est fermée", + text="The quick brown fox", tokenization=Tokenization.WORD, - ascii_fold=True, + analyzer_config=cfg, ) - assert result.indexed == ["l", "ecole", "est", "fermee"] + assert "the" not in result.query + assert "quick" in result.query - def test_ascii_fold_via_analyzer_config(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True) + def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(stopword_preset="en") result = client.tokenization.text( - text="L'école est fermée", + text="The quick brown fox", tokenization=Tokenization.WORD, analyzer_config=cfg, ) - assert result.indexed == ["l", "ecole", "est", "fermee"] + assert "the" not in result.query - def test_analyzer_config_and_kwargs_produce_same_result( - self, client: weaviate.WeaviateClient - ) -> None: - """analyzer_config object and equivalent kwargs must produce identical output.""" + def test_ascii_fold_combined_with_stopwords(self, client: weaviate.WeaviateClient) -> None: cfg = _TextAnalyzerConfigCreate( ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN ) - via_config = client.tokenization.text( - text="L'école est fermée", + result = client.tokenization.text( + text="The école est fermée", 
tokenization=Tokenization.WORD, analyzer_config=cfg, ) - via_kwargs = client.tokenization.text( - text="L'école est fermée", - tokenization=Tokenization.WORD, - ascii_fold=True, - ascii_fold_ignore=["é"], - stopword_preset=StopwordsPreset.EN, - ) - assert via_config.indexed == via_kwargs.indexed - assert via_config.query == via_kwargs.query + assert result.indexed == ["the", "école", "est", "fermée"] + assert "the" not in result.query + assert "école" in result.query - def test_stopword_presets_serialization(self, client: weaviate.WeaviateClient) -> None: + def test_stopword_presets_custom_additions(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(stopword_preset="custom") result = client.tokenization.text( text="hello world test", tokenization=Tokenization.WORD, - stopword_preset="custom", + analyzer_config=cfg, stopword_presets={ "custom": _StopwordsCreate(preset=None, additions=["test"], removals=None), }, @@ -150,10 +147,11 @@ def test_stopword_presets_serialization(self, client: weaviate.WeaviateClient) - assert result.query == ["hello", "world"] def test_stopword_presets_with_base_and_removals(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate(stopword_preset="en-no-the") result = client.tokenization.text( text="the quick", tokenization=Tokenization.WORD, - stopword_preset="en-no-the", + analyzer_config=cfg, stopword_presets={ "en-no-the": _StopwordsCreate( preset=StopwordsPreset.EN, additions=None, removals=["the"] @@ -165,7 +163,7 @@ def test_stopword_presets_with_base_and_removals(self, client: weaviate.Weaviate # --------------------------------------------------------------------------- -# Deserialization: typed response fields +# Deserialization # --------------------------------------------------------------------------- @@ -179,12 +177,13 @@ def test_result_type(self, client: weaviate.WeaviateClient) -> None: assert isinstance(result.query, list) def 
test_analyzer_config_deserialized(self, client: weaviate.WeaviateClient) -> None: + cfg = _TextAnalyzerConfigCreate( + ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN + ) result = client.tokenization.text( text="L'école", tokenization=Tokenization.WORD, - ascii_fold=True, - ascii_fold_ignore=["é"], - stopword_preset=StopwordsPreset.EN, + analyzer_config=cfg, ) assert isinstance(result.analyzer_config, TextAnalyzerConfig) assert result.analyzer_config.ascii_fold is True @@ -198,7 +197,6 @@ def test_no_analyzer_config_returns_none(self, client: weaviate.WeaviateClient) def test_stopword_config_deserialized_on_property( self, client: weaviate.WeaviateClient ) -> None: - """Property endpoint returns stopwordConfig; verify it deserializes to StopwordsConfig.""" client.collections.delete("TestDeserStopword") try: client.collections.create_from_dict( @@ -222,7 +220,6 @@ def test_stopword_config_deserialized_on_property( ) assert isinstance(result, TokenizeResult) assert result.tokenization == Tokenization.WORD - # Stopword config should be deserialized when present if result.stopword_config is not None: assert isinstance(result.stopword_config, StopwordsConfig) finally: @@ -322,7 +319,7 @@ def test_for_property_raises_on_old_server(self, client: weaviate.WeaviateClient class TestAsyncClient: - """Verify both text() and property() work through the async client.""" + """Verify both text() and for_property() work through the async client.""" @pytest.mark.asyncio async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: diff --git a/weaviate/outputs/__init__.py b/weaviate/outputs/__init__.py index 62193fc35..ba3cf894f 100644 --- a/weaviate/outputs/__init__.py +++ b/weaviate/outputs/__init__.py @@ -1,4 +1,16 @@ -from . import aggregate, backup, batch, cluster, config, data, query, replication, tenants, users +from . 
import ( + aggregate, + backup, + batch, + cluster, + config, + data, + query, + replication, + tenants, + tokenization, + users, +) __all__ = [ "aggregate", @@ -10,5 +22,6 @@ "query", "replication", "tenants", + "tokenization", "users", ] diff --git a/weaviate/outputs/tokenization.py b/weaviate/outputs/tokenization.py new file mode 100644 index 000000000..0854f8b0d --- /dev/null +++ b/weaviate/outputs/tokenization.py @@ -0,0 +1,5 @@ +from weaviate.tokenization.models import TokenizeResult + +__all__ = [ + "TokenizeResult", +] diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 6228d7350..226aeb6c6 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -1,13 +1,10 @@ """Tokenize executor.""" -from typing import Any, Dict, Generic, List, Optional, Union +from typing import Any, Dict, Generic, Optional from httpx import Response from weaviate.collections.classes.config import ( - StopwordsConfig, - StopwordsPreset, - TextAnalyzerConfig, Tokenization, _StopwordsCreate, _TextAnalyzerConfigCreate, @@ -18,40 +15,6 @@ from weaviate.tokenization.models import TokenizeResult -def _parse_analyzer_config(body: Dict[str, Any]) -> Optional[TextAnalyzerConfig]: - ac = body.get("analyzerConfig") - if ac is None: - return None - if "asciiFold" not in ac and "stopwordPreset" not in ac: - return None - return TextAnalyzerConfig( - ascii_fold=ac.get("asciiFold", False), - ascii_fold_ignore=ac.get("asciiFoldIgnore"), - stopword_preset=ac.get("stopwordPreset"), - ) - - -def _parse_stopword_config(body: Dict[str, Any]) -> Optional[StopwordsConfig]: - sc = body.get("stopwordConfig") - if sc is None: - return None - return StopwordsConfig( - preset=StopwordsPreset(sc["preset"]) if sc.get("preset") else StopwordsPreset.NONE, - additions=sc.get("additions"), - removals=sc.get("removals"), - ) - - -def _parse_tokenize_result(body: Dict[str, Any]) -> TokenizeResult: - return TokenizeResult( - 
tokenization=Tokenization(body["tokenization"]), - indexed=body["indexed"], - query=body["query"], - analyzer_config=_parse_analyzer_config(body), - stopword_config=_parse_stopword_config(body), - ) - - class _TokenizationExecutor(Generic[ConnectionType]): def __init__(self, connection: ConnectionType): self._connection = connection @@ -67,29 +30,17 @@ def _check_version(self) -> None: def text( self, text: str, - tokenization: Union[Tokenization, str], + tokenization: Tokenization, *, analyzer_config: Optional[_TextAnalyzerConfigCreate] = None, - ascii_fold: Optional[bool] = None, - ascii_fold_ignore: Optional[List[str]] = None, - stopword_preset: Optional[Union[StopwordsPreset, str]] = None, stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None, ) -> executor.Result[TokenizeResult]: """Tokenize text using the generic /v1/tokenize endpoint. - Analyzer settings can be provided either via a ``_TextAnalyzerConfigCreate`` - object **or** via the individual keyword arguments (``ascii_fold``, - ``ascii_fold_ignore``, ``stopword_preset``). If ``analyzer_config`` is - given the individual keyword arguments are ignored. - Args: text: The text to tokenize. tokenization: The tokenization method to use (e.g. Tokenization.WORD). - analyzer_config: A ``_TextAnalyzerConfigCreate`` instance that bundles - ascii_fold, ascii_fold_ignore, and stopword_preset settings. - ascii_fold: Whether to fold accented characters to ASCII equivalents. - ascii_fold_ignore: Characters to exclude from ASCII folding. - stopword_preset: Stopword preset name to apply for query-time filtering. + analyzer_config: Text analyzer settings (ASCII folding, stopword preset). stopword_presets: Custom stopword preset definitions, keyed by name. Each value is a ``_StopwordsCreate`` with optional preset, additions, and removals fields. @@ -101,33 +52,16 @@ def text( WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. 
""" self._check_version() - tokenization_str = ( - tokenization.value if isinstance(tokenization, Tokenization) else tokenization - ) payload: Dict[str, Any] = { "text": text, - "tokenization": tokenization_str, + "tokenization": tokenization.value, } if analyzer_config is not None: ac_dict = analyzer_config._to_dict() if ac_dict: payload["analyzerConfig"] = ac_dict - else: - ac: Dict[str, Any] = {} - if ascii_fold is not None: - ac["asciiFold"] = ascii_fold - if ascii_fold_ignore is not None: - ac["asciiFoldIgnore"] = ascii_fold_ignore - if stopword_preset is not None: - ac["stopwordPreset"] = ( - stopword_preset.value - if isinstance(stopword_preset, StopwordsPreset) - else stopword_preset - ) - if ac: - payload["analyzerConfig"] = ac if stopword_presets is not None: payload["stopwordPresets"] = { @@ -135,7 +69,7 @@ def text( } def resp(response: Response) -> TokenizeResult: - return _parse_tokenize_result(response.json()) + return TokenizeResult.model_validate(response.json()) return executor.execute( response_callback=resp, @@ -171,7 +105,7 @@ def for_property( payload: Dict[str, Any] = {"text": text} def resp(response: Response) -> TokenizeResult: - return _parse_tokenize_result(response.json()) + return TokenizeResult.model_validate(response.json()) return executor.execute( response_callback=resp, diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index ecb01f695..8bfa508f8 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -1,13 +1,18 @@ -"""Return types for tokenize operations.""" +"""Return types for tokenization operations.""" -from dataclasses import dataclass, field -from typing import List, Optional +from typing import Any, Dict, List, Optional -from weaviate.collections.classes.config import StopwordsConfig, TextAnalyzerConfig, Tokenization +from pydantic import BaseModel, ConfigDict, Field, field_validator +from weaviate.collections.classes.config import ( + StopwordsConfig, + 
StopwordsPreset, + TextAnalyzerConfig, + Tokenization, +) -@dataclass -class TokenizeResult: + +class TokenizeResult(BaseModel): """Result of a tokenization operation. Attributes: @@ -18,8 +23,34 @@ class TokenizeResult: stopword_config: The stopword configuration that was used, if any. """ + model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) + tokenization: Tokenization indexed: List[str] query: List[str] - analyzer_config: Optional[TextAnalyzerConfig] = field(default=None) - stopword_config: Optional[StopwordsConfig] = field(default=None) + analyzer_config: Optional[TextAnalyzerConfig] = Field(default=None, alias="analyzerConfig") + stopword_config: Optional[StopwordsConfig] = Field(default=None, alias="stopwordConfig") + + @field_validator("analyzer_config", mode="before") + @classmethod + def _parse_analyzer_config(cls, v: Optional[Dict[str, Any]]) -> Optional[TextAnalyzerConfig]: + if v is None: + return None + if "asciiFold" not in v and "stopwordPreset" not in v: + return None + return TextAnalyzerConfig( + ascii_fold=v.get("asciiFold", False), + ascii_fold_ignore=v.get("asciiFoldIgnore"), + stopword_preset=v.get("stopwordPreset"), + ) + + @field_validator("stopword_config", mode="before") + @classmethod + def _parse_stopword_config(cls, v: Optional[Dict[str, Any]]) -> Optional[StopwordsConfig]: + if v is None: + return None + return StopwordsConfig( + preset=StopwordsPreset(v["preset"]), + additions=v.get("additions"), + removals=v.get("removals"), + ) From 61665e712dac3d6e0665b9e5e8e7ae85d8e47144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Wed, 15 Apr 2026 11:08:31 +0100 Subject: [PATCH 6/8] refactor: move tokenize property to class config --- integration/test_tokenize.py | 27 +++++++---------- weaviate/collections/config/executor.py | 40 +++++++++++++++++++++++++ weaviate/tokenization/executor.py | 36 ---------------------- 3 files changed, 50 insertions(+), 53 deletions(-) diff --git 
a/integration/test_tokenize.py b/integration/test_tokenize.py index ddd67b656..565cb197d 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -213,11 +213,8 @@ def test_stopword_config_deserialized_on_property( ], } ) - result = client.tokenization.for_property( - collection_name="TestDeserStopword", - property_name="title", - text="the quick", - ) + col = client.collections.get("TestDeserStopword") + result = col.config.tokenize_property(property_name="title", text="the quick") assert isinstance(result, TokenizeResult) assert result.tokenization == Tokenization.WORD if result.stopword_config is not None: @@ -241,11 +238,8 @@ def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: ], } ) - result = client.tokenization.for_property( - collection_name="TestDeserPropTypes", - property_name="tag", - text=" Hello World ", - ) + col = client.collections.get("TestDeserPropTypes") + result = col.config.tokenize_property(property_name="tag", text=" Hello World ") assert isinstance(result, TokenizeResult) assert result.tokenization == Tokenization.FIELD assert result.indexed == ["Hello World"] @@ -304,13 +298,12 @@ def test_text_raises_on_old_server(self, client: weaviate.WeaviateClient) -> Non with pytest.raises(WeaviateUnsupportedFeatureError): client.tokenization.text(text="hello", tokenization=Tokenization.WORD) - def test_for_property_raises_on_old_server(self, client: weaviate.WeaviateClient) -> None: + def test_tokenize_property_raises_on_old_server(self, client: weaviate.WeaviateClient) -> None: if client._connection._weaviate_version.is_at_least(1, 37, 0): pytest.skip("Version gate only applies to Weaviate < 1.37.0") + col = client.collections.get("Any") with pytest.raises(WeaviateUnsupportedFeatureError): - client.tokenization.for_property( - collection_name="Any", property_name="title", text="hello" - ) + col.config.tokenize_property(property_name="title", text="hello") # 
--------------------------------------------------------------------------- @@ -319,7 +312,7 @@ def test_for_property_raises_on_old_server(self, client: weaviate.WeaviateClient class TestAsyncClient: - """Verify both text() and for_property() work through the async client.""" + """Verify text() and tokenize_property() work through the async client.""" @pytest.mark.asyncio async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: @@ -362,8 +355,8 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien ], } ) - result = await async_client.tokenization.for_property( - collection_name="TestAsyncPropTokenize", + col = async_client.collections.get("TestAsyncPropTokenize") + result = await col.config.tokenize_property( property_name="title", text="The quick brown fox", ) diff --git a/weaviate/collections/config/executor.py b/weaviate/collections/config/executor.py index bd302c3c4..9e9390cda 100644 --- a/weaviate/collections/config/executor.py +++ b/weaviate/collections/config/executor.py @@ -56,6 +56,7 @@ WeaviateInvalidInputError, WeaviateUnsupportedFeatureError, ) +from weaviate.tokenization.models import TokenizeResult from weaviate.util import ( _capitalize_first_letter, _decode_json_response_dict, @@ -666,3 +667,42 @@ def resp(res: Response) -> bool: error_msg="Property may not exist", status_codes=_ExpectedStatusCodes(ok_in=[200], error="property exists"), ) + + def tokenize_property( + self, + property_name: str, + text: str, + ) -> executor.Result[TokenizeResult]: + """Tokenize text using a property's configured tokenization settings. + + Args: + property_name: The property name whose tokenization config to use. + text: The text to tokenize. + + Returns: + A TokenizeResult with indexed and query token lists. + + Raises: + WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. 
+ """ + if self._connection._weaviate_version.is_lower_than(1, 37, 0): + raise WeaviateUnsupportedFeatureError( + "Tokenization", + str(self._connection._weaviate_version), + "1.37.0", + ) + + path = f"/schema/{self._name}/properties/{property_name}/tokenize" + payload: Dict[str, Any] = {"text": text} + + def resp(response: Response) -> TokenizeResult: + return TokenizeResult.model_validate(response.json()) + + return executor.execute( + response_callback=resp, + method=self._connection.post, + path=path, + weaviate_object=payload, + error_msg="Property tokenization failed", + status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize property text"), + ) diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 226aeb6c6..5093c14e9 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -79,39 +79,3 @@ def resp(response: Response) -> TokenizeResult: error_msg="Tokenization failed", status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize text"), ) - - def for_property( - self, - collection_name: str, - property_name: str, - text: str, - ) -> executor.Result[TokenizeResult]: - """Tokenize text using a property's configured tokenization settings. - - Args: - collection_name: The collection (class) name. - property_name: The property name whose tokenization config to use. - text: The text to tokenize. - - Returns: - A TokenizeResult with indexed and query token lists. - - Raises: - WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. 
@pytest.fixture
def require_1_37(client: weaviate.WeaviateClient) -> None:
    """Skip the requesting test when the connected server is older than 1.37.0.

    Opt-in fixture (not autouse): test classes request it explicitly via
    ``@pytest.mark.usefixtures("require_1_37")``.
    """
    server_version = client._connection._weaviate_version
    if server_version.is_lower_than(1, 37, 0):
        pytest.skip("Tokenization requires Weaviate >= 1.37.0")
= weaviate.use_async_with_local( @@ -51,6 +57,7 @@ async def async_client() -> AsyncGenerator[weaviate.WeaviateAsyncClient, None]: # --------------------------------------------------------------------------- +@pytest.mark.usefixtures("require_1_37") class TestSerialization: """Verify the client correctly serializes different input forms.""" @@ -167,6 +174,7 @@ def test_stopword_presets_with_base_and_removals(self, client: weaviate.Weaviate # --------------------------------------------------------------------------- +@pytest.mark.usefixtures("require_1_37") class TestDeserialization: """Verify the client correctly deserializes response fields into typed objects.""" @@ -311,6 +319,7 @@ def test_tokenize_property_raises_on_old_server(self, client: weaviate.WeaviateC # --------------------------------------------------------------------------- +@pytest.mark.usefixtures("require_1_37") class TestAsyncClient: """Verify text() and tokenize_property() work through the async client."""