Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
137 commits
Select commit Hold shift + click to select a range
4925446
Refactor hybrid queries to use `alpha_param` and remove `0.7` default…
tsmith023 Mar 13, 2026
1899ffd
Remove mistakenly committed local change to `regen.sh`
tsmith023 Mar 16, 2026
c057270
Update logic to use new proto message
tsmith023 Mar 16, 2026
8e5d54d
Change formatting
tsmith023 Mar 16, 2026
367ce79
Tidy version check code
tsmith023 Mar 16, 2026
81414a1
Parse correct default for BC if server < 1.36
tsmith023 Mar 16, 2026
a13da23
Update CI image
tsmith023 Mar 16, 2026
7abe7f5
Fix wrong version comparison
tsmith023 Mar 16, 2026
b5f5e55
Fix typo in ci
tsmith023 Mar 16, 2026
47841d4
Update CI image
tsmith023 Mar 16, 2026
e305b33
Remove client-side default from aggregate queries
tsmith023 Mar 16, 2026
c4d6e87
Merge pull request #1985 from weaviate/tsmith023/remove-default-hybri…
tsmith023 Mar 17, 2026
007c38a
add selection params
robbespo00 Mar 17, 2026
8520a1c
Update ver check and CI tags
tsmith023 Mar 20, 2026
ed37375
Remove test of lazy loading shards
tsmith023 Mar 20, 2026
454e0e4
Merge branch 'main' into dev/1.37
tsmith023 Mar 20, 2026
1433985
Refactor client test for new server lazy shard loading
tsmith023 Mar 20, 2026
d5d5117
Merge branch 'dev/1.37' of https://github.com/weaviate/weaviate-pytho…
tsmith023 Mar 20, 2026
dd6c835
Debug failing ci test
tsmith023 Mar 20, 2026
cf96ccb
Remove outdated lazy shard load test
tsmith023 Mar 20, 2026
4b669e1
Update CI images
tsmith023 Mar 20, 2026
890a414
Add per-test timeouts and stack dump on timeout
tsmith023 Mar 20, 2026
2893822
Reduce per-test timeout to 5 mins
tsmith023 Mar 20, 2026
6edd0e1
Fix inc backups test
tsmith023 Mar 20, 2026
e7cb697
Add server version check for incremental backups
tsmith023 Mar 20, 2026
12d064f
Remove comment
tsmith023 Mar 20, 2026
fe5c522
Hard kill the process on timeout detection
tsmith023 Mar 20, 2026
2d961d1
Timeout putting the sentinel to avoid deadlocking
tsmith023 Mar 20, 2026
fadcebb
Handle the timeout of sentinel pushing gracefully
tsmith023 Mar 20, 2026
659bf28
Merge remote-tracking branch 'origin/main' into rob/diversity
robbespo00 Mar 26, 2026
78ff5ec
feat: add TextAnalyzerConfig for ASCII folding in text properties
amourao Apr 9, 2026
6931a6f
refactor: ruff format
amourao Apr 9, 2026
bda3008
feat: add min version check
amourao Apr 9, 2026
9bea05a
Merge branch 'main' into dev/1.37
dirkkul Apr 9, 2026
77fc0ff
feat: update TextAnalyzerConfig docstring for ascii_fold attributes
amourao Apr 9, 2026
a8d6927
feat: add asciiFold check in _text_analyzer_from_config function
amourao Apr 9, 2026
e8919a3
test: fix ASCII folding tests
amourao Apr 9, 2026
3cc6306
feat: add support for stopword presets in inverted index configuratio…
amourao Apr 9, 2026
ef04dea
test: added live and config tests
amourao Apr 9, 2026
8f1b33b
refactor: improve docstrings for stopword presets and asciiFold tests
amourao Apr 9, 2026
03d6ff4
refactor: simplify _any_property_has_text_analyzer function using _pr…
amourao Apr 13, 2026
1342204
test: remove redundant insertion ascii fold tests from test_collectio…
amourao Apr 13, 2026
cb53d6a
test: add stopwords roundtrip test for collection configuration
amourao Apr 13, 2026
9de03f3
feat: add model validator to enforce asciiFoldIgnore constraints in T…
amourao Apr 13, 2026
7018927
feat: add factory class for text analyzer configurations with ASCII f…
amourao Apr 13, 2026
8e91984
refactor: update TextAnalyzerConfig usage to new Configure class methods
amourao Apr 13, 2026
30814fc
Merge branch 'feat/ascii-fold' into feat/stopword-presets
amourao Apr 13, 2026
db3009c
test: remove redundant line in stopword presets merge test
amourao Apr 13, 2026
50f7768
refactor: use factory pattern
amourao Apr 13, 2026
6a1b0bc
Add MCP permission
g-despot Apr 14, 2026
a0efe43
refactor: format text analyzer configuration for better readability
amourao Apr 14, 2026
fa92fc2
refactor: remove server side behavior tests
amourao Apr 14, 2026
27cd0a4
test: add stopword presets roundtrip tests for Weaviate collections
amourao Apr 14, 2026
a241d8c
Fix formatting
g-despot Apr 14, 2026
83c2431
refactor: remove unnecessary stopword preset coercion from _TextAnaly…
amourao Apr 14, 2026
4e0a0f2
refactor: replace custom text analyzer method with a direct function …
amourao Apr 14, 2026
eaea155
Merge branch 'dev/1.37' into feat/ascii-fold
amourao Apr 14, 2026
38c7f44
chore: remove unused deprecated import from config.py
amourao Apr 14, 2026
ec43d53
Merge branch 'feat/stopword-presets' into feat/ascii-fold
amourao Apr 14, 2026
b3eb0ac
chore: update WEAVIATE_137 version to 1.37.0-rc.1-578c4eb in workflow
amourao Apr 14, 2026
ceef271
refactor: update text analyzer method to use new static method in Con…
amourao Apr 14, 2026
5e751bf
test: add stopwords roundtrip test with ASCII folding configuration
amourao Apr 14, 2026
31737e9
Merge pull request #2006 from weaviate/feat/ascii-fold
dirkkul Apr 14, 2026
9c4295b
Add query profiling
g-despot Apr 14, 2026
6fd60b5
Reformatted
g-despot Apr 14, 2026
a1df098
Skip test for lower versions
g-despot Apr 14, 2026
239ed32
feat: add tokenizer module with sync and async support, including int…
amourao Apr 14, 2026
480dbe0
Add support for collection export endpoint
dirkkul Mar 9, 2026
92c3d1f
Small cleanup after review
dirkkul Mar 9, 2026
3dc9259
Rename ENum
dirkkul Mar 10, 2026
c36e540
adapt to latest version
dirkkul Mar 31, 2026
54eea32
Update UX
dirkkul Mar 31, 2026
42bfc5c
Remove export path parameter
dirkkul Apr 7, 2026
2c74967
Self-review of changes
dirkkul Apr 13, 2026
ed9f288
Review fixes
dirkkul Apr 13, 2026
e9192f8
Add version guard for export integration tests
dirkkul Apr 13, 2026
338195a
Update to latest image
dirkkul Apr 14, 2026
90840ce
Lowercase export ID
dirkkul Apr 14, 2026
584f8a6
Enforce kwargs for export
dirkkul Apr 15, 2026
96ca193
Fix tests
dirkkul Apr 15, 2026
594e8ee
Fix tests
g-despot Apr 15, 2026
b83a948
Add negative assertions
g-despot Apr 15, 2026
9e8c7b1
Merge branch 'dev/1.37' into query-profiling
g-despot Apr 15, 2026
b9a7c69
Merge pull request #1981 from weaviate/export_collection
dirkkul Apr 15, 2026
8b2caaf
refactor: names don't shadow existing
amourao Apr 15, 2026
ede0b96
fix: add version gate
amourao Apr 15, 2026
8d379f4
refactor: update tokenization type to use Tokenization enum in Tokeni…
amourao Apr 15, 2026
91a359a
refactor: models
amourao Apr 15, 2026
61665e7
refactor: move tokenize property to class config
amourao Apr 15, 2026
3f78571
Merge branch 'dev/1.37' into feat/tokenizer-endpoint
amourao Apr 15, 2026
aea0327
fix: remove trailing whitespace in __init__.py
amourao Apr 15, 2026
ef55ce2
test: add version gate for Weaviate >= 1.37.0 in tokenization tests
amourao Apr 15, 2026
dff05f5
feat: add support for blobHash property type
antas-marcin Mar 14, 2026
1f256b5
Merge pull request #1986 from weaviate/add-support-for-blob-hash-prop…
dirkkul Apr 16, 2026
1b4eea1
Merge pull request #2012 from weaviate/feat/tokenizer-endpoint
dirkkul Apr 16, 2026
906b35b
Add full_with_profile
g-despot Apr 16, 2026
7e5b1be
Merge pull request #2011 from weaviate/query-profiling
g-despot Apr 17, 2026
66a2fb2
Refactor RBAC permissions
g-despot Apr 17, 2026
c116257
Merge branch 'dev/1.37' into mcp-rbac
g-despot Apr 17, 2026
0955364
Bump Weaviate version
g-despot Apr 20, 2026
ca1cb88
Merge pull request #2010 from weaviate/mcp-rbac
tsmith023 Apr 20, 2026
3624e8b
refactor: tokenization executor and models to support stopword config…
amourao Apr 20, 2026
5a12f13
fix: update Weaviate 1.37.1 version to include specific build identifier
amourao Apr 20, 2026
633af0f
Merge branch 'main' of https://github.com/weaviate/weaviate-python-cl…
tsmith023 Apr 20, 2026
7b0042a
Merge branch 'dev/1.37' of https://github.com/weaviate/weaviate-pytho…
tsmith023 Apr 20, 2026
d760577
Reduce timeouts in batch tests
tsmith023 Apr 20, 2026
60887f3
fix: update Weaviate 1.37.1 version to include architecture suffix
amourao Apr 20, 2026
9fd83b8
fix: refactor tokenization tests to use parameterized cases for impro…
amourao Apr 21, 2026
e9d6812
fix: update Weaviate 1.37.1 version and enhance tokenization tests wi…
amourao Apr 21, 2026
202948a
Merge branch 'dev/1.37' into fix/tokenize_simple_output
amourao Apr 21, 2026
959f554
refactor: ruff format
amourao Apr 21, 2026
0f7fe47
test: refactor output types and tests to config
amourao Apr 21, 2026
52c2c8c
refactor: remove unused imports in tokenization models and format
amourao Apr 21, 2026
3de0955
Use public classes for .text endpoint
dirkkul Apr 21, 2026
55b136a
Add overloads for exclusivity of stopwords
dirkkul Apr 21, 2026
7924e45
Accept collection config classes as stopwords
dirkkul Apr 21, 2026
64bed62
Improve docstring
dirkkul Apr 21, 2026
220e839
Hook up tokenization and clean up model
dirkkul Apr 21, 2026
081aaef
Move property back to tokenization
dirkkul Apr 22, 2026
cae3e33
Merge pull request #2019 from weaviate/fix/tokenize_simple_output
dirkkul Apr 22, 2026
5bc5470
Add integration tests
g-despot Apr 23, 2026
4b44a5d
Merge branch 'dev/1.37' into rob/diversity
g-despot Apr 23, 2026
7ab97c4
Fix stubs and proto version
g-despot Apr 23, 2026
b68160d
Add more tests
g-despot Apr 23, 2026
6017fde
Rename _DiversityMMR to MMR
g-despot Apr 23, 2026
d3651a3
Fix test versions
g-despot Apr 23, 2026
ee1e781
Fix linter issue
g-despot Apr 23, 2026
5a32738
Implement feedback
g-despot Apr 24, 2026
edbcce8
Merge branch 'main' into rob/diversity
g-despot Apr 24, 2026
7f4c031
Rename to DiversitySelection
g-despot Apr 24, 2026
2067620
Fix flake8 error
g-despot Apr 24, 2026
93141ab
Add diversity to hybrid
g-despot Apr 24, 2026
19cf554
Add hybrid tests
g-despot Apr 24, 2026
e88d17e
Remove hybrid support
g-despot Apr 24, 2026
218ee16
Implement feedback
g-despot Apr 24, 2026
33ef2e1
Fix ruff format
g-despot Apr 24, 2026
8f21c33
Add MMR to output module
g-despot Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions integration/test_collection_diversity.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the comment above/below, do we need all 10 test cases for a 2-parameter config?

Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pytest

from integration.conftest import CollectionFactory
from weaviate.classes.query import Diversity
from weaviate.collections.classes.config import Configure, DataType, Property
from weaviate.collections.classes.data import DataObject


def _create_clustered_collection(collection_factory: CollectionFactory):
    """Build a vectorizer-less collection seeded with six labelled 3D vectors.

    The objects sit close to the three coordinate axes: ``a1``-``a3`` near x,
    ``b1``-``b2`` near y and ``c1`` on z. The calling test is skipped when the
    connected server is older than 1.37.0.

    Args:
        collection_factory: Fixture used to create the collection.

    Returns:
        The populated collection.
    """
    collection = collection_factory(
        properties=[Property(name="text", data_type=DataType.TEXT)],
        vectorizer_config=Configure.Vectorizer.none(),
    )
    server_version = collection._connection._weaviate_version
    if server_version.is_lower_than(1, 37, 0):
        pytest.skip("Diversity selection requires Weaviate >= 1.37.0")

    # (label, vector) pairs; the label prefix names the cluster the vector belongs to.
    labelled_vectors = [
        ("a1", [1.0, 0.0, 0.0]),
        ("a2", [0.95, 0.05, 0.0]),
        ("a3", [0.9, 0.1, 0.0]),
        ("b1", [0.0, 1.0, 0.0]),
        ("b2", [0.05, 0.95, 0.0]),
        ("c1", [0.0, 0.0, 1.0]),
    ]
    collection.data.insert_many(
        [
            DataObject(properties={"text": label}, vector=vector)
            for label, vector in labelled_vectors
        ]
    )
    return collection


def test_near_vector_diversity_selection(collection_factory: CollectionFactory) -> None:
    """Check that ``diversity_selection`` is forwarded to the server by ``near_vector``.

    The MMR algorithm itself is server-side and not under test. Two independent
    observations show that the parameters arrive:
    - ``limit``: each response contains exactly ``mmr_limit`` objects
    - ``balance``: balance=0.0 and balance=1.0 yield different UUID orderings
    """
    collection = _create_clustered_collection(collection_factory)
    mmr_limit = 3

    # Run the same query once per balance value (0.0 first, then 1.0) and keep the UUID order.
    uuids_by_balance = {}
    for balance in (0.0, 1.0):
        response = collection.query.near_vector(
            near_vector=[1.0, 0.0, 0.0],
            diversity_selection=Diversity.mmr(limit=mmr_limit, balance=balance),
        )
        uuids_by_balance[balance] = [obj.uuid for obj in response.objects]

    # mmr_limit reaches the server → result count equals it
    for uuids in uuids_by_balance.values():
        assert len(uuids) == mmr_limit
    # balance reaches the server → different ordering
    assert uuids_by_balance[0.0] != uuids_by_balance[1.0]


def test_near_text_diversity_selection(collection_factory: CollectionFactory) -> None:
    """Smoke test that ``query.near_text`` accepts the ``diversity_selection`` kwarg.

    Six single-word objects are inserted into a text2vec-contextionary collection;
    the only assertion is that an MMR limit of 3 yields exactly three objects.
    Skipped when the server is older than 1.37.0.
    """
    vectorizer = Configure.Vectorizer.text2vec_contextionary(vectorize_collection_name=False)
    collection = collection_factory(
        properties=[Property(name="name", data_type=DataType.TEXT)],
        vectorizer_config=vectorizer,
    )
    server_version = collection._connection._weaviate_version
    if server_version.is_lower_than(1, 37, 0):
        pytest.skip("Diversity selection requires Weaviate >= 1.37.0")

    words = ("banana", "apple", "orange", "car", "truck", "bike")
    for word in words:
        collection.data.insert({"name": word})

    selection = Diversity.mmr(limit=3, balance=0.5)
    response = collection.query.near_text(query="fruit", diversity_selection=selection)
    assert len(response.objects) == 3


def test_near_object_diversity_selection(collection_factory: CollectionFactory) -> None:
    """Smoke test that ``query.near_object`` accepts the ``diversity_selection`` kwarg.

    The first object returned by ``fetch_objects`` serves as the anchor; the only
    assertion is that an MMR limit of 3 yields exactly three objects.
    """
    collection = _create_clustered_collection(collection_factory)
    stored_objects = collection.query.fetch_objects().objects
    anchor = next(iter(stored_objects)).uuid

    selection = Diversity.mmr(limit=3, balance=0.5)
    response = collection.query.near_object(near_object=anchor, diversity_selection=selection)
    assert len(response.objects) == 3


def test_generate_diversity_selection(collection_factory: CollectionFactory) -> None:
    """Smoke test that ``generate.near_text`` accepts the ``diversity_selection`` kwarg.

    Uses a text2vec-contextionary collection configured with the custom
    ``generative-dummy`` generative module; the only assertion is that an MMR limit
    of 3 yields exactly three objects. Skipped when the server is older than 1.37.0.
    """
    vectorizer = Configure.Vectorizer.text2vec_contextionary(vectorize_collection_name=False)
    collection = collection_factory(
        properties=[Property(name="name", data_type=DataType.TEXT)],
        vectorizer_config=vectorizer,
        generative_config=Configure.Generative.custom("generative-dummy"),
    )
    server_version = collection._connection._weaviate_version
    if server_version.is_lower_than(1, 37, 0):
        pytest.skip("Diversity selection requires Weaviate >= 1.37.0")

    words = ("banana", "apple", "orange", "car", "truck", "bike")
    for word in words:
        collection.data.insert({"name": word})

    selection = Diversity.mmr(limit=3, balance=0.5)
    response = collection.generate.near_text(
        query="fruit",
        single_prompt="Describe {name}",
        diversity_selection=selection,
    )
    assert len(response.objects) == 3


def test_diversity_selection_api_surface() -> None:
    """Test the public API surface of Diversity: factory guard + mmr factory method.

    Covers three things:
    - ``Diversity()`` cannot be instantiated and raises ``TypeError``
    - ``Diversity.mmr(limit=..., balance=...)`` carries both values onto the result
    - ``Diversity.mmr()`` without arguments leaves both values as ``None``
    """
    # Direct instantiation of the factory class fails
    with pytest.raises(TypeError):
        Diversity()

    # Both arguments must land on the returned object; checking only ``limit`` would
    # let a dropped or swapped ``balance`` go unnoticed.
    selection = Diversity.mmr(limit=3, balance=0.5)
    assert selection.limit == 3
    assert selection.balance == 0.5

    # Omitted arguments stay unset, so no value is invented on the client side.
    defaults = Diversity.mmr()
    assert defaults.limit is None
    assert defaults.balance is None
2 changes: 2 additions & 0 deletions weaviate/classes/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
BM25OperatorFactory as BM25Operator,
)
from weaviate.collections.classes.grpc import (
Diversity,
GroupBy,
HybridFusion,
HybridVector,
Expand All @@ -21,6 +22,7 @@
from weaviate.collections.classes.types import GeoCoordinate

__all__ = [
"Diversity",
"Filter",
"FilterReturn",
"GeoCoordinate",
Expand Down
36 changes: 35 additions & 1 deletion weaviate/collections/classes/grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,36 @@ class Rerank(_WeaviateInput):
query: Optional[str] = Field(default=None)


@dataclass
class MMR:
    """Parameters for MMR (Maximal Marginal Relevance) diversity selection.

    Both fields are optional and default to ``None``. This class is a plain value
    holder and performs no validation of the values it is given.

    Args:
        limit: Optional number of candidates to consider for diversification.
        balance: Optional MMR lambda in [0.0, 1.0] — 1.0 is pure relevance, 0.0 is pure diversity.
    """

    limit: Optional[int] = None
    balance: Optional[float] = None


class Diversity:
    """Factory namespace for diversity selection settings applied to search results.

    The class is not meant to be instantiated; call ``Diversity.mmr(...)`` instead.
    """

    def __init__(self) -> None:
        # Guard: this class only groups factory methods, so construction is rejected.
        raise TypeError("Diversity cannot be instantiated directly. Use Diversity.mmr(...).")

    @staticmethod
    def mmr(limit: Optional[int] = None, balance: Optional[float] = None) -> MMR:
        """Create a Maximal Marginal Relevance diversity selection.

        Args:
            limit: Number of candidates to consider for diversification.
            balance: MMR lambda in [0.0, 1.0] — 1.0 pure relevance, 0.0 pure diversity.

        Returns:
            An `MMR` object holding the given `limit` and `balance`.
        """
        selection = MMR(limit=limit, balance=balance)
        return selection


@dataclass
class BM25OperatorOptions:
# replace with ClassVar[base_search_pb2.SearchOperatorOptions.Operator] once python 3.10 is removed
Expand Down Expand Up @@ -559,7 +589,11 @@ def near_vector(
Returns:
A `_HybridNearVector` object to be used in the `vector` parameter of the `query.hybrid` and `generate.hybrid` search methods.
"""
return _HybridNearVector(vector=vector, distance=distance, certainty=certainty)
return _HybridNearVector(
vector=vector,
distance=distance,
certainty=certainty,
)


class _QueryReference(_WeaviateInput):
Expand Down
21 changes: 19 additions & 2 deletions weaviate/collections/grpc/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from weaviate.collections.classes.config import ConsistencyLevel
from weaviate.collections.classes.filters import FilterReturn
from weaviate.collections.classes.grpc import (
MMR,
PROPERTIES,
PROPERTY,
REFERENCE,
Expand Down Expand Up @@ -262,6 +263,7 @@ def near_vector(
return_metadata: Optional[_MetadataQuery] = None,
return_properties: Union[PROPERTIES, bool, None] = None,
return_references: Optional[REFERENCES] = None,
diversity_selection: Optional[MMR] = None,
) -> search_get_pb2.SearchRequest:
return self.__create_request(
limit=limit,
Expand All @@ -275,7 +277,11 @@ def near_vector(
autocut=autocut,
group_by=group_by,
near_vector=self._parse_near_vector(
near_vector, certainty, distance, target_vector=target_vector
near_vector,
certainty,
distance,
target_vector=target_vector,
diversity_selection=diversity_selection,
),
)

Expand All @@ -296,6 +302,7 @@ def near_object(
return_metadata: Optional[_MetadataQuery] = None,
return_properties: Union[PROPERTIES, bool, None] = None,
return_references: Optional[REFERENCES] = None,
diversity_selection: Optional[MMR] = None,
) -> search_get_pb2.SearchRequest:
return self.__create_request(
limit=limit,
Expand All @@ -308,7 +315,13 @@ def near_object(
rerank=rerank,
autocut=autocut,
group_by=group_by,
near_object=self._parse_near_object(near_object, certainty, distance, target_vector),
near_object=self._parse_near_object(
near_object,
certainty,
distance,
target_vector,
diversity_selection=diversity_selection,
),
)

def near_text(
Expand All @@ -330,6 +343,7 @@ def near_text(
return_metadata: Optional[_MetadataQuery] = None,
return_properties: Union[PROPERTIES, bool, None] = None,
return_references: Optional[REFERENCES] = None,
diversity_selection: Optional[MMR] = None,
) -> search_get_pb2.SearchRequest:
return self.__create_request(
limit=limit,
Expand All @@ -349,6 +363,7 @@ def near_text(
move_away=move_away,
move_to=move_to,
target_vector=target_vector,
diversity_selection=diversity_selection,
),
)

Expand All @@ -370,6 +385,7 @@ def near_media(
return_metadata: Optional[_MetadataQuery] = None,
return_properties: Union[PROPERTIES, bool, None] = None,
return_references: Optional[REFERENCES] = None,
diversity_selection: Optional[MMR] = None,
) -> search_get_pb2.SearchRequest:
return self.__create_request(
limit=limit,
Expand All @@ -388,6 +404,7 @@ def near_media(
certainty,
distance,
target_vector,
diversity_selection=diversity_selection,
),
)

Expand Down
Loading
Loading