Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 8 additions & 17 deletions tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"- [`MarkdownToDocument`](https://docs.haystack.deepset.ai/docs/markdowntodocument): This component will help you convert markdown files into Haystack Documents\n",
"- [`PyPDFToDocument`](https://docs.haystack.deepset.ai/docs/pypdftodocument): This component will help you convert pdf files into Haystack Documents\n",
"- [`TextFileToDocument`](https://docs.haystack.deepset.ai/docs/textfiletodocument): This component will help you convert text files into Haystack Documents\n",
"- [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner): This component will help you to join Documents coming from different branches of a pipeline\n",
"- [`DocumentCleaner`](https://docs.haystack.deepset.ai/docs/documentcleaner) (optional): This component will help you to make Documents more readable by removing extra whitespaces etc.\n",
"- [`DocumentSplitter`](https://docs.haystack.deepset.ai/docs/documentsplitter): This component will help you to split your Document into chunks\n",
"- [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder): This component will help you create embeddings for Documents.\n",
Expand Down Expand Up @@ -114,9 +113,7 @@
"\n",
"Next, you'll create a pipeline to index documents. To keep things uncomplicated, you'll use an `InMemoryDocumentStore` but this approach would also work with any other flavor of `DocumentStore`.\n",
"\n",
"You'll need a different file converter class for each file type in our data sources: `.pdf`, `.txt`, and `.md` in this case. Our `FileTypeRouter` connects each file type to the proper converter.\n",
"\n",
"Once all our files have been converted to Haystack Documents, we can use the `DocumentJoiner` component to make these a single list of documents that can be fed through the rest of the indexing pipeline all together."
"You'll need a different file converter class for each file type in our data sources: `.pdf`, `.txt`, and `.md` in this case. Our `FileTypeRouter` connects each file type to the proper converter."
]
},
{
Expand All @@ -131,7 +128,6 @@
"from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument\n",
"from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner\n",
"from haystack.components.routers import FileTypeRouter\n",
"from haystack.components.joiners import DocumentJoiner\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack import Pipeline\n",
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
Expand All @@ -140,8 +136,7 @@
"file_type_router = FileTypeRouter(mime_types=[\"text/plain\", \"application/pdf\", \"text/markdown\"])\n",
"text_file_converter = TextFileToDocument()\n",
"markdown_converter = MarkdownToDocument()\n",
"pdf_converter = PyPDFToDocument()\n",
"document_joiner = DocumentJoiner()"
"pdf_converter = PyPDFToDocument()"
]
},
{
Expand Down Expand Up @@ -208,7 +203,6 @@
"preprocessing_pipeline.add_component(instance=text_file_converter, name=\"text_file_converter\")\n",
"preprocessing_pipeline.add_component(instance=markdown_converter, name=\"markdown_converter\")\n",
"preprocessing_pipeline.add_component(instance=pdf_converter, name=\"pypdf_converter\")\n",
"preprocessing_pipeline.add_component(instance=document_joiner, name=\"document_joiner\")\n",
"preprocessing_pipeline.add_component(instance=document_cleaner, name=\"document_cleaner\")\n",
"preprocessing_pipeline.add_component(instance=document_splitter, name=\"document_splitter\")\n",
"preprocessing_pipeline.add_component(instance=document_embedder, name=\"document_embedder\")\n",
Expand Down Expand Up @@ -245,7 +239,6 @@
" - text_file_converter: TextFileToDocument\n",
" - markdown_converter: MarkdownToDocument\n",
" - pypdf_converter: PyPDFToDocument\n",
" - document_joiner: DocumentJoiner\n",
" - document_cleaner: DocumentCleaner\n",
" - document_splitter: DocumentSplitter\n",
" - document_embedder: SentenceTransformersDocumentEmbedder\n",
Expand All @@ -254,10 +247,9 @@
" - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, ByteStream]])\n",
" - file_type_router.application/pdf -> pypdf_converter.sources (List[Union[str, Path, ByteStream]])\n",
" - file_type_router.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])\n",
" - text_file_converter.documents -> document_joiner.documents (List[Document])\n",
" - markdown_converter.documents -> document_joiner.documents (List[Document])\n",
" - pypdf_converter.documents -> document_joiner.documents (List[Document])\n",
" - document_joiner.documents -> document_cleaner.documents (List[Document])\n",
" - text_file_converter.documents -> document_cleaner.documents (List[Document])\n",
" - pypdf_converter.documents -> document_cleaner.documents (List[Document])\n",
" - markdown_converter.documents -> document_cleaner.documents (List[Document])\n",
" - document_cleaner.documents -> document_splitter.documents (List[Document])\n",
" - document_splitter.documents -> document_embedder.documents (List[Document])\n",
" - document_embedder.documents -> document_writer.documents (List[Document])"
Expand All @@ -272,10 +264,9 @@
"preprocessing_pipeline.connect(\"file_type_router.text/plain\", \"text_file_converter.sources\")\n",
"preprocessing_pipeline.connect(\"file_type_router.application/pdf\", \"pypdf_converter.sources\")\n",
"preprocessing_pipeline.connect(\"file_type_router.text/markdown\", \"markdown_converter.sources\")\n",
"preprocessing_pipeline.connect(\"text_file_converter\", \"document_joiner\")\n",
"preprocessing_pipeline.connect(\"pypdf_converter\", \"document_joiner\")\n",
"preprocessing_pipeline.connect(\"markdown_converter\", \"document_joiner\")\n",
"preprocessing_pipeline.connect(\"document_joiner\", \"document_cleaner\")\n",
"preprocessing_pipeline.connect(\"text_file_converter\", \"document_cleaner\")\n",
"preprocessing_pipeline.connect(\"pypdf_converter\", \"document_cleaner\")\n",
"preprocessing_pipeline.connect(\"markdown_converter\", \"document_cleaner\")\n",
"preprocessing_pipeline.connect(\"document_cleaner\", \"document_splitter\")\n",
"preprocessing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n",
"preprocessing_pipeline.connect(\"document_embedder\", \"document_writer\")"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"\n",
"- **Level**: Beginner\n",
"- **Time to complete**: 15 minutes\n",
"- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`DocumentLanguageClassifier`](https://docs.haystack.deepset.ai/docs/documentlanguageclassifier), [`MetadataRouter`](https://docs.haystack.deepset.ai/docs/metadatarouter), [`DocumentWriter`](https://docs.haystack.deepset.ai/docs/documentwriter), [`TextLanguageRouter`](https://docs.haystack.deepset.ai/docs/textlanguagerouter), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`ChatPromptBuilder`](https://docs.haystack.deepset.ai/docs/chatpromptbuilder), [`OpenAIChatGenerator`](https://docs.haystack.deepset.ai/docs/openaichatgenerator)\n",
"- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`DocumentLanguageClassifier`](https://docs.haystack.deepset.ai/docs/documentlanguageclassifier), [`MetadataRouter`](https://docs.haystack.deepset.ai/docs/metadatarouter), [`DocumentWriter`](https://docs.haystack.deepset.ai/docs/documentwriter), [`TextLanguageRouter`](https://docs.haystack.deepset.ai/docs/textlanguagerouter), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`ChatPromptBuilder`](https://docs.haystack.deepset.ai/docs/chatpromptbuilder), [`OpenAIChatGenerator`](https://docs.haystack.deepset.ai/docs/openaichatgenerator)\n",
"- **Goal**: After completing this tutorial, you'll have learned how to build a Haystack pipeline to classify documents based on the (human) language they were written in.\n",
"- Optionally, at the end you'll also incorporate language classification and query routing into a RAG pipeline, so you can query documents based on the language a question was written in."
]
Expand Down Expand Up @@ -385,7 +385,6 @@
"outputs": [],
"source": [
"from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n",
"from haystack.components.joiners import DocumentJoiner\n",
"from haystack.components.builders import ChatPromptBuilder\n",
"from haystack.components.generators.chat import OpenAIChatGenerator\n",
"from haystack.dataclasses import ChatMessage\n",
Expand Down Expand Up @@ -418,7 +417,6 @@
"Create a new `Pipeline`. Add the following components:\n",
"- `TextLanguageRouter`\n",
"- `InMemoryBM25Retriever`. You'll need a retriever per language, since each language has its own `DocumentStore`.\n",
"- `DocumentJoiner`\n",
"- `ChatPromptBuilder`\n",
"- `OpenAIChatGenerator`\n",
"\n",
Expand All @@ -441,17 +439,15 @@
" - en_retriever: InMemoryBM25Retriever\n",
" - fr_retriever: InMemoryBM25Retriever\n",
" - es_retriever: InMemoryBM25Retriever\n",
" - joiner: DocumentJoiner\n",
" - prompt_builder: ChatPromptBuilder\n",
" - llm: OpenAIChatGenerator\n",
"🛤️ Connections\n",
" - router.en -> en_retriever.query (str)\n",
" - router.fr -> fr_retriever.query (str)\n",
" - router.es -> es_retriever.query (str)\n",
" - en_retriever.documents -> joiner.documents (List[Document])\n",
" - fr_retriever.documents -> joiner.documents (List[Document])\n",
" - es_retriever.documents -> joiner.documents (List[Document])\n",
" - joiner.documents -> prompt_builder.documents (List[Document])\n",
" - en_retriever.documents -> prompt_builder.documents (List[Document])\n",
" - fr_retriever.documents -> prompt_builder.documents (List[Document])\n",
" - es_retriever.documents -> prompt_builder.documents (List[Document])\n",
" - prompt_builder.prompt -> llm.messages (List[ChatMessage])"
]
},
Expand All @@ -466,18 +462,16 @@
"rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=en_document_store), name=\"en_retriever\")\n",
"rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=fr_document_store), name=\"fr_retriever\")\n",
"rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=es_document_store), name=\"es_retriever\")\n",
"rag_pipeline.add_component(instance=DocumentJoiner(), name=\"joiner\")\n",
"rag_pipeline.add_component(instance=ChatPromptBuilder(template=prompt_template), name=\"prompt_builder\")\n",
"rag_pipeline.add_component(instance=OpenAIChatGenerator(), name=\"llm\")\n",
"\n",
"\n",
"rag_pipeline.connect(\"router.en\", \"en_retriever.query\")\n",
"rag_pipeline.connect(\"router.fr\", \"fr_retriever.query\")\n",
"rag_pipeline.connect(\"router.es\", \"es_retriever.query\")\n",
"rag_pipeline.connect(\"en_retriever\", \"joiner\")\n",
"rag_pipeline.connect(\"fr_retriever\", \"joiner\")\n",
"rag_pipeline.connect(\"es_retriever\", \"joiner\")\n",
"rag_pipeline.connect(\"joiner.documents\", \"prompt_builder.documents\")\n",
"rag_pipeline.connect(\"en_retriever\", \"prompt_builder.documents\")\n",
"rag_pipeline.connect(\"fr_retriever\", \"prompt_builder.documents\")\n",
"rag_pipeline.connect(\"es_retriever\", \"prompt_builder.documents\")\n",
"rag_pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")"
]
},
Expand Down
4 changes: 2 additions & 2 deletions tutorials/33_Hybrid_Retrieval.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"\n",
"- **Level**: Intermediate\n",
"- **Time to complete**: 15 minutes\n",
"- **Components Used**: [`DocumentSplitter`](https://docs.haystack.deepset.ai/docs/documentsplitter), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), and [`TransformersSimilarityRanker`](https://docs.haystack.deepset.ai/docs/transformerssimilarityranker)\n",
"- **Components Used**: [`DocumentSplitter`](https://docs.haystack.deepset.ai/docs/documentsplitter), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), and [`TransformersSimilarityRanker`](https://docs.haystack.deepset.ai/docs/transformerssimilarityranker)\n",
"- **Prerequisites**: None\n",
"- **Goal**: After completing this tutorial, you will have learned about creating a hybrid retrieval and when it's useful."
]
Expand Down Expand Up @@ -224,7 +224,7 @@
"embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n",
"bm25_retriever = InMemoryBM25Retriever(document_store)"
]
},
},
{
"cell_type": "markdown",
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2180,13 +2180,11 @@
"source": [
"from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n",
"from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
"from haystack.components.joiners import DocumentJoiner\n",
"\n",
"text_router = TransformersTextRouter(model=\"shahrukhx01/bert-mini-finetune-question-detection\")\n",
"text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n",
"bm25_retriever = InMemoryBM25Retriever(document_store)\n",
"document_joiner = DocumentJoiner()"
"bm25_retriever = InMemoryBM25Retriever(document_store)"
]
},
{
Expand Down Expand Up @@ -2220,13 +2218,10 @@
" - text_embedder: SentenceTransformersTextEmbedder\n",
" - embedding_retriever: InMemoryEmbeddingRetriever\n",
" - bm25_retriever: InMemoryBM25Retriever\n",
" - document_joiner: DocumentJoiner\n",
"🛤️ Connections\n",
" - text_router.LABEL_0 -> text_embedder.text (str)\n",
" - text_router.LABEL_1 -> bm25_retriever.query (str)\n",
" - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])\n",
" - embedding_retriever.documents -> document_joiner.documents (List[Document])\n",
" - bm25_retriever.documents -> document_joiner.documents (List[Document])"
" - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])"
]
},
"execution_count": 20,
Expand All @@ -2242,13 +2237,10 @@
"query_classification_pipeline.add_component(\"text_embedder\", text_embedder)\n",
"query_classification_pipeline.add_component(\"embedding_retriever\", embedding_retriever)\n",
"query_classification_pipeline.add_component(\"bm25_retriever\", bm25_retriever)\n",
"query_classification_pipeline.add_component(\"document_joiner\", document_joiner)\n",
"\n",
"query_classification_pipeline.connect(\"text_router.LABEL_0\", \"text_embedder\")\n",
"query_classification_pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n",
"query_classification_pipeline.connect(\"text_router.LABEL_1\", \"bm25_retriever\")\n",
"query_classification_pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n",
"query_classification_pipeline.connect(\"embedding_retriever\", \"document_joiner\")"
"query_classification_pipeline.connect(\"text_router.LABEL_1\", \"bm25_retriever\")"
]
},
{
Expand Down
23 changes: 3 additions & 20 deletions tutorials/43_Building_a_Tool_Calling_Agent.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@
"source": [
"## Using Agent with a Pipeline as Tool\n",
"\n",
"Now, for a more sophisticated example, let's build a research assistant that can search the web, fetch content from links, and generate comprehensive answers. In contrast to our previous Agent, we now want to follow the links on the search engine results page, access their content and parse their content through [OutputAdapter](https://docs.haystack.deepset.ai/docs/outputadapter). We'll start with a Haystack Pipeline that the Agent can use as a tool:"
"Now, for a more sophisticated example, let's build a research assistant that can search the web, fetch content from links, and generate comprehensive answers. In contrast to our previous Agent, we now want to follow the links on the search engine results page and access their content. We'll start with a Haystack Pipeline that the Agent can use as a tool:"
]
},
{
Expand All @@ -205,7 +205,6 @@
"source": [
"from haystack.components.builders.answer_builder import AnswerBuilder\n",
"from haystack.components.converters.html import HTMLToDocument\n",
"from haystack.components.converters.output_adapter import OutputAdapter\n",
"from haystack.components.fetchers.link_content import LinkContentFetcher\n",
"from haystack.components.websearch.serper_dev import SerperDevWebSearch\n",
"from haystack.dataclasses import ChatMessage\n",
Expand All @@ -216,25 +215,9 @@
"search_pipeline.add_component(\"search\", SerperDevWebSearch(top_k=10))\n",
"search_pipeline.add_component(\"fetcher\", LinkContentFetcher(timeout=3, raise_on_failure=False, retry_attempts=2))\n",
"search_pipeline.add_component(\"converter\", HTMLToDocument())\n",
"search_pipeline.add_component(\n",
" \"output_adapter\",\n",
" OutputAdapter(\n",
" template=\"\"\"\n",
"{%- for doc in docs -%}\n",
" {%- if doc.content -%}\n",
" <search-result url=\"{{ doc.meta.url }}\">\n",
" {{ doc.content|truncate(25000) }}\n",
" </search-result>\n",
" {%- endif -%}\n",
"{%- endfor -%}\n",
"\"\"\",\n",
" output_type=str,\n",
" ),\n",
")\n",
"\n",
"search_pipeline.connect(\"search.links\", \"fetcher.urls\")\n",
"search_pipeline.connect(\"fetcher.streams\", \"converter.sources\")\n",
"search_pipeline.connect(\"converter.documents\", \"output_adapter.docs\")"
"search_pipeline.connect(\"fetcher.streams\", \"converter.sources\")"
]
},
{
Expand Down Expand Up @@ -270,7 +253,7 @@
"search_component = SuperComponent(\n",
" pipeline=search_pipeline,\n",
" input_mapping={\"query\": [\"search.query\"]},\n",
" output_mapping={\"output_adapter.output\": \"search_result\"},\n",
" output_mapping={\"converter.documents\": \"search_result\"},\n",
")\n",
"\n",
"search_tool = ComponentTool(\n",
Expand Down
Loading