add part 5 materials

2026-07-01 21:24:03 -04:00 · 2023-09-11 15:58:11 -06:00
parent aadd139d99
commit aee003cf61
5 changed files with 1407 additions and 140 deletions
@@ -0,0 +1,45 @@
+import os
+
+from llama_index import ServiceContext, set_global_service_context
+from llama_index.llms import OpenAI
+from llama_index.query_engine.router_query_engine import RouterQueryEngine
+from indexing import get_query_engine_tool
+
+# Register one LLM and one embedding model as the global service context.
+llm = OpenAI(
+    model="gpt-3.5-turbo-16k",
+    temperature=0,
+    max_tokens=512,
+)
+# Embeddings are computed by a local model; the hosted alternative that was
+# tried here is OpenAIEmbedding(embed_batch_size=50).
+embed_model = "local:BAAI/bge-base-en"
+
+service_context = ServiceContext.from_defaults(
+    embed_model=embed_model,
+    llm=llm,
+)
+set_global_service_context(service_context)
+
+
+# Docs folder -> description of what that folder is useful for.
+# Each entry becomes one query engine tool below.
+docs_directories = {
+    "../docs/community":
+        "Useful for information on community integrations with other libraries, vector dbs, and frameworks.",
+    "../docs/core_modules/agent_modules":
+        "Useful for information on data agents and tools for data agents.",
+    "../docs/core_modules/data_modules":
+        "Useful for information on data, storage, indexing, and data processing modules.",
+    "../docs/core_modules/model_modules":
+        "Useful for information on LLMs, embedding models, and prompts.",
+    "../docs/core_modules/query_modules":
+        "Useful for information on various query engines and retrievers, and anything related to querying data.",
+    "../docs/core_modules/supporting_modules":
+        "Useful for information on supporting modules, like callbacks, evaluators, and other supporting modules.",
+    "../docs/getting_started":
+        "Useful for information on getting started with LlamaIndex.",
+    "../docs/development":
+        "Useful for information on contributing to LlamaIndex development.",
+}
+
+# One tool per (directory, description) pair, in dictionary order.
+query_engine_tools = [
+    get_query_engine_tool(*entry) for entry in docs_directories.items()
+]
+
+
+# Router over all tools; select_multi=True is passed so more than one tool
+# may be selected for a single query.
+query_engine = RouterQueryEngine.from_defaults(
+    service_context=service_context,
+    select_multi=True,
+    query_engine_tools=query_engine_tools,
+)
+
+# Interactive loop: read a query, route it, print the answer.
+while True:
+    try:
+        input_text = input("Enter a query: ").strip()
+    except (EOFError, KeyboardInterrupt):
+        # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
+        print()
+        break
+    if not input_text:
+        # Nothing was typed; don't send the bare link instruction to the LLM.
+        continue
+    input_text += "\nInclude relevant links from the context when it makes sense."
+    response = query_engine.query(input_text)
+    print(str(response))
+    print("\n")
@@ -0,0 +1,28 @@
+from typing import Callable, Optional
+
+from llama_index.bridge.pydantic import PrivateAttr
+from llama_index.indices.postprocessor.types import BaseNodePostprocessor
+from llama_index.utils import globals_helper
+from llama_index.schema import MetadataMode
+
+class LimitRetrievedNodesLength(BaseNodePostprocessor):
+    """Node postprocessor that caps the total token length of retrieved nodes.
+
+    Nodes are considered in the order they arrive. The token count of each
+    node's ``MetadataMode.LLM`` content is added to a running total, and the
+    first node that pushes the total above ``limit`` is dropped together with
+    every node after it.
+    """
+
+    # Maximum combined token count of the nodes that are kept.
+    limit: int = 3000
+    # Callable applied to node text; ``len()`` of its result is the token count.
+    _tokenizer: Callable = PrivateAttr()
+
+    def __init__(self, limit: int = 3000, tokenizer: Optional[Callable] = None):
+        """Create the postprocessor.
+
+        Args:
+            limit: Maximum combined token count of the returned nodes.
+            tokenizer: Callable applied to node text; ``globals_helper.tokenizer``
+                is used when this is omitted (or falsy).
+        """
+        self._tokenizer = tokenizer or globals_helper.tokenizer
+        super().__init__(
+            limit=limit,
+        )
+
+    def postprocess_nodes(self, nodes, query_bundle=None):
+        """Return the leading nodes whose cumulative token count fits ``limit``.
+
+        Args:
+            nodes: Retrieved nodes, in the order they should be considered.
+            query_bundle: Not used by this postprocessor. It defaults to
+                ``None`` so the method can be called with the nodes alone.
+
+        Returns:
+            The prefix of ``nodes`` that fits within ``limit`` tokens.
+        """
+        included_nodes = []
+        current_length = 0
+
+        for node in nodes:
+            current_length += len(self._tokenizer(node.node.get_content(metadata_mode=MetadataMode.LLM)))
+            if current_length > self.limit:
+                # Stop at the first overflow; later nodes are not considered
+                # even if they would individually still fit.
+                break
+            included_nodes.append(node)
+
+        return included_nodes
@@ -1,20 +1,23 @@
+import os
 import nest_asyncio
 nest_asyncio.apply()

 from .markdown_docs_reader import MarkdownDocsReader
 from llama_index import (
    SimpleDirectoryReader, 
-    VectorStoreIndex, 
-    ServiceContext, 
+    VectorStoreIndex,
    StorageContext, 
-    load_index_from_storage, 
-    set_global_service_context
+    load_index_from_storage
 )
-from llama_index.query_engine import SubQuestionQueryEngine
-from llama_index.response_synthesizers import get_response_synthesizer
-from llama_index.tools import QueryEngineTool
+from llama_index.query_engine import RetrieverQueryEngine
+from llama_index.node_parser import HierarchicalNodeParser, get_leaf_nodes
+from llama_index.retrievers import AutoMergingRetriever
+from llama_index.schema import Document, MetadataMode
+from llama_index.storage.docstore import SimpleDocumentStore
+from llama_index.tools import QueryEngineTool, ToolMetadata 


+# load documents
 def load_markdown_docs(filepath):
    """Load markdown docs from a directory, excluding all other file types."""
    loader = SimpleDirectoryReader(
@@ -26,149 +29,58 @@ def load_markdown_docs(filepath):

    documents = loader.load_data()

-    # exclude some metadata from the LLM
-    for doc in documents:
-        doc.excluded_llm_metadata_keys = ["File Name", "Content Type", "Header Path"]
+    # combine all documents into one
+    documents = [
+        Document(text="\n\n".join(
+                document.get_content(metadata_mode=MetadataMode.ALL) 
+                for document in documents
+            )
+        )
+    ]

-    return documents
-
-
-def load_docs():
-    getting_started_docs = load_markdown_docs("../docs/getting_started")
-    community_docs = load_markdown_docs("../docs/community")
-    data_docs = load_markdown_docs("../docs/core_modules/data_modules")
-    agent_docs = load_markdown_docs("../docs/core_modules/agent_modules")
-    model_docs = load_markdown_docs("../docs/core_modules/model_modules")
-    query_docs = load_markdown_docs("../docs/core_modules/query_modules")
-    supporting_docs = load_markdown_docs("../docs/core_modules/supporting_modules")
-    tutorials_docs = load_markdown_docs("../docs/end_to_end_tutorials")
-    contributing_docs = load_markdown_docs("../docs/development")
-
-    return (
-        getting_started_docs,
-        community_docs,
-        data_docs,
-        agent_docs,
-        model_docs,
-        query_docs,
-        supporting_docs,
-        tutorials_docs,
-        contributing_docs,
+    # chunk into 2 levels: large parent chunks, each split into leaf chunks 1/3 the size
+    # majority means 2/3 of a parent's leaves are retrieved before using the parent
+    large_chunk_size = 1536
+    node_parser = HierarchicalNodeParser.from_defaults(
+        chunk_sizes=[
+            large_chunk_size, 
+            large_chunk_size // 3,
+        ]
    )

+    nodes = node_parser.get_nodes_from_documents(documents)
+    return nodes, get_leaf_nodes(nodes)

-def create_query_engine():
-    """Create a query engine."""
-    getting_started_docs, community_docs, data_docs, agent_docs, model_docs, query_docs, supporting_docs, tutorials_docs, contributing_docs = load_docs()

+def get_query_engine_tool(directory, description, postprocessors=None):
+    """Build a query engine tool over the markdown docs in ``directory``.
+
+    The index is loaded from ``./data_<basename of directory>`` when that
+    succeeds; otherwise the docs are parsed, indexed and persisted there.
+    Directories that share a basename therefore share a persist directory.
+
+    Args:
+        directory: Path of the docs folder; also used as the tool name.
+        description: Description stored in the tool metadata.
+        postprocessors: Optional list of node postprocessors for the query
+            engine; an empty list is used when omitted.
+
+    Returns:
+        A ``QueryEngineTool`` wrapping a ``RetrieverQueryEngine`` that
+        retrieves through an ``AutoMergingRetriever``.
+    """
+    persist_dir = f"./data_{os.path.basename(directory)}"
+
     try:
-        getting_started_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./getting_started_index"))
-        community_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./community_index"))
-        data_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./data_index"))
-        agent_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./agent_index"))
-        model_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./model_index"))
-        query_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./query_index"))
-        supporting_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./supporting_index"))
-        tutorials_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./tutorials_index"))
-        contributing_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./contributing_index"))
-    except Exception:
-        getting_started_index = VectorStoreIndex.from_documents(getting_started_docs)
-        getting_started_index.storage_context.persist(persist_dir="./getting_started_index")
+        # Fast path: reuse the index persisted by an earlier run.
+        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
+        index = load_index_from_storage(storage_context)

-        community_index = VectorStoreIndex.from_documents(community_docs)
-        community_index.storage_context.persist(persist_dir="./community_index")
+    except Exception:
+        # Loading failed (e.g. first run, nothing persisted yet): rebuild.
+        # `Exception` rather than a bare `except` so Ctrl-C / SystemExit
+        # are not swallowed and turned into a rebuild.
+        nodes, leaf_nodes = load_markdown_docs(directory)

-        data_index = VectorStoreIndex.from_documents(data_docs)
-        data_index.storage_context.persist(persist_dir="./data_index")
+        # The docstore holds every node (parents included); only the leaf
+        # nodes are passed to the vector index.
+        docstore = SimpleDocumentStore()
+        docstore.add_documents(nodes)
+        storage_context = StorageContext.from_defaults(docstore=docstore)

-        agent_index = VectorStoreIndex.from_documents(agent_docs)
-        agent_index.storage_context.persist(persist_dir="./agent_index")
+        index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
+        index.storage_context.persist(persist_dir=persist_dir)

-        model_index = VectorStoreIndex.from_documents(model_docs)
-        model_index.storage_context.persist(persist_dir="./model_index")
+    # Both branches leave `index` and `storage_context` set, so the
+    # retriever is built once here instead of once per branch.
+    retriever = AutoMergingRetriever(
+        index.as_retriever(similarity_top_k=12),
+        storage_context=storage_context,
+    )

-        query_index = VectorStoreIndex.from_documents(query_docs)
-        query_index.storage_context.persist(persist_dir="./query_index")    
-
-        supporting_index = VectorStoreIndex.from_documents(supporting_docs)
-        supporting_index.storage_context.persist(persist_dir="./supporting_index")
-
-        tutorials_index = VectorStoreIndex.from_documents(tutorials_docs)
-        tutorials_index.storage_context.persist(persist_dir="./tutorials_index")
-
-        contributing_index = VectorStoreIndex.from_documents(contributing_docs)
-        contributing_index.storage_context.persist(persist_dir="./contributing_index")
-
-    # create a query engine tool for each folder
-    getting_started_tool = QueryEngineTool.from_defaults(
-        query_engine=getting_started_index.as_query_engine(), 
-        name="Getting Started", 
-        description="Useful for answering questions about installing and running llama index, as well as basic explanations of how llama index works."
+    query_engine = RetrieverQueryEngine.from_args(
+        retriever,
+        node_postprocessors=postprocessors or [],
     )

-    community_tool = QueryEngineTool.from_defaults(
-        query_engine=community_index.as_query_engine(),
-        name="Community",
-        description="Useful for answering questions about integrations and other apps built by the community."
-    )
-
-    data_tool = QueryEngineTool.from_defaults(
-        query_engine=data_index.as_query_engine(),
-        name="Data Modules",
-        description="Useful for answering questions about data loaders, documents, nodes, and index structures."
-    )
-
-    agent_tool = QueryEngineTool.from_defaults(
-        query_engine=agent_index.as_query_engine(),
-        name="Agent Modules",
-        description="Useful for answering questions about data agents, agent configurations, and tools."
-    )
-
-    model_tool = QueryEngineTool.from_defaults(
-        query_engine=model_index.as_query_engine(),
-        name="Model Modules",
-        description="Useful for answering questions about using and configuring LLMs, embedding modles, and prompts."
-    )
-
-    query_tool = QueryEngineTool.from_defaults(
-        query_engine=query_index.as_query_engine(),
-        name="Query Modules",
-        description="Useful for answering questions about query engines, query configurations, and using various parts of the query engine pipeline."
-    )
-
-    supporting_tool = QueryEngineTool.from_defaults(
-        query_engine=supporting_index.as_query_engine(),
-        name="Supporting Modules",
-        description="Useful for answering questions about supporting modules, such as callbacks, service context, and avaluation."
-    )
-
-    tutorials_tool = QueryEngineTool.from_defaults(
-        query_engine=tutorials_index.as_query_engine(),
-        name="Tutorials",
-        description="Useful for answering questions about end-to-end tutorials and giving examples of specific use-cases."
-    )
-
-    contributing_tool = QueryEngineTool.from_defaults(
-        query_engine=contributing_index.as_query_engine(),
-        name="Contributing",
-        description="Useful for answering questions about contributing to llama index, including how to contribute to the codebase and how to build documentation."
-    )
-
-    query_engine = SubQuestionQueryEngine.from_defaults(
-        query_engine_tools=[
-            getting_started_tool,
-            community_tool,
-            data_tool,
-            agent_tool,
-            model_tool,
-            query_tool,
-            supporting_tool,
-            tutorials_tool,
-            contributing_tool
-        ],
-        # enable this for streaming
-        response_synthesizer=get_response_synthesizer(streaming=True),
-        verbose=False
-    )
-
-    return query_engine
+    return QueryEngineTool(
+        query_engine=query_engine,
+        metadata=ToolMetadata(name=directory, description=description),
+    )
@@ -45,6 +45,12 @@ class MarkdownDocsReader(BaseReader):
            if header_match:
                # save the current text
                if current_text.strip() != "":
+                    # Collect markdown links "[text](url)" found in this section.
+                    # Non-greedy groups keep several links on one line separate
+                    # instead of merging them into a single match.
+                    link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
+                    links = [
+                        f"(link_text: {link_text}, link_url: {link_url})"
+                        for link_text, link_url in link_matches
+                    ]
                    markdown_docs.append(
                        Document(
                            text=current_text.strip(),
@@ -52,6 +58,7 @@ class MarkdownDocsReader(BaseReader):
                                "File Name": filename, 
                                "Content Type": "text",
                                "Header Path": "/".join(header_stack),
+                                "Links": ", ".join(links),
                            },
                        )
                    )
@@ -98,6 +105,12 @@ class MarkdownDocsReader(BaseReader):
                        )
                    current_code_block = ""
                elif code_match and current_text.strip() != "":
+                    # Collect markdown links "[text](url)" found in the text
+                    # preceding this code block. Non-greedy groups keep several
+                    # links on one line separate instead of merging them.
+                    link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
+                    links = [
+                        f"(link_text: {link_text}, link_url: {link_url})"
+                        for link_text, link_url in link_matches
+                    ]
                    markdown_docs.append(
                        Document(
                            text=current_text.strip(),
@@ -105,6 +118,7 @@ class MarkdownDocsReader(BaseReader):
                                "File Name": filename, 
                                "Content Type": "text",
                                "Header Path": "/".join(header_stack),
+                                "Links": ", ".join(links),
                            },
                        )
                    )