mirror of
https://github.com/run-llama/llama_docs_bot.git
synced 2026-07-01 21:24:03 -04:00
add part 5 materials
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,45 @@
|
||||
import os
|
||||
|
||||
from llama_index import ServiceContext, set_global_service_context
|
||||
from llama_index.llms import OpenAI
|
||||
from llama_index.query_engine.router_query_engine import RouterQueryEngine
|
||||
from indexing import get_query_engine_tool
|
||||
|
||||
# Set up a global service context so every index and query engine built
# below uses the same LLM and embedding model.
llm = OpenAI(model="gpt-3.5-turbo-16k", temperature=0, max_tokens=512)
# embed_model = OpenAIEmbedding(embed_batch_size=50)
embed_model = "local:BAAI/bge-base-en"  # use a local model for embeddings

service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
set_global_service_context(service_context)

# Maps each docs directory to the description handed to
# get_query_engine_tool() for that directory.
docs_directories = {
    "../docs/community": "Useful for information on community integrations with other libraries, vector dbs, and frameworks.",
    "../docs/core_modules/agent_modules": "Useful for information on data agents and tools for data agents.",
    "../docs/core_modules/data_modules": "Useful for information on data, storage, indexing, and data processing modules.",
    "../docs/core_modules/model_modules": "Useful for information on LLMs, embedding models, and prompts.",
    "../docs/core_modules/query_modules": "Useful for information on various query engines and retrievers, and anything related to querying data.",
    "../docs/core_modules/supporting_modules": "Useful for information on supporting modules, like callbacks, evaluators, and other supporting modules.",
    "../docs/getting_started": "Useful for information on getting started with LlamaIndex.",
    "../docs/development": "Useful for information on contributing to LlamaIndex development.",
}

# Build one query engine tool per docs directory.
query_engine_tools = [
    get_query_engine_tool(directory, description)
    for directory, description in docs_directories.items()
]

# Route each query across the per-directory tools.
# NOTE(review): select_multi=True presumably lets the router pick more than
# one tool per query -- confirm against the RouterQueryEngine docs.
query_engine = RouterQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    select_multi=True,
    service_context=service_context,
)

# Interactive loop: read a query, run it, print the response.
while True:
    try:
        input_text = input("Enter a query: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
        print()
        break

    if not input_text:
        # Skip blank lines; otherwise the engine would be queried with
        # nothing but the instruction suffix below.
        continue

    input_text += "\nInclude relevant links from the context when it makes sense."
    response = query_engine.query(input_text)
    print(str(response))
    print("\n")
|
||||
@@ -0,0 +1,28 @@
|
||||
from typing import Callable, Optional
|
||||
|
||||
from llama_index.bridge.pydantic import PrivateAttr
|
||||
from llama_index.indices.postprocessor.types import BaseNodePostprocessor
|
||||
from llama_index.utils import globals_helper
|
||||
from llama_index.schema import MetadataMode
|
||||
|
||||
class LimitRetrievedNodesLength(BaseNodePostprocessor):
    """Node postprocessor that caps the total token length of retrieved nodes.

    Nodes are kept in the order they are given. As soon as the running
    token count exceeds ``limit``, the node that crossed the limit and all
    nodes after it are dropped.
    """

    # Maximum cumulative number of tokens allowed across the kept nodes.
    limit: int = 3000
    # Callable used to tokenize node text; only the length of its result is used.
    _tokenizer: Callable = PrivateAttr()

    def __init__(self, limit: int = 3000, tokenizer: Optional[Callable] = None):
        """Create the postprocessor.

        Args:
            limit: Maximum cumulative token count of the nodes to keep.
            tokenizer: Callable applied to each node's text; falls back to
                ``globals_helper.tokenizer`` when not given (or falsy).
        """
        self._tokenizer = tokenizer or globals_helper.tokenizer
        super().__init__(
            limit=limit,
        )

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Return the leading nodes whose combined token count fits ``limit``.

        Args:
            nodes: Retrieved nodes, each exposing the underlying node as
                ``.node`` (presumably ``NodeWithScore`` -- confirm against
                the caller).
            query_bundle: Unused. Defaults to ``None`` so the method can be
                called with ``nodes`` alone.

        Returns:
            A new list holding the longest prefix of ``nodes`` whose
            cumulative token count does not exceed ``limit``. The list is
            empty if the first node alone exceeds the limit.
        """
        included_nodes = []
        current_length = 0

        for node in nodes:
            # Count tokens on the text produced for MetadataMode.LLM.
            current_length += len(
                self._tokenizer(node.node.get_content(metadata_mode=MetadataMode.LLM))
            )
            if current_length > self.limit:
                break
            included_nodes.append(node)

        return included_nodes
|
||||
+52
-140
@@ -1,20 +1,23 @@
|
||||
import os
|
||||
import nest_asyncio
|
||||
nest_asyncio.apply()
|
||||
|
||||
from .markdown_docs_reader import MarkdownDocsReader
|
||||
from llama_index import (
|
||||
SimpleDirectoryReader,
|
||||
VectorStoreIndex,
|
||||
ServiceContext,
|
||||
VectorStoreIndex,
|
||||
StorageContext,
|
||||
load_index_from_storage,
|
||||
set_global_service_context
|
||||
load_index_from_storage
|
||||
)
|
||||
from llama_index.query_engine import SubQuestionQueryEngine
|
||||
from llama_index.response_synthesizers import get_response_synthesizer
|
||||
from llama_index.tools import QueryEngineTool
|
||||
from llama_index.query_engine import RetrieverQueryEngine
|
||||
from llama_index.node_parser import HierarchicalNodeParser, get_leaf_nodes
|
||||
from llama_index.retrievers import AutoMergingRetriever
|
||||
from llama_index.schema import Document, MetadataMode
|
||||
from llama_index.storage.docstore import SimpleDocumentStore
|
||||
from llama_index.tools import QueryEngineTool, ToolMetadata
|
||||
|
||||
|
||||
# load documents
|
||||
def load_markdown_docs(filepath):
|
||||
"""Load markdown docs from a directory, excluding all other file types."""
|
||||
loader = SimpleDirectoryReader(
|
||||
@@ -26,149 +29,58 @@ def load_markdown_docs(filepath):
|
||||
|
||||
documents = loader.load_data()
|
||||
|
||||
# exclude some metadata from the LLM
|
||||
for doc in documents:
|
||||
doc.excluded_llm_metadata_keys = ["File Name", "Content Type", "Header Path"]
|
||||
# combine all documents into one
|
||||
documents = [
|
||||
Document(text="\n\n".join(
|
||||
document.get_content(metadata_mode=MetadataMode.ALL)
|
||||
for document in documents
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
return documents
|
||||
|
||||
|
||||
def load_docs():
|
||||
getting_started_docs = load_markdown_docs("../docs/getting_started")
|
||||
community_docs = load_markdown_docs("../docs/community")
|
||||
data_docs = load_markdown_docs("../docs/core_modules/data_modules")
|
||||
agent_docs = load_markdown_docs("../docs/core_modules/agent_modules")
|
||||
model_docs = load_markdown_docs("../docs/core_modules/model_modules")
|
||||
query_docs = load_markdown_docs("../docs/core_modules/query_modules")
|
||||
supporting_docs = load_markdown_docs("../docs/core_modules/supporting_modules")
|
||||
tutorials_docs = load_markdown_docs("../docs/end_to_end_tutorials")
|
||||
contributing_docs = load_markdown_docs("../docs/development")
|
||||
|
||||
return (
|
||||
getting_started_docs,
|
||||
community_docs,
|
||||
data_docs,
|
||||
agent_docs,
|
||||
model_docs,
|
||||
query_docs,
|
||||
supporting_docs,
|
||||
tutorials_docs,
|
||||
contributing_docs,
|
||||
# chunk into 3 levels
|
||||
# majority means 2/3 are retrieved before using the parent
|
||||
large_chunk_size = 1536
|
||||
node_parser = HierarchicalNodeParser.from_defaults(
|
||||
chunk_sizes=[
|
||||
large_chunk_size,
|
||||
large_chunk_size // 3,
|
||||
]
|
||||
)
|
||||
|
||||
nodes = node_parser.get_nodes_from_documents(documents)
|
||||
return nodes, get_leaf_nodes(nodes)
|
||||
|
||||
def create_query_engine():
|
||||
"""Create a query engine."""
|
||||
getting_started_docs, community_docs, data_docs, agent_docs, model_docs, query_docs, supporting_docs, tutorials_docs, contributing_docs = load_docs()
|
||||
|
||||
def get_query_engine_tool(directory, description, postprocessors=None):
|
||||
try:
|
||||
getting_started_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./getting_started_index"))
|
||||
community_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./community_index"))
|
||||
data_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./data_index"))
|
||||
agent_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./agent_index"))
|
||||
model_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./model_index"))
|
||||
query_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./query_index"))
|
||||
supporting_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./supporting_index"))
|
||||
tutorials_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./tutorials_index"))
|
||||
contributing_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./contributing_index"))
|
||||
except Exception:
|
||||
getting_started_index = VectorStoreIndex.from_documents(getting_started_docs)
|
||||
getting_started_index.storage_context.persist(persist_dir="./getting_started_index")
|
||||
storage_context = StorageContext.from_defaults(
|
||||
persist_dir=f"./data_{os.path.basename(directory)}"
|
||||
)
|
||||
index = load_index_from_storage(storage_context)
|
||||
|
||||
community_index = VectorStoreIndex.from_documents(community_docs)
|
||||
community_index.storage_context.persist(persist_dir="./community_index")
|
||||
retriever = AutoMergingRetriever(
|
||||
index.as_retriever(similarity_top_k=12),
|
||||
storage_context=storage_context
|
||||
)
|
||||
except:
|
||||
nodes, leaf_nodes = load_markdown_docs(directory)
|
||||
|
||||
data_index = VectorStoreIndex.from_documents(data_docs)
|
||||
data_index.storage_context.persist(persist_dir="./data_index")
|
||||
docstore = SimpleDocumentStore()
|
||||
docstore.add_documents(nodes)
|
||||
storage_context = StorageContext.from_defaults(docstore=docstore)
|
||||
|
||||
agent_index = VectorStoreIndex.from_documents(agent_docs)
|
||||
agent_index.storage_context.persist(persist_dir="./agent_index")
|
||||
index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
|
||||
index.storage_context.persist(persist_dir=f"./data_{os.path.basename(directory)}")
|
||||
|
||||
model_index = VectorStoreIndex.from_documents(model_docs)
|
||||
model_index.storage_context.persist(persist_dir="./model_index")
|
||||
retriever = AutoMergingRetriever(
|
||||
index.as_retriever(similarity_top_k=12),
|
||||
storage_context=storage_context
|
||||
)
|
||||
|
||||
query_index = VectorStoreIndex.from_documents(query_docs)
|
||||
query_index.storage_context.persist(persist_dir="./query_index")
|
||||
|
||||
supporting_index = VectorStoreIndex.from_documents(supporting_docs)
|
||||
supporting_index.storage_context.persist(persist_dir="./supporting_index")
|
||||
|
||||
tutorials_index = VectorStoreIndex.from_documents(tutorials_docs)
|
||||
tutorials_index.storage_context.persist(persist_dir="./tutorials_index")
|
||||
|
||||
contributing_index = VectorStoreIndex.from_documents(contributing_docs)
|
||||
contributing_index.storage_context.persist(persist_dir="./contributing_index")
|
||||
|
||||
# create a query engine tool for each folder
|
||||
getting_started_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=getting_started_index.as_query_engine(),
|
||||
name="Getting Started",
|
||||
description="Useful for answering questions about installing and running llama index, as well as basic explanations of how llama index works."
|
||||
query_engine = RetrieverQueryEngine.from_args(
|
||||
retriever,
|
||||
node_postprocessors=postprocessors or [],
|
||||
)
|
||||
|
||||
community_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=community_index.as_query_engine(),
|
||||
name="Community",
|
||||
description="Useful for answering questions about integrations and other apps built by the community."
|
||||
)
|
||||
|
||||
data_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=data_index.as_query_engine(),
|
||||
name="Data Modules",
|
||||
description="Useful for answering questions about data loaders, documents, nodes, and index structures."
|
||||
)
|
||||
|
||||
agent_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=agent_index.as_query_engine(),
|
||||
name="Agent Modules",
|
||||
description="Useful for answering questions about data agents, agent configurations, and tools."
|
||||
)
|
||||
|
||||
model_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=model_index.as_query_engine(),
|
||||
name="Model Modules",
|
||||
description="Useful for answering questions about using and configuring LLMs, embedding modles, and prompts."
|
||||
)
|
||||
|
||||
query_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=query_index.as_query_engine(),
|
||||
name="Query Modules",
|
||||
description="Useful for answering questions about query engines, query configurations, and using various parts of the query engine pipeline."
|
||||
)
|
||||
|
||||
supporting_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=supporting_index.as_query_engine(),
|
||||
name="Supporting Modules",
|
||||
description="Useful for answering questions about supporting modules, such as callbacks, service context, and avaluation."
|
||||
)
|
||||
|
||||
tutorials_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=tutorials_index.as_query_engine(),
|
||||
name="Tutorials",
|
||||
description="Useful for answering questions about end-to-end tutorials and giving examples of specific use-cases."
|
||||
)
|
||||
|
||||
contributing_tool = QueryEngineTool.from_defaults(
|
||||
query_engine=contributing_index.as_query_engine(),
|
||||
name="Contributing",
|
||||
description="Useful for answering questions about contributing to llama index, including how to contribute to the codebase and how to build documentation."
|
||||
)
|
||||
|
||||
query_engine = SubQuestionQueryEngine.from_defaults(
|
||||
query_engine_tools=[
|
||||
getting_started_tool,
|
||||
community_tool,
|
||||
data_tool,
|
||||
agent_tool,
|
||||
model_tool,
|
||||
query_tool,
|
||||
supporting_tool,
|
||||
tutorials_tool,
|
||||
contributing_tool
|
||||
],
|
||||
# enable this for streaming
|
||||
response_synthesizer=get_response_synthesizer(streaming=True),
|
||||
verbose=False
|
||||
)
|
||||
|
||||
return query_engine
|
||||
return QueryEngineTool(query_engine=query_engine, metadata=ToolMetadata(name=directory, description=description))
|
||||
|
||||
@@ -45,6 +45,12 @@ class MarkdownDocsReader(BaseReader):
|
||||
if header_match:
|
||||
# save the current text
|
||||
if current_text.strip() != "":
|
||||
# Collect the markdown links in this text chunk for the "Links" metadata.
# Non-greedy capture groups keep each [text](url) pair separate when a
# line holds more than one link; the greedy form swallowed everything
# between the first "[" and the last ")" on the line.
link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
links = [
    # The stray "f" before {link_url} was emitted literally in front of
    # every URL; it is removed here.
    f"(link_text: {link_text}, link_url: {link_url})"
    for link_text, link_url in link_matches
]
|
||||
markdown_docs.append(
|
||||
Document(
|
||||
text=current_text.strip(),
|
||||
@@ -52,6 +58,7 @@ class MarkdownDocsReader(BaseReader):
|
||||
"File Name": filename,
|
||||
"Content Type": "text",
|
||||
"Header Path": "/".join(header_stack),
|
||||
"Links": ", ".join(links),
|
||||
},
|
||||
)
|
||||
)
|
||||
@@ -98,6 +105,12 @@ class MarkdownDocsReader(BaseReader):
|
||||
)
|
||||
current_code_block = ""
|
||||
elif code_match and current_text.strip() != "":
|
||||
# Collect the markdown links in the text that precedes this code block for
# the "Links" metadata. Non-greedy capture groups keep each [text](url)
# pair separate when a line holds more than one link; the greedy form
# swallowed everything between the first "[" and the last ")" on the line.
link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
links = [
    # The stray "f" before {link_url} was emitted literally in front of
    # every URL; it is removed here.
    f"(link_text: {link_text}, link_url: {link_url})"
    for link_text, link_url in link_matches
]
|
||||
markdown_docs.append(
|
||||
Document(
|
||||
text=current_text.strip(),
|
||||
@@ -105,6 +118,7 @@ class MarkdownDocsReader(BaseReader):
|
||||
"File Name": filename,
|
||||
"Content Type": "text",
|
||||
"Header Path": "/".join(header_stack),
|
||||
"Links": ", ".join(links),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user