add part 5 materials

This commit is contained in:
Logan Markewich
2023-09-11 15:58:11 -06:00
parent aadd139d99
commit aee003cf61
5 changed files with 1407 additions and 140 deletions
File diff suppressed because it is too large Load Diff
+45
View File
@@ -0,0 +1,45 @@
import os
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from indexing import get_query_engine_tool
# Configure the LLM and the embedding model once, then register them globally.
llm = OpenAI(model="gpt-3.5-turbo-16k", temperature=0, max_tokens=512)
# Alternative embedding setup: embed_model = OpenAIEmbedding(embed_batch_size=50)
embed_model = "local:BAAI/bge-base-en"  # embeddings come from a local model
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
set_global_service_context(service_context)

# Maps each docs folder to the description passed to get_query_engine_tool.
docs_directories = {
    "../docs/community": "Useful for information on community integrations with other libraries, vector dbs, and frameworks.",
    "../docs/core_modules/agent_modules": "Useful for information on data agents and tools for data agents.",
    "../docs/core_modules/data_modules": "Useful for information on data, storage, indexing, and data processing modules.",
    "../docs/core_modules/model_modules": "Useful for information on LLMs, embedding models, and prompts.",
    "../docs/core_modules/query_modules": "Useful for information on various query engines and retrievers, and anything related to querying data.",
    "../docs/core_modules/supporting_modules": "Useful for information on supporting modules, like callbacks, evaluators, and other supporting modules.",
    "../docs/getting_started": "Useful for information on getting started with LlamaIndex.",
    "../docs/development": "Useful for information on contributing to LlamaIndex development.",
}

# One tool per docs folder, created in the order the folders are listed above.
query_engine_tools = [
    get_query_engine_tool(docs_path, summary)
    for docs_path, summary in docs_directories.items()
]

# Router over all tools. select_multi=True is forwarded as-is
# (presumably lets the router pick several tools per query -- confirm in the
# RouterQueryEngine documentation).
query_engine = RouterQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    select_multi=True,
    service_context=service_context,
)
# Interactive loop: read a question, route it through the query engine and
# print the answer. Runs until the user closes stdin or interrupts the prompt.
while True:
    try:
        input_text = input("Enter a query: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
        print()
        break
    if not input_text:
        # A blank line would otherwise send only the instruction suffix below
        # to the query engine; ask again instead.
        continue
    input_text += "\nInclude relevant links from the context when it makes sense."
    response = query_engine.query(input_text)
    print(str(response))
    print("\n")
+28
View File
@@ -0,0 +1,28 @@
from typing import Callable, Optional
from llama_index.bridge.pydantic import PrivateAttr
from llama_index.indices.postprocessor.types import BaseNodePostprocessor
from llama_index.utils import globals_helper
from llama_index.schema import MetadataMode
class LimitRetrievedNodesLength(BaseNodePostprocessor):
    """Node postprocessor that keeps nodes, in order, up to a length budget.

    The length of a node is ``len(tokenizer(content))`` where ``content`` is
    the node text rendered with ``MetadataMode.LLM``. Lengths are accumulated
    in the order the nodes are given.
    """

    # Maximum accumulated tokenizer length of the nodes that are kept.
    limit: int = 3000
    # Callable applied to each node's content; only ``len()`` of its result is used.
    _tokenizer: Callable = PrivateAttr()

    def __init__(self, limit: int = 3000, tokenizer: Optional[Callable] = None):
        """Create the postprocessor.

        Args:
            limit: Maximum accumulated length of the kept nodes.
            tokenizer: Callable used to measure node content. Falls back to
                ``globals_helper.tokenizer`` when not given (or falsy).
        """
        self._tokenizer = tokenizer or globals_helper.tokenizer
        super().__init__(
            limit=limit,
        )

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Return the leading nodes whose accumulated length fits ``limit``.

        Args:
            nodes: Scored nodes to filter; each item must expose ``.node``
                with a ``get_content(metadata_mode=...)`` method.
            query_bundle: Unused here. Defaults to ``None`` so the method can
                also be called with the nodes alone.

        Returns:
            A new list with the nodes that fit, in their original order.
            Iteration stops at the first node that pushes the running total
            above ``limit``; that node and every node after it are dropped,
            even if a later node would still fit.
        """
        included_nodes = []
        current_length = 0
        for node in nodes:
            content = node.node.get_content(metadata_mode=MetadataMode.LLM)
            current_length += len(self._tokenizer(content))
            if current_length > self.limit:
                break
            included_nodes.append(node)
        return included_nodes
+52 -140
View File
@@ -1,20 +1,23 @@
import os
import nest_asyncio
nest_asyncio.apply()
from .markdown_docs_reader import MarkdownDocsReader
from llama_index import (
SimpleDirectoryReader,
VectorStoreIndex,
ServiceContext,
VectorStoreIndex,
StorageContext,
load_index_from_storage,
set_global_service_context
load_index_from_storage
)
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.tools import QueryEngineTool
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.node_parser import HierarchicalNodeParser, get_leaf_nodes
from llama_index.retrievers import AutoMergingRetriever
from llama_index.schema import Document, MetadataMode
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.tools import QueryEngineTool, ToolMetadata
# load documents
def load_markdown_docs(filepath):
"""Load markdown docs from a directory, excluding all other file types."""
loader = SimpleDirectoryReader(
@@ -26,149 +29,58 @@ def load_markdown_docs(filepath):
documents = loader.load_data()
# exclude some metadata from the LLM
for doc in documents:
doc.excluded_llm_metadata_keys = ["File Name", "Content Type", "Header Path"]
# combine all documents into one
documents = [
Document(text="\n\n".join(
document.get_content(metadata_mode=MetadataMode.ALL)
for document in documents
)
)
]
return documents
def load_docs():
getting_started_docs = load_markdown_docs("../docs/getting_started")
community_docs = load_markdown_docs("../docs/community")
data_docs = load_markdown_docs("../docs/core_modules/data_modules")
agent_docs = load_markdown_docs("../docs/core_modules/agent_modules")
model_docs = load_markdown_docs("../docs/core_modules/model_modules")
query_docs = load_markdown_docs("../docs/core_modules/query_modules")
supporting_docs = load_markdown_docs("../docs/core_modules/supporting_modules")
tutorials_docs = load_markdown_docs("../docs/end_to_end_tutorials")
contributing_docs = load_markdown_docs("../docs/development")
return (
getting_started_docs,
community_docs,
data_docs,
agent_docs,
model_docs,
query_docs,
supporting_docs,
tutorials_docs,
contributing_docs,
# chunk into 3 levels
# majority means 2/3 are retrieved before using the parent
large_chunk_size = 1536
node_parser = HierarchicalNodeParser.from_defaults(
chunk_sizes=[
large_chunk_size,
large_chunk_size // 3,
]
)
nodes = node_parser.get_nodes_from_documents(documents)
return nodes, get_leaf_nodes(nodes)
def create_query_engine():
"""Create a query engine."""
getting_started_docs, community_docs, data_docs, agent_docs, model_docs, query_docs, supporting_docs, tutorials_docs, contributing_docs = load_docs()
def get_query_engine_tool(directory, description, postprocessors=None):
try:
getting_started_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./getting_started_index"))
community_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./community_index"))
data_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./data_index"))
agent_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./agent_index"))
model_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./model_index"))
query_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./query_index"))
supporting_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./supporting_index"))
tutorials_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./tutorials_index"))
contributing_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./contributing_index"))
except Exception:
getting_started_index = VectorStoreIndex.from_documents(getting_started_docs)
getting_started_index.storage_context.persist(persist_dir="./getting_started_index")
storage_context = StorageContext.from_defaults(
persist_dir=f"./data_{os.path.basename(directory)}"
)
index = load_index_from_storage(storage_context)
community_index = VectorStoreIndex.from_documents(community_docs)
community_index.storage_context.persist(persist_dir="./community_index")
retriever = AutoMergingRetriever(
index.as_retriever(similarity_top_k=12),
storage_context=storage_context
)
except:
nodes, leaf_nodes = load_markdown_docs(directory)
data_index = VectorStoreIndex.from_documents(data_docs)
data_index.storage_context.persist(persist_dir="./data_index")
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)
agent_index = VectorStoreIndex.from_documents(agent_docs)
agent_index.storage_context.persist(persist_dir="./agent_index")
index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
index.storage_context.persist(persist_dir=f"./data_{os.path.basename(directory)}")
model_index = VectorStoreIndex.from_documents(model_docs)
model_index.storage_context.persist(persist_dir="./model_index")
retriever = AutoMergingRetriever(
index.as_retriever(similarity_top_k=12),
storage_context=storage_context
)
query_index = VectorStoreIndex.from_documents(query_docs)
query_index.storage_context.persist(persist_dir="./query_index")
supporting_index = VectorStoreIndex.from_documents(supporting_docs)
supporting_index.storage_context.persist(persist_dir="./supporting_index")
tutorials_index = VectorStoreIndex.from_documents(tutorials_docs)
tutorials_index.storage_context.persist(persist_dir="./tutorials_index")
contributing_index = VectorStoreIndex.from_documents(contributing_docs)
contributing_index.storage_context.persist(persist_dir="./contributing_index")
# create a query engine tool for each folder
getting_started_tool = QueryEngineTool.from_defaults(
query_engine=getting_started_index.as_query_engine(),
name="Getting Started",
description="Useful for answering questions about installing and running llama index, as well as basic explanations of how llama index works."
query_engine = RetrieverQueryEngine.from_args(
retriever,
node_postprocessors=postprocessors or [],
)
community_tool = QueryEngineTool.from_defaults(
query_engine=community_index.as_query_engine(),
name="Community",
description="Useful for answering questions about integrations and other apps built by the community."
)
data_tool = QueryEngineTool.from_defaults(
query_engine=data_index.as_query_engine(),
name="Data Modules",
description="Useful for answering questions about data loaders, documents, nodes, and index structures."
)
agent_tool = QueryEngineTool.from_defaults(
query_engine=agent_index.as_query_engine(),
name="Agent Modules",
description="Useful for answering questions about data agents, agent configurations, and tools."
)
model_tool = QueryEngineTool.from_defaults(
query_engine=model_index.as_query_engine(),
name="Model Modules",
description="Useful for answering questions about using and configuring LLMs, embedding modles, and prompts."
)
query_tool = QueryEngineTool.from_defaults(
query_engine=query_index.as_query_engine(),
name="Query Modules",
description="Useful for answering questions about query engines, query configurations, and using various parts of the query engine pipeline."
)
supporting_tool = QueryEngineTool.from_defaults(
query_engine=supporting_index.as_query_engine(),
name="Supporting Modules",
description="Useful for answering questions about supporting modules, such as callbacks, service context, and avaluation."
)
tutorials_tool = QueryEngineTool.from_defaults(
query_engine=tutorials_index.as_query_engine(),
name="Tutorials",
description="Useful for answering questions about end-to-end tutorials and giving examples of specific use-cases."
)
contributing_tool = QueryEngineTool.from_defaults(
query_engine=contributing_index.as_query_engine(),
name="Contributing",
description="Useful for answering questions about contributing to llama index, including how to contribute to the codebase and how to build documentation."
)
query_engine = SubQuestionQueryEngine.from_defaults(
query_engine_tools=[
getting_started_tool,
community_tool,
data_tool,
agent_tool,
model_tool,
query_tool,
supporting_tool,
tutorials_tool,
contributing_tool
],
# enable this for streaming
response_synthesizer=get_response_synthesizer(streaming=True),
verbose=False
)
return query_engine
return QueryEngineTool(query_engine=query_engine, metadata=ToolMetadata(name=directory, description=description))
+14
View File
@@ -45,6 +45,12 @@ class MarkdownDocsReader(BaseReader):
if header_match:
# save the current text
if current_text.strip() != "":
# Collect every markdown link "[text](url)" found in the current section.
# Non-greedy groups stop at the first "]" / ")" so that several links on the
# same line are reported separately instead of being merged into one match.
link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
# Each match is a (text, url) tuple; the URL is emitted as-is (the previous
# format string prefixed it with a stray literal "f").
links = [
    f"(link_text: {link_text}, link_url: {link_url})"
    for link_text, link_url in link_matches
]
markdown_docs.append(
Document(
text=current_text.strip(),
@@ -52,6 +58,7 @@ class MarkdownDocsReader(BaseReader):
"File Name": filename,
"Content Type": "text",
"Header Path": "/".join(header_stack),
"Links": ", ".join(links),
},
)
)
@@ -98,6 +105,12 @@ class MarkdownDocsReader(BaseReader):
)
current_code_block = ""
elif code_match and current_text.strip() != "":
# Collect every markdown link "[text](url)" found in the text preceding the
# code block. Non-greedy groups stop at the first "]" / ")" so that several
# links on the same line are reported separately instead of being merged.
link_matches = re.findall(r"\[(.*?)\]\((.*?)\)", current_text)
# Each match is a (text, url) tuple; the URL is emitted as-is (the previous
# format string prefixed it with a stray literal "f").
links = [
    f"(link_text: {link_text}, link_url: {link_url})"
    for link_text, link_url in link_matches
]
markdown_docs.append(
Document(
text=current_text.strip(),
@@ -105,6 +118,7 @@ class MarkdownDocsReader(BaseReader):
"File Name": filename,
"Content Type": "text",
"Header Path": "/".join(header_stack),
"Links": ", ".join(links),
},
)
)