Compare commits

...

7 Commits

Author SHA1 Message Date
leehuwuj c3215ccc7b better log 2024-05-02 15:23:06 +07:00
leehuwuj 18ca18123f split code to run_ingestion_pipeline and persist_storage 2024-05-02 15:18:40 +07:00
leehuwuj 5ecb0c9fb7 update comments and remove stores_index 2024-05-02 14:15:56 +07:00
leehuwuj 7e45f604e6 Fix dimensions typo in settings.py 2024-05-02 10:45:58 +07:00
leehuwuj bbacf0f199 refactor code and comments 2024-05-02 10:43:54 +07:00
leehuwuj c0c6df80c7 fix redundant stashed code 2024-05-02 09:25:05 +07:00
leehuwuj 3b39a12ad6 Refactor code to persist the docstore and index in the SimpleVectorStore case 2024-05-02 08:50:09 +07:00
4 changed files with 64 additions and 21 deletions
@@ -6,8 +6,11 @@ from app.constants import STORAGE_DIR
def get_vector_store():
if not os.path.exists(STORAGE_DIR):
# Vector store hasn't been persisted before, create a new one
vector_store = SimpleVectorStore()
else:
vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR)
vector_store.stores_text = True
# Vector store has already been persisted before at STORAGE_DIR - load it
vector_store = SimpleVectorStore.from_persist_dir(
STORAGE_DIR, namespace="default"
)
return vector_store
@@ -9,6 +9,8 @@ from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage import StorageContext
from llama_index.core import VectorStoreIndex
from app.constants import STORAGE_DIR
from app.settings import init_settings
from app.engine.loaders import get_documents
@@ -27,15 +29,7 @@ def get_doc_store():
return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
def generate_datasource():
init_settings()
logger.info("Creating new index")
# load the documents and create the index
documents = get_documents()
docstore = get_doc_store()
vector_store = get_vector_store()
def run_ingestion_pipeline(docstore, vector_store, documents):
# Create ingestion pipeline
ingestion_pipeline = IngestionPipeline(
transformations=[
@@ -54,16 +48,48 @@ def generate_datasource():
ingestion_pipeline.vector_store = vector_store
# Run the ingestion pipeline and store the results
ingestion_pipeline.run(show_progress=True, documents=documents)
nodes = ingestion_pipeline.run(show_progress=True, documents=documents)
# Default vector store only keeps data in memory, so we need to persist it
# Can remove if using a different vector store
return nodes
def persist_storage(docstore, vector_store, nodes):
storage_context = StorageContext.from_defaults(
docstore=docstore,
vector_store=vector_store,
)
# SimpleVectorStore does not include index by default
# so we need to create the index manually
# can be removed if using other vector store
if isinstance(vector_store, SimpleVectorStore):
vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
# Persist the docstore to apply ingestion strategy
docstore.persist(os.path.join(STORAGE_DIR, "docstore.json"))
VectorStoreIndex(
nodes=nodes,
storage_context=storage_context,
store_nodes_override=True, # Need enable this to store the nodes and index's id
)
storage_context.persist(STORAGE_DIR)
logger.info("Finished creating new index.")
def generate_datasource():
init_settings()
logger.info("Generate index for the provided data")
# Get the stores and documents or create new ones
documents = get_documents()
docstore = get_doc_store()
vector_store = get_vector_store()
# Run the ingestion pipeline
nodes = run_ingestion_pipeline(
docstore=docstore,
vector_store=vector_store,
documents=documents,
)
# Build the index and persist storage
persist_storage(docstore, vector_store, nodes)
logger.info("Finished generating the index")
if __name__ == "__main__":
@@ -1,5 +1,9 @@
import logging
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.core.indices.vector_store import VectorStoreIndex
from llama_index.core.vector_stores.simple import SimpleVectorStore
from app.constants import STORAGE_DIR
from app.engine.vectordb import get_vector_store
logger = logging.getLogger("uvicorn")
@@ -8,6 +12,16 @@ logger = logging.getLogger("uvicorn")
def get_index():
logger.info("Loading the index...")
store = get_vector_store()
index = VectorStoreIndex.from_vector_store(store)
# If the store is a SimpleVectorStore, we need to load the index from the storage
if isinstance(store, SimpleVectorStore):
index = load_index_from_storage(
StorageContext.from_defaults(
vector_store=store,
persist_dir=STORAGE_DIR,
)
)
else:
index = VectorStoreIndex.from_vector_store(store)
logger.info("Loaded index successfully.")
return index
@@ -36,9 +36,9 @@ def init_openai():
}
Settings.llm = OpenAI(**config)
dimension = os.getenv("EMBEDDING_DIM")
dimensions = os.getenv("EMBEDDING_DIM")
config = {
"model": os.getenv("EMBEDDING_MODEL"),
"dimension": int(dimension) if dimension is not None else None,
"dimensions": int(dimensions) if dimensions is not None else None,
}
Settings.embed_model = OpenAIEmbedding(**config)