better log

split code to run_ingestion_pipeline and persist_storage
update comments and remove stores_index
2026-07-02 19:14:28 -04:00 · 2024-05-02 15:23:06 +07:00 · 2024-05-02 15:18:40 +07:00 · 2024-05-02 14:15:56 +07:00 · 2024-05-02 10:45:58 +07:00 · 2024-05-02 10:43:54 +07:00
4 changed files with 64 additions and 21 deletions
@@ -6,8 +6,11 @@ from app.constants import STORAGE_DIR

 def get_vector_store():
    if not os.path.exists(STORAGE_DIR):
+        # Vector store hasn't been persisted before, create a new one
        vector_store = SimpleVectorStore()
    else:
-        vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR)
-    vector_store.stores_text = True
+        # Vector store has already been persisted before at STORAGE_DIR - load it
+        vector_store = SimpleVectorStore.from_persist_dir(
+            STORAGE_DIR, namespace="default"
+        )
    return vector_store
@@ -9,6 +9,8 @@ from llama_index.core.ingestion import IngestionPipeline
 from llama_index.core.node_parser import SentenceSplitter
 from llama_index.core.vector_stores import SimpleVectorStore
 from llama_index.core.storage.docstore import SimpleDocumentStore
+from llama_index.core.storage import StorageContext
+from llama_index.core import VectorStoreIndex
 from app.constants import STORAGE_DIR
 from app.settings import init_settings
 from app.engine.loaders import get_documents
@@ -27,15 +29,7 @@ def get_doc_store():
        return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)


-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-
-    # load the documents and create the index
-    documents = get_documents()
-    docstore = get_doc_store()
-    vector_store = get_vector_store()
-
+def run_ingestion_pipeline(docstore, vector_store, documents):
    # Create ingestion pipeline
    ingestion_pipeline = IngestionPipeline(
        transformations=[
@@ -54,16 +48,48 @@ def generate_datasource():
    ingestion_pipeline.vector_store = vector_store

    # Run the ingestion pipeline and store the results
-    ingestion_pipeline.run(show_progress=True, documents=documents)
+    nodes = ingestion_pipeline.run(show_progress=True, documents=documents)

-    # Default vector store only keeps data in memory, so we need to persist it
-    # Can remove if using a different vector store
+    return nodes
+
+
+def persist_storage(docstore, vector_store, nodes):
+    storage_context = StorageContext.from_defaults(
+        docstore=docstore,
+        vector_store=vector_store,
+    )
+    # SimpleVectorStore does not include an index by default,
+    # so we need to create the index manually.
+    # This can be removed if using another vector store.
    if isinstance(vector_store, SimpleVectorStore):
-        vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
-    # Persist the docstore to apply ingestion strategy
-    docstore.persist(os.path.join(STORAGE_DIR, "docstore.json"))
+        VectorStoreIndex(
+            nodes=nodes,
+            storage_context=storage_context,
+            store_nodes_override=True,  # Must be enabled to store the nodes and the index's id
+        )
+    storage_context.persist(STORAGE_DIR)

-    logger.info("Finished creating new index.")
+
+def generate_datasource():
+    init_settings()
+    logger.info("Generate index for the provided data")
+
+    # Get the stores and documents or create new ones
+    documents = get_documents()
+    docstore = get_doc_store()
+    vector_store = get_vector_store()
+
+    # Run the ingestion pipeline
+    nodes = run_ingestion_pipeline(
+        docstore=docstore,
+        vector_store=vector_store,
+        documents=documents,
+    )
+
+    # Build the index and persist storage
+    persist_storage(docstore, vector_store, nodes)
+
+    logger.info("Finished generating the index")


 if __name__ == "__main__":
@@ -1,5 +1,9 @@
 import logging
+from llama_index.core import load_index_from_storage
+from llama_index.core.storage import StorageContext
 from llama_index.core.indices.vector_store import VectorStoreIndex
+from llama_index.core.vector_stores.simple import SimpleVectorStore
+from app.constants import STORAGE_DIR
 from app.engine.vectordb import get_vector_store

 logger = logging.getLogger("uvicorn")
@@ -8,6 +12,16 @@ logger = logging.getLogger("uvicorn")
 def get_index():
    logger.info("Loading the index...")
    store = get_vector_store()
-    index = VectorStoreIndex.from_vector_store(store)
+    # If the store is a SimpleVectorStore, we need to load the index from the storage
+    if isinstance(store, SimpleVectorStore):
+        index = load_index_from_storage(
+            StorageContext.from_defaults(
+                vector_store=store,
+                persist_dir=STORAGE_DIR,
+            )
+        )
+    else:
+        index = VectorStoreIndex.from_vector_store(store)
+
    logger.info("Loaded index successfully.")
    return index
@@ -36,9 +36,9 @@ def init_openai():
    }
    Settings.llm = OpenAI(**config)

-    dimension = os.getenv("EMBEDDING_DIM")
+    dimensions = os.getenv("EMBEDDING_DIM")
    config = {
        "model": os.getenv("EMBEDDING_MODEL"),
-        "dimension": int(dimension) if dimension is not None else None,
+        "dimensions": int(dimensions) if dimensions is not None else None,
    }
    Settings.embed_model = OpenAIEmbedding(**config)
Author	SHA1	Message	Date
leehuwuj	c3215ccc7b	better log	2024-05-02 15:23:06 +07:00
leehuwuj	18ca18123f	split code to run_ingestion_pipeline and persist_storage	2024-05-02 15:18:40 +07:00
leehuwuj	5ecb0c9fb7	update comments and remove stores_index	2024-05-02 14:15:56 +07:00
leehuwuj	7e45f604e6	Fix dimensions typo in settings.py	2024-05-02 10:45:58 +07:00
leehuwuj	bbacf0f199	refactor code and comments	2024-05-02 10:43:54 +07:00
leehuwuj	c0c6df80c7	fix redundant stashed code	2024-05-02 09:25:05 +07:00
leehuwuj	3b39a12ad6	Refactor code to persist the docstore and index in the SimpleVectorStore case	2024-05-02 08:50:09 +07:00