mirror of
https://github.com/run-llama/create-llama.git
synced 2026-07-02 19:14:28 -04:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | c3215ccc7b | | |
| | 18ca18123f | | |
| | 5ecb0c9fb7 | | |
| | 7e45f604e6 | | |
| | bbacf0f199 | | |
| | c0c6df80c7 | | |
| | 3b39a12ad6 | | |
@@ -6,8 +6,11 @@ from app.constants import STORAGE_DIR
|
||||
|
||||
def get_vector_store():
|
||||
if not os.path.exists(STORAGE_DIR):
|
||||
# Vector store hasn't been persisted before, create a new one
|
||||
vector_store = SimpleVectorStore()
|
||||
else:
|
||||
vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR)
|
||||
vector_store.stores_text = True
|
||||
# Vector store has already been persisted before at STORAGE_DIR - load it
|
||||
vector_store = SimpleVectorStore.from_persist_dir(
|
||||
STORAGE_DIR, namespace="default"
|
||||
)
|
||||
return vector_store
|
||||
|
||||
@@ -9,6 +9,8 @@ from llama_index.core.ingestion import IngestionPipeline
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.vector_stores import SimpleVectorStore
|
||||
from llama_index.core.storage.docstore import SimpleDocumentStore
|
||||
from llama_index.core.storage import StorageContext
|
||||
from llama_index.core import VectorStoreIndex
|
||||
from app.constants import STORAGE_DIR
|
||||
from app.settings import init_settings
|
||||
from app.engine.loaders import get_documents
|
||||
@@ -27,15 +29,7 @@ def get_doc_store():
|
||||
return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
|
||||
|
||||
|
||||
def generate_datasource():
|
||||
init_settings()
|
||||
logger.info("Creating new index")
|
||||
|
||||
# load the documents and create the index
|
||||
documents = get_documents()
|
||||
docstore = get_doc_store()
|
||||
vector_store = get_vector_store()
|
||||
|
||||
def run_ingestion_pipeline(docstore, vector_store, documents):
|
||||
# Create ingestion pipeline
|
||||
ingestion_pipeline = IngestionPipeline(
|
||||
transformations=[
|
||||
@@ -54,16 +48,48 @@ def generate_datasource():
|
||||
ingestion_pipeline.vector_store = vector_store
|
||||
|
||||
# Run the ingestion pipeline and store the results
|
||||
ingestion_pipeline.run(show_progress=True, documents=documents)
|
||||
nodes = ingestion_pipeline.run(show_progress=True, documents=documents)
|
||||
|
||||
# Default vector store only keeps data in memory, so we need to persist it
|
||||
# Can remove if using a different vector store
|
||||
return nodes
|
||||
|
||||
|
||||
def persist_storage(docstore, vector_store, nodes):
|
||||
storage_context = StorageContext.from_defaults(
|
||||
docstore=docstore,
|
||||
vector_store=vector_store,
|
||||
)
|
||||
# SimpleVectorStore does not include index by default
|
||||
# so we need to create the index manually
|
||||
# can be removed if using other vector store
|
||||
if isinstance(vector_store, SimpleVectorStore):
|
||||
vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
|
||||
# Persist the docstore to apply ingestion strategy
|
||||
docstore.persist(os.path.join(STORAGE_DIR, "docstore.json"))
|
||||
VectorStoreIndex(
|
||||
nodes=nodes,
|
||||
storage_context=storage_context,
|
||||
store_nodes_override=True, # Need enable this to store the nodes and index's id
|
||||
)
|
||||
storage_context.persist(STORAGE_DIR)
|
||||
|
||||
logger.info("Finished creating new index.")
|
||||
|
||||
def generate_datasource():
|
||||
init_settings()
|
||||
logger.info("Generate index for the provided data")
|
||||
|
||||
# Get the stores and documents or create new ones
|
||||
documents = get_documents()
|
||||
docstore = get_doc_store()
|
||||
vector_store = get_vector_store()
|
||||
|
||||
# Run the ingestion pipeline
|
||||
nodes = run_ingestion_pipeline(
|
||||
docstore=docstore,
|
||||
vector_store=vector_store,
|
||||
documents=documents,
|
||||
)
|
||||
|
||||
# Build the index and persist storage
|
||||
persist_storage(docstore, vector_store, nodes)
|
||||
|
||||
logger.info("Finished generating the index")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
import logging
|
||||
from llama_index.core import load_index_from_storage
|
||||
from llama_index.core.storage import StorageContext
|
||||
from llama_index.core.indices.vector_store import VectorStoreIndex
|
||||
from llama_index.core.vector_stores.simple import SimpleVectorStore
|
||||
from app.constants import STORAGE_DIR
|
||||
from app.engine.vectordb import get_vector_store
|
||||
|
||||
logger = logging.getLogger("uvicorn")
|
||||
@@ -8,6 +12,16 @@ logger = logging.getLogger("uvicorn")
|
||||
def get_index():
|
||||
logger.info("Loading the index...")
|
||||
store = get_vector_store()
|
||||
index = VectorStoreIndex.from_vector_store(store)
|
||||
# If the store is a SimpleVectorStore, we need to load the index from the storage
|
||||
if isinstance(store, SimpleVectorStore):
|
||||
index = load_index_from_storage(
|
||||
StorageContext.from_defaults(
|
||||
vector_store=store,
|
||||
persist_dir=STORAGE_DIR,
|
||||
)
|
||||
)
|
||||
else:
|
||||
index = VectorStoreIndex.from_vector_store(store)
|
||||
|
||||
logger.info("Loaded index successfully.")
|
||||
return index
|
||||
|
||||
@@ -36,9 +36,9 @@ def init_openai():
|
||||
}
|
||||
Settings.llm = OpenAI(**config)
|
||||
|
||||
dimension = os.getenv("EMBEDDING_DIM")
|
||||
dimensions = os.getenv("EMBEDDING_DIM")
|
||||
config = {
|
||||
"model": os.getenv("EMBEDDING_MODEL"),
|
||||
"dimension": int(dimension) if dimension is not None else None,
|
||||
"dimensions": int(dimensions) if dimensions is not None else None,
|
||||
}
|
||||
Settings.embed_model = OpenAIEmbedding(**config)
|
||||
|
||||
Reference in New Issue
Block a user