## This script loads data from a mongo database into a vector index.
## It converts all the documents in the collection into vectors,
## which requires a call to OpenAI for each one, so it can take some time.
## Once the data is indexed, it is stored as a new collection in mongodb
## and you can query it without having to re-index every time.
##
## Required environment variables (typically supplied through a .env file):
##   MONGODB_URI, MONGODB_DATABASE, MONGODB_COLLECTION,
##   MONGODB_VECTORS, MONGODB_VECTOR_INDEX

from dotenv import load_dotenv

# Load the .env file before the remaining imports run (same order as the
# original script), so every os.getenv() lookup below can see its values.
load_dotenv()

# This will turn on really noisy logging if you want it, but it will slow things down
# import logging
# import sys
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import os

from llama_index.readers.mongo import SimpleMongoReader
from pymongo.mongo_client import MongoClient
from llama_index.vector_stores.azurecosmosmongo import AzureCosmosDBMongoDBVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext


def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Exits the script with a message naming the variable when it is unset or
    empty, instead of letting ``None`` flow into the MongoDB / LlamaIndex
    calls below.
    """
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"Missing required environment variable: {name}")
    return value


# Validate all configuration up front, so a typo in .env is reported before
# any documents are loaded or any embeddings are requested.
mongodb_uri = _require_env("MONGODB_URI")
database_name = _require_env("MONGODB_DATABASE")
source_collection = _require_env("MONGODB_COLLECTION")
vector_collection = _require_env("MONGODB_VECTORS")
vector_index_name = _require_env("MONGODB_VECTOR_INDEX")

# load objects from mongo and convert them into LlamaIndex Document objects
# llamaindex has a special class that does this for you
# it pulls every object in a given collection that matches query_dict
query_dict = {}  # an empty query dict applies no filter
reader = SimpleMongoReader(uri=mongodb_uri)
documents = reader.load_data(
    database_name,
    # this is the collection where the objects you loaded in 1_import got stored
    source_collection,
    # this is a list of the top-level fields in your objects that will be indexed;
    # make sure your objects have a field called "full_text" or that you change this value
    field_names=["full_text"],
    # this is a mongo query dict that will filter your data if you don't want to index everything
    query_dict=query_dict,
)

# Create a new client and connect to the server
client = MongoClient(mongodb_uri)

# use Azure Cosmos DB for MongoDB as the vector store
store = AzureCosmosDBMongoDBVectorSearch(
    client,
    db_name=database_name,
    collection_name=vector_collection,  # this is where your embeddings will be stored
    index_name=vector_index_name,  # this is the name of the vector search index
)

# now create an index from all the Documents and store them in the vector store
storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True,  # this will show you a progress bar as the embeddings are created
)

# Original author's note: you can't query your index yet, because a vector
# search index (named by MONGODB_VECTOR_INDEX) still has to be created on the
# mongodb side.
# NOTE(review): that note and the old comments referred to Atlas, while the
# store used here is the Azure Cosmos DB one -- confirm whether the manual
# index-creation step still applies.