First commit

2026-07-01 20:44:05 -04:00 · 2023-10-29 16:32:26 -07:00
commit 525a5b85ea
8 changed files with 57620 additions and 0 deletions
@@ -0,0 +1,4 @@
+flask_app/.venv
+.env
+tweets.json
+fewertweets.json
@@ -0,0 +1,27 @@
+## This script imports the tinytweets.json file into your MongoDB database.
+## It will work for any JSON file containing a single array of objects.
+## There's nothing specific to LlamaIndex going on here;
+## you can get your data into MongoDB any way you like.
+
+json_file = 'tinytweets.json'
+
+# Load environment variables from local .env file
+from dotenv import load_dotenv
+load_dotenv()
+
+import os
+import json
+from pymongo.mongo_client import MongoClient
+from pymongo.server_api import ServerApi
+
+# Load the tweets from a local file; read as UTF-8 explicitly rather than the locale default
+with open(json_file, 'r', encoding='utf-8') as f:
+    tweets = json.load(f)
+
+# Create a new client and connect to the server (URI, database and collection come from the environment)
+client = MongoClient(os.getenv('MONGODB_URI'), server_api=ServerApi('1'))
+db = client[os.getenv("MONGODB_DATABASE")]
+collection = db[os.getenv("MONGODB_COLLECTION")]
+
+# Insert the tweets into MongoDB in a single batch
+collection.insert_many(tweets)
@@ -0,0 +1,53 @@
+## This script loads data from a MongoDB database into an index.
+## This will convert all the documents in the database into vectors,
+## which requires a call to OpenAI for each one, so it can take some time.
+## Once the data is indexed, it will be stored as a new collection in MongoDB
+## and you can query it without having to re-index every time.
+from dotenv import load_dotenv
+load_dotenv()
+
+# This will turn on really noisy logging so you can be sure something is happening
+import logging
+import sys
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+import os
+from llama_index.readers.mongo import SimpleMongoReader
+from pymongo.mongo_client import MongoClient
+from pymongo.server_api import ServerApi
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+from llama_index.indices.vector_store.base import VectorStoreIndex
+from llama_index.storage.storage_context import StorageContext
+
+# Load objects from MongoDB and convert them into LlamaIndex Document objects.
+# LlamaIndex has a special class that does this for you;
+# with an empty query dict it pulls every object in the given collection.
+query_dict = {}
+reader = SimpleMongoReader(uri=os.getenv("MONGODB_URI"))
+documents = reader.load_data(
+    os.getenv("MONGODB_DATABASE"),
+    os.getenv("MONGODB_COLLECTION"), # this is the collection where the objects you loaded in 1_import got stored
+    field_names=["full_text"], # this is a list of the top-level fields in your objects that will be indexed
+                               # make sure your objects have a field called "full_text" or that you change this value
+    query_dict=query_dict # this is a mongo query dict that will filter your data if you don't want to index everything
+)
+
+# Create a new client and connect to the server
+client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))
+
+# Create Atlas as a vector store
+store = MongoDBAtlasVectorSearch(
+    client,
+    db_name=os.getenv('MONGODB_DATABASE'),
+    collection_name=os.getenv('MONGODB_VECTORS'), # this is where your embeddings will be stored
+    index_name=os.getenv('MONGODB_VECTOR_INDEX') # this is the name of the index you will need to create
+)
+
+# Now create an index from all the Documents and store them in Atlas
+storage_context = StorageContext.from_defaults(vector_store=store)
+index = VectorStoreIndex.from_documents(
+    documents, storage_context=storage_context
+)
+
+# You can't query your index yet: you first need to create a vector search index in MongoDB's UI
@@ -0,0 +1,36 @@
+## This shows how to load your pre-indexed data from MongoDB and query it.
+## Note that you MUST manually create a vector search index before this will work,
+## and you must pass in the name of that index when connecting to MongoDB below.
+from dotenv import load_dotenv
+load_dotenv()
+
+# Turns on really noisy logging
+import logging
+import sys
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+import os
+from pymongo.mongo_client import MongoClient
+from pymongo.server_api import ServerApi
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+from llama_index.indices.vector_store.base import VectorStoreIndex
+
+# Create a new client and connect to the server
+client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))
+
+# Connect to Atlas as a vector store
+store = MongoDBAtlasVectorSearch(
+    client,
+    db_name=os.getenv('MONGODB_DATABASE'), # this is the database where you stored your embeddings
+    collection_name=os.getenv('MONGODB_VECTORS'), # this is where your embeddings were stored in 2_load_and_index.py
+    index_name=os.getenv('MONGODB_VECTOR_INDEX') # this is the name of the index you created after loading your data
+)
+index = VectorStoreIndex.from_vector_store(store)
+
+# Query your data!
+# Here we have customized the number of documents returned per query to 20, because tweets are really short
+query_engine = index.as_query_engine(similarity_top_k=20)
+response = query_engine.query("What does the author think of web frameworks?")
+print(response)
+
@@ -0,0 +1,6 @@
+from flask import Flask
+app = Flask(__name__)  # the Flask application object the route below is registered on
+
+@app.route('/')  # register hello_world as the handler for the root URL
+def hello_world():
+    return 'Hello, World!'  # string sent back for requests to '/'
@@ -0,0 +1,12 @@
+blinker==1.6.3
+click==8.1.7
+Flask==3.0.0
+gunicorn==21.2.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+jsonschema==3.2.0
+MarkupSafe==2.1.3
+packaging==23.2
+python-dateutil==2.8.1
+pyzmq==25.1.1
+Werkzeug==3.0.1