First commit

This commit is contained in:
Laurie Voss
2023-10-29 16:32:26 -07:00
commit 525a5b85ea
8 changed files with 57620 additions and 0 deletions
+4
View File
@@ -0,0 +1,4 @@
flask_app/.venv
.env
tweets.json
fewertweets.json
+27
View File
@@ -0,0 +1,27 @@
## This script imports the tinytweets.json file into your mongo database
## It will work for any json file containing a single array of objects
## There's nothing specific to llamaindex going on here
## You can get your data into mongo any way you like.
json_file = 'tinytweets.json'

# Load environment variables from local .env file
from dotenv import load_dotenv
load_dotenv()

import os
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi


def _require_env(name):
    """Return the value of environment variable `name`.

    Exits with a readable message when the variable is unset or empty,
    instead of letting a None value reach pymongo and fail there with an
    unrelated-looking TypeError.
    """
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"Missing required environment variable: {name}")
    return value


# Check the configuration before doing any work
db_name = _require_env("MONGODB_DATABASE")
collection_name = _require_env("MONGODB_COLLECTION")

# Load the tweets from a local file
# The encoding is explicit so the result does not depend on the platform's
# locale default (tweet text commonly contains emoji and other non-ASCII).
with open(json_file, 'r', encoding='utf-8') as f:
    tweets = json.load(f)

# insert_many needs a non-empty list of documents, so fail early and clearly
if not isinstance(tweets, list) or not tweets:
    raise SystemExit(f"{json_file} must contain a non-empty JSON array of objects")

# Create a new client and connect to the server
client = MongoClient(os.getenv('MONGODB_URI'), server_api=ServerApi('1'))
try:
    db = client[db_name]
    collection = db[collection_name]
    # Insert the tweets into mongo
    collection.insert_many(tweets)
finally:
    # Release the connection pool even if the insert fails
    client.close()
+53
View File
@@ -0,0 +1,53 @@
## This script loads data from a mongo database into an index
## This will convert all the documents in the database into vectors
## which requires a call to OpenAI for each one, so it can take some time.
## Once the data is indexed, it will be stored as a new collection in mongodb
## and you can query it without having to re-index every time.
from dotenv import load_dotenv
load_dotenv()

# This will turn on really noisy logging so you can be sure something is happening
import logging
import sys
# basicConfig already attaches a stdout handler to the root logger; adding a
# second StreamHandler on top of it would print every log record twice.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

import os
from llama_index.readers.mongo import SimpleMongoReader
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext


def _require_env(name):
    """Return the value of environment variable `name`.

    Exits with a readable message when the variable is unset or empty,
    instead of passing None on to the mongo reader or the vector store.
    """
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"Missing required environment variable: {name}")
    return value


# Check the configuration up front, before any documents are read or embedded
db_name = _require_env("MONGODB_DATABASE")
source_collection = _require_env("MONGODB_COLLECTION")
vector_collection = _require_env("MONGODB_VECTORS")
vector_index_name = _require_env("MONGODB_VECTOR_INDEX")
mongodb_uri = os.getenv("MONGODB_URI")

# load objects from mongo and convert them into LlamaIndex Document objects
# llamaindex has a special class that does this for you
# it pulls every object in a given collection
query_dict = {}
reader = SimpleMongoReader(uri=mongodb_uri)
documents = reader.load_data(
    db_name,
    source_collection,  # this is the collection where the objects you loaded in 1_import got stored
    field_names=["full_text"],  # this is a list of the top-level fields in your objects that will be indexed
                                # make sure your objects have a field called "full_text" or that you change this value
    query_dict=query_dict  # this is a mongo query dict that will filter your data if you don't want to index everything
)

# Create a new client and connect to the server
client = MongoClient(mongodb_uri, server_api=ServerApi('1'))

# create Atlas as a vector store
store = MongoDBAtlasVectorSearch(
    client,
    db_name=db_name,
    collection_name=vector_collection,  # this is where your embeddings will be stored
    index_name=vector_index_name  # this is the name of the index you will need to create
)

# now create an index from all the Documents and store them in Atlas
storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
# you can't query your index yet because you need to create a vector search index in mongodb's UI now
+36
View File
@@ -0,0 +1,36 @@
## This shows how to load your pre-indexed data from mongo and query it
## Note that you MUST manually create a vector search index before this will work
## and you must pass in the name of that index when connecting to Mongodb below
from dotenv import load_dotenv
load_dotenv()

# Turns on really noisy logging
import logging
import sys
# basicConfig already attaches a stdout handler to the root logger; adding a
# second StreamHandler on top of it would print every log record twice.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex


def _require_env(name):
    """Return the value of environment variable `name`.

    Exits with a readable message when the variable is unset or empty,
    instead of passing None on to the vector store.
    """
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"Missing required environment variable: {name}")
    return value


# Check the configuration before connecting
db_name = _require_env("MONGODB_DATABASE")
vector_collection = _require_env("MONGODB_VECTORS")
vector_index_name = _require_env("MONGODB_VECTOR_INDEX")

# Create a new client and connect to the server
client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))

# connect to Atlas as a vector store
store = MongoDBAtlasVectorSearch(
    client,
    db_name=db_name,  # this is the database where you stored your embeddings
    collection_name=vector_collection,  # this is where your embeddings were stored in 2_load_and_index.py
    index_name=vector_index_name  # this is the name of the index you created after loading your data
)
index = VectorStoreIndex.from_vector_store(store)

# query your data!
# here we have customized the number of documents returned per query to 20, because tweets are really short
query_engine = index.as_query_engine(similarity_top_k=20)
response = query_engine.query("What does the author think of web frameworks?")
print(response)
Binary file not shown.
+6
View File
@@ -0,0 +1,6 @@
"""Minimal Flask application exposing a single route at the site root."""
from flask import Flask

app = Flask(__name__)


@app.route("/")
def hello_world() -> str:
    """Return the greeting served for requests to '/'."""
    return "Hello, World!"
+12
View File
@@ -0,0 +1,12 @@
blinker==1.6.3
click==8.1.7
Flask==3.0.0
gunicorn==21.2.0
itsdangerous==2.1.2
Jinja2==3.1.2
jsonschema==3.2.0
MarkupSafe==2.1.3
packaging==23.2
python-dateutil==2.8.1
pyzmq==25.1.1
Werkzeug==3.0.1
+57482
View File
File diff suppressed because it is too large Load Diff