mirror of
https://github.com/run-llama/ai-engineer-workshop.git
synced 2026-07-01 20:54:01 -04:00
115 lines
3.4 KiB
Python
115 lines
3.4 KiB
Python
import json
|
|
import random
|
|
from llama_index import VectorStoreIndex, ServiceContext
|
|
from llama_index.llms import Anyscale, OpenAI
|
|
|
|
from data import get_embedding_model, get_postgres_store, EMBEDDING_DIMENSIONS
|
|
|
|
|
|
def _get_vector_store_index(
    service_context,
    embedding_model_name,
):
    """
    Build a VectorStoreIndex on top of the Postgres vector store.

    Args:
        service_context: Passed through unchanged as the ``service_context``
            keyword of ``VectorStoreIndex.from_vector_store``.
        embedding_model_name: Key used to look up the embedding dimension
            in ``EMBEDDING_DIMENSIONS``.

    Returns:
        The index returned by ``VectorStoreIndex.from_vector_store``.
    """
    # The store is created for the dimension registered under the model name.
    # NOTE(review): presumably EMBEDDING_DIMENSIONS is a dict, so an unknown
    # name would raise KeyError here - confirm against the data module.
    dimension = EMBEDDING_DIMENSIONS[embedding_model_name]
    store = get_postgres_store(dimension)

    return VectorStoreIndex.from_vector_store(
        store,
        service_context=service_context
    )
|
|
|
|
|
|
def get_query_engine(
    llm_model_name: str = "meta-llama/Llama-2-70b-chat-hf",
    temperature: float = 0.1,
    embedding_model_name = "text-embedding-ada-002",
    similarity_top_k=2
):
    """
    Create a query engine over the vector store index.

    Args:
        llm_model_name: Name of the LLM. When it contains the substring
            "llama" (case-sensitive) the ``Anyscale`` client is used,
            otherwise the ``OpenAI`` client.
        temperature: Forwarded to the chosen LLM constructor.
        embedding_model_name: Forwarded to ``get_embedding_model`` and used
            to build the vector store index.
        similarity_top_k: Forwarded to ``index.as_query_engine``.

    Returns:
        The object returned by ``index.as_query_engine``.
    """
    embedder = get_embedding_model(embedding_model_name)

    # Both clients take the same keyword arguments, so pick the class first
    # and construct it once.
    llm_class = Anyscale if "llama" in llm_model_name else OpenAI
    language_model = llm_class(model=llm_model_name, temperature=temperature)

    context = ServiceContext.from_defaults(
        embed_model=embedder, llm=language_model
    )

    vector_index = _get_vector_store_index(context, embedding_model_name)
    return vector_index.as_query_engine(similarity_top_k=similarity_top_k)
|
|
|
|
|
|
def get_retriever(
    embedding_model_name = "text-embedding-ada-002",
    similarity_top_k=2
):
    """
    Create a retrieval-oriented engine over the vector store index.

    The service context is built with ``llm=None``, i.e. no LLM is
    configured by this function.

    Args:
        embedding_model_name: Forwarded to ``get_embedding_model`` and used
            to build the vector store index.
        similarity_top_k: Forwarded to ``index.as_query_engine``.

    Returns:
        The object returned by ``index.as_query_engine``.
    """
    embedder = get_embedding_model(embedding_model_name)
    context = ServiceContext.from_defaults(embed_model=embedder, llm=None)

    vector_index = _get_vector_store_index(context, embedding_model_name)

    # NOTE(review): despite the function name, this returns the result of
    # as_query_engine(), not as_retriever(). Confirm this is intended before
    # changing it - callers may rely on the query-engine interface.
    return vector_index.as_query_engine(similarity_top_k=similarity_top_k)
|
|
|
|
|
|
def train_test_split(data, split_ratio=0.8):
    """
    Split a list of items into randomly shuffled training and testing sets.

    The input is never modified: the shuffle is performed on a private copy,
    so the caller's list keeps its original order.

    Args:
        data (list): The items to be split. Any iterable is accepted; it is
            copied into a new list before shuffling.
        split_ratio (float): The ratio of items to include in the training set (default is 0.8).

    Returns:
        tuple: A tuple containing two lists - the training set and the testing set.

    Raises:
        ValueError: If split_ratio is not between 0 and 1 (inclusive).
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError("Split ratio must be between 0 and 1")

    # Shuffle a copy rather than the argument itself; random.shuffle works
    # in place and would otherwise reorder the caller's data as a side effect.
    shuffled = list(data)
    random.shuffle(shuffled)

    # int() truncates, so the training set gets floor(len * split_ratio) items
    # and the testing set receives the remainder.
    split_index = int(len(shuffled) * split_ratio)

    train_set = shuffled[:split_index]
    test_set = shuffled[split_index:]

    return train_set, test_set
|
|
|
|
|
|
def subsample(data, ratio):
    """
    Draw a random subsample containing a given fraction of the items.

    Args:
        data (list): The list of items to sample from.
        ratio (float): The fraction of items to keep, between 0 and 1.

    Returns:
        list: A new list with int(len(data) * ratio) items chosen by
        random.sample.

    Raises:
        ValueError: If ratio is not between 0 and 1 (inclusive).
    """
    ratio_in_range = 0 <= ratio <= 1
    if not ratio_in_range:
        raise ValueError("Ratio must be between 0 and 1")

    # Truncate toward zero so the sample never exceeds the requested share.
    keep_count = int(len(data) * ratio)

    return random.sample(data, keep_count)
|
|
|
|
|
|
def write_jsonl(filename, data):
    """
    Serialize an iterable of objects to a JSON Lines (JSONL) file.

    The file is opened in write mode with UTF-8 encoding, so existing
    content is replaced. Each item is written as one JSON document followed
    by a newline; non-ASCII characters are written as-is, not escaped.

    Args:
        filename (str): The path of the JSONL file to write to.
        data (list): The items (typically dictionaries) to write, one per line.
    """
    with open(filename, 'w', encoding='utf-8') as jsonl_out:
        for record in data:
            json.dump(record, jsonl_out, ensure_ascii=False)
            jsonl_out.write('\n')