diff --git a/README.md b/README.md index 2176d96..1de6033 100644 --- a/README.md +++ b/README.md @@ -1,119 +1,34 @@ -# DoppelBot: Replace your CEO with an LLM +# Finetuning LLaMa + Text-to-SQL -

- Add to Slack -

+This walkthrough shows you how to fine-tune OpenLLaMa-7B on a Text-to-SQL dataset, and then use it for inference against +any database of structured data using LlamaIndex. -DoppelBot is a Slack app that scrapes a target user's messages in Slack and fine-tunes a large language model (OpenLLaMa) to learn how to respond like them. -

+This code is taken and adapted from the Modal `doppel-bot` repo: https://github.com/modal-labs/doppel-bot. - doppel-bot in action -

+### Stack -All the components, including fine-tuning, inference and scraping are serverless and run on [Modal](https://modal.com). +- LlamaIndex +- Modal +- Hugging Face datasets +- OpenLLaMa +- Peft -## How it works -[Read the blog post](https://modal.com/docs/guide/slack-finetune). +### Steps for running -## Usage +Please see the notebook (TODO) for full instructions. -- [Install](https://aksh-at--doppel.modal.run/slack/install) the app to your Slack workspace. -- In any channel, run `/doppel `. Here, `` is either the slack handle or real name of the user you want to target. _Note: for now, we limit each workspace to one target user, and this cannot be changed after installation._ -- Wait for the bot to finish training (typically an hour). You can run the command above again to check the status. +In the meantime you can run each step individually as below: -

- /doppel command -

+Loading data: +`modal run src.load_data_sql` -- Optional: rename the bot to `-bot` (or whatever you want). - - Go to the [Manage Apps](https://app.slack.com/apps-manage/) page and find `DoppelBot`. - - Click on `App Details`. - - Click on `Configuration`. - - Scroll down to the section named `Bot User`. Click on `Edit` to change the name. +Finetuning: +`modal run --detach src.finetune_sql` -

- /doppel command -

+Inference: +`modal run src.inference_sql_llamaindex::main --query "Which city has the highest population?" --sqlite-file-path "nbs/cities.db"` -- In any public Slack channel, including `@doppel` (or the name above if you changed it) in a message will summon the bot. - -## Development - -This repo contains everything you need to run DoppelBot for yourself. - -### Set up Modal - -- Create a [Modal](http://modal.com/) account. Note that we have a waitlist at the moment—[reach out](mailto:akshat@modal.com) if you would like to be taken off it sooner. -- Install `modal-client` in your current Python virtual environment (`pip install modal-client`). -- Set up a Modal token in your environment (`modal token new`). - -### Create a Slack app - -- Go to [https://api.slack.com/apps](https://api.slack.com/apps) and click - **Create New App**. -- Select **From scratch** if asked _how_ you want to create your app. -- Name your app and select your workspace. -- Go to **Features** > **OAuth & Permissions** on the left navigation pane. - Under the **Scopes** > **Bot Token Scopes** section, add the following scopes: - - `app_mentions:read` - - `channels:history` - - `channels:join` - - `channels:read` - - `chat:write` - - `chat:write.customize` - - `commands` - - `users.profile:read` - - `users:read` -- On the same page, under the **OAuth tokens for Your Workspace** section, - click **Install to Workspace** (or reinstall if it's already installed). -- Create a Modal secret - - On the [create secret page](https://modal.com/secrets/create), select **Slack** as the type. - - Back on the Slack app settings page, go to **Settings** > **Basic Information** on the left navigation pane. - Under **App Credentials**, copy the **Signing Secret** and paste its value with the key `SLACK_SIGNING_SECRET`. - - Go to **OAuth & Permissions** again and copy the **Bot User OAuth Token** and - paste its value with the key `SLACK_BOT_TOKEN`. - - Name this secret `slack-finetune-secret`. 
- -### (Optional) Set up Weights & Biases - -To track your fine-tuning runs on [Weights & Biases](https://wandb.ai), you'll need to create a Weights & Biases account, and then [create a Modal secret](https://modal.com/secrets/create) with the credentials (click on **Weights & Biases** in the secrets wizard and follow the steps). Then, set [`WANDB_PROJECT`](https://github.com/modal-labs/doppel-bot/blob/aae3f8675e9052251690997557aa8d4a9ae447e6/src/common.py#L10) in `src/common.py` to the name of the project you want to use. - -### Deploy your app - -From the root directory of this repo, run `modal deploy src.bot`. This will deploy the app to Modal, and print a URL to the terminal (something like `https://aksh-at--doppel.modal.run/`). - -Now, we need to point our Slack app to this URL: - -- Go to **Features** > **Event Subscriptions** on the left navigation pane: - - Turn it on. - - Paste the URL from above into the **Request URL** field, and wait for it to be verified. - - Under **Subscribe to bot events**, click on **Add bot user event** and add `@app_mention`. - - Click **Save Changes**. -- Go to **Features** > **Slash Commands** on the left navigation pane. Click **Create New Command**. Set the command to `/doppel` and the request URL to the same URL as above. -- Return to the **Basic Information** page, and click **Install to Workspace**. - -### (Optional) Multi-workspace app - -If you just want to run the app in your own workspace, the above is all you need. If you want to distribute the app to others, you'll need to set up a multi-workspace app. To enable this, set [`MULTI_WORKSPACE_APP`](https://github.com/modal-labs/doppel-bot/blob/aae3f8675e9052251690997557aa8d4a9ae447e6/src/common.py#L8) to `True` in `src/common.py`. - -Then, you'll need to set up [Neon](https://neon.tech/), a serverless Postgres database, for storing user data: - -- Create an account and a database on [Neon](https://neon.tech/). -- Create a Modal secret with DB credentials. 
- - On the [create secret page](https://modal.com/secrets/create), select **Postgres** as the type. - - Fill out the values based on the host URL, database name, username and password from Neon. [This page](https://neon.tech/docs/connect/connect-from-any-app) has an example for what it should look like. - - Name this secret `neon-secret`. -- Create tables by running `modal run src.db` from the root directory of this repo. -- On the Slack app settings page, go to **Settings** > **Manage Distribution**. The **Redirect URLs** should be be `https:///slack/oauth_redirect`, where `` is the URL you received after deploying the app above. Once everything looks good, click **Activate Public Distribution**. - -Now, deploying the app with `modal deploy src.bot` will take care of setting up all the [intricacies of OAuth](https://api.slack.com/authentication/oauth-v2) for you, and create a multi-workspace Slack app that can be installed by anyone. By default, the install link is at `https:///slack/install`. - -### (Optional) Running each step manually - -If you wish, you can also run each step manually. This is useful for debugging or iterating on a specific function. - -- Scraper: `modal run src.scrape::scrape --user=""` -- Fine-tuning: `modal run --detach src.finetune --user=" Path: - return VOL_MOUNT_PATH / data_dir / user / "data_sql.jsonl" +def get_data_path(data_dir: str = "data_sql") -> Path: + return VOL_MOUNT_PATH / data_dir / "data_sql.jsonl" -def user_model_path(user: str, data_dir: str = "data_sql", checkpoint: Optional[str] = None) -> Path: - path = VOL_MOUNT_PATH / data_dir / user +def get_model_path(data_dir: str = "data_sql", checkpoint: Optional[str] = None) -> Path: + path = VOL_MOUNT_PATH / data_dir if checkpoint: path = path / checkpoint return path - -def get_user_for_team_id(team_id: Optional[str], users: list[str]) -> Optional[str]: - # Dumb: for now, we only allow one user per team. 
- path = VOL_MOUNT_PATH / (team_id or "data") - filtered = [] - for p in path.iterdir(): - # Check if finished fine-tuning. - if (path / p / "adapter_config.json").exists() and p.name in users: - filtered.append(p.name) - if not filtered: - return None - user = random.choice(filtered) - print(f"Randomly picked {user} out of {filtered}.") - return user diff --git a/src/download_weights.py b/src/download_weights.py index d5efa5b..1e99498 100644 --- a/src/download_weights.py +++ b/src/download_weights.py @@ -1,9 +1,7 @@ """Download weights.""" -from modal import Image, Stub - from .common import ( - stub, output_vol, VOL_MOUNT_PATH, user_data_path, user_model_path + stub, output_vol, VOL_MOUNT_PATH, get_model_path ) import os import json @@ -13,9 +11,9 @@ from pathlib import Path network_file_systems={VOL_MOUNT_PATH.as_posix(): output_vol}, cloud="gcp" ) -def load_model(user: str): +def load_model(): """Load model.""" - path = user_model_path(user) + path = get_model_path() config_path = path / "adapter_config.json" model_path = path / "adapter_model.bin" @@ -30,9 +28,9 @@ def load_model(user: str): stub.model_dict["model"] = model_data @stub.local_entrypoint() -def main(user: str, output_dir: str): +def main(output_dir: str): # copy adapter_config.json and adapter_model.bin files into dict - load_model.call(user=user) + load_model.call() model_data = stub.model_dict["model"] config_data = stub.model_dict["config"] diff --git a/src/finetune_sql.py b/src/finetune_sql.py index 219284b..901ce19 100644 --- a/src/finetune_sql.py +++ b/src/finetune_sql.py @@ -9,8 +9,8 @@ from .common import ( WANDB_PROJECT, output_vol, stub, - user_data_path, - user_model_path, + get_data_path, + get_model_path, generate_prompt_sql, ) @@ -121,28 +121,13 @@ def _train( def generate_and_tokenize_prompt(data_point): full_prompt = generate_prompt_sql( - data_point["user"], data_point["input"], data_point["context"], data_point["output"], ) tokenized_full_prompt = tokenize(full_prompt) - # 
tokens = tokenizer.convert_ids_to_tokens( - # tokenized_full_prompt["input_ids"] - # ) - # print(data_point, tokens) if not train_on_inputs: raise NotImplementedError("not implemented yet") - # user_prompt = generate_prompt(data_point["instruction"], data_point["input"]) - # tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos_token) - # user_prompt_len = len(tokenized_user_prompt["input_ids"]) - - # if add_eos_token: - # user_prompt_len -= 1 - - # tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][ - # user_prompt_len: - # ] # could be sped up, probably return tokenized_full_prompt model = prepare_model_for_int8_training(model) @@ -243,10 +228,10 @@ def _train( cloud="oci", allow_cross_region_volumes=True, ) -def finetune(user: str, team_id: str = ""): +def finetune(team_id: str = ""): from datasets import load_dataset - data_path = user_data_path(user).as_posix() + data_path = get_data_path().as_posix() data = load_dataset("json", data_files=data_path) num_samples = len(data["train"]) @@ -257,9 +242,9 @@ def finetune(user: str, team_id: str = ""): MODEL_PATH, data, val_set_size=val_set_size, - output_dir=user_model_path(user).as_posix(), + output_dir=get_model_path().as_posix(), wandb_project=WANDB_PROJECT, - wandb_run_name=f"openllama-{team_id}-{user}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", + wandb_run_name=f"openllama-{team_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", ) # Delete scraped data after fine-tuning diff --git a/src/inference_sql_llamaindex.py b/src/inference_sql_llamaindex.py index 4b0493a..0510c58 100644 --- a/src/inference_sql_llamaindex.py +++ b/src/inference_sql_llamaindex.py @@ -9,7 +9,7 @@ from .common import ( output_vol, stub, VOL_MOUNT_PATH, - user_model_path, + get_model_path, generate_prompt_sql ) @@ -36,7 +36,6 @@ class OpenLlamaLLM(CustomLLM, ClsMixin): def __init__( self, - user: str, max_new_tokens: int = 128, callback_manager: Optional[CallbackManager] = None, 
use_finetuned_model: bool = True, @@ -49,8 +48,7 @@ class OpenLlamaLLM(CustomLLM, ClsMixin): from peft import PeftModel from transformers import LlamaForCausalLM, LlamaTokenizer - self.user = user - CHECKPOINT = user_model_path(self.user) + CHECKPOINT = get_model_path() load_8bit = False device = "cuda" @@ -135,7 +133,7 @@ class OpenLlamaLLM(CustomLLM, ClsMixin): network_file_systems={VOL_MOUNT_PATH.as_posix(): output_vol}, cloud="gcp", ) -def run_query(user: str, query: str, use_finetuned_model: bool = True): +def run_query(query: str, use_finetuned_model: bool = True): """Run query.""" import pandas as pd from sqlalchemy import create_engine @@ -150,17 +148,9 @@ def run_query(user: str, query: str, use_finetuned_model: bool = True): # finetuned llama LLM num_output = 256 llm = OpenLlamaLLM( - user=user, max_new_tokens=num_output, use_finetuned_model=use_finetuned_model + max_new_tokens=num_output, use_finetuned_model=use_finetuned_model ) - # # default Llama LLM (load from our huggingface loader) - # default_llm = HuggingFaceLLM( - # context_window=2048, - # max_new_tokens=num_output, - # tokenizer_name="openlm-research/open_llama_7b", - # model_name="openlm-research/open_llama_7b" - # ) service_context = ServiceContext.from_defaults(llm=llm) - # default_service_context = ServiceContext.from_defaults(llm=default_llm) sql_path = VOL_MOUNT_PATH / "test_data.db" engine = create_engine(f'sqlite:///{sql_path}', echo=True) @@ -168,7 +158,7 @@ def run_query(user: str, query: str, use_finetuned_model: bool = True): # define custom text-to-SQL prompt with generate prompt prompt_prefix = "Dialect: {dialect}\n\n" - prompt_suffix = generate_prompt_sql(user, "{query_str}", "{schema}", output="") + prompt_suffix = generate_prompt_sql("{query_str}", "{schema}", output="") sql_prompt = Prompt(prompt_prefix + prompt_suffix) query_engine = NLSQLTableQueryEngine( @@ -179,21 +169,12 @@ def run_query(user: str, query: str, use_finetuned_model: bool = True): ) response = 
query_engine.query(query) - # # give baseline response too - # default_query_engine = NLSQLTableQueryEngine( - # sql_database, - # text_to_sql_prompt=sql_prompt, - # service_context=default_service_context, - # synthesize_response=False - # ) - # default_response = default_query_engine.query(query) - print(f'Model output: {str(response.metadata["sql_query"])}') return response @stub.local_entrypoint() -def main(user: str, query: str, sqlite_file_path: str, use_finetuned_model: Optional[bool] = None): +def main(query: str, sqlite_file_path: str, use_finetuned_model: Optional[bool] = None): """Main function.""" fp = open(sqlite_file_path, "rb") @@ -201,7 +182,7 @@ def main(user: str, query: str, sqlite_file_path: str, use_finetuned_model: Opti if use_finetuned_model is None: # try both - run_query.call(user, query, use_finetuned_model=True) - run_query.call(user, query, use_finetuned_model=False) + run_query.call(query, use_finetuned_model=True) + run_query.call(query, use_finetuned_model=False) else: - run_query.call(user, query, use_finetuned_model=use_finetuned_model) + run_query.call(query, use_finetuned_model=use_finetuned_model) diff --git a/src/list_files.py b/src/list_files.py deleted file mode 100644 index a351b83..0000000 --- a/src/list_files.py +++ /dev/null @@ -1,35 +0,0 @@ -"""List files.""" - -from modal import Image, Stub - -from .common import ( - user_data_path, - stub, output_vol, VOL_MOUNT_PATH -) -import os - - -@stub.function( - network_file_systems={VOL_MOUNT_PATH.as_posix(): output_vol}, - cloud="gcp" -) -def main(cmd: str): - import json - - # read data json - # tmp = json.load(open(VOL_MOUNT_PATH / "data" / "Jerry Liu" / "data.json")) - # print(tmp) - # print(len(tmp)) - # raise Exception - - # # read sql json - # fp = open(user_data_path("Jerry Liu"), 'r') - # data = [json.loads(line) for line in fp] - # print(data[0:2]) - # print(len(data)) - # raise Exception - - # print(os.listdir(VOL_MOUNT_PATH / "data_sql" / "Jerry Liu")) - - import 
subprocess - print(subprocess.run(cmd, shell=True)) \ No newline at end of file diff --git a/src/load_data_sql.py b/src/load_data_sql.py index e4a1488..ba8d450 100644 --- a/src/load_data_sql.py +++ b/src/load_data_sql.py @@ -5,7 +5,7 @@ from .common import ( stub, VOL_MOUNT_PATH, output_vol, - user_data_path + get_data_path ) @stub.function( @@ -18,13 +18,13 @@ from .common import ( network_file_systems={VOL_MOUNT_PATH.as_posix(): output_vol}, cloud="gcp", ) -def load_data_sql(user: str): +def load_data_sql(): from datasets import load_dataset dataset = load_dataset("b-mc2/sql-create-context") dataset_splits = {"train": dataset["train"]} - out_path = user_data_path(user) + out_path = get_data_path() out_path.parent.mkdir(parents=True, exist_ok=True) @@ -32,7 +32,6 @@ def load_data_sql(user: str): with open(out_path, "w") as f: for item in ds: newitem = { - "user": str(user), "input": item["question"], "context": item["context"], "output": item["answer"],