diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d383d8 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# "Training" Python code with an LLM + +## The problem + +I thought it would be neat if an LLM could go through all my emails and tell me all the places I've traveled to in the world by extracting the destinations from the flight itineraries. + +## Set up: connecting to Gmail + +[LlamaHub](https://llamahub.ai) has a [Gmail tool](https://llamahub.ai/l/tools/llama-index-tools-google) for use by agents. This is where I went first. You have to dance the authentication dance with Google first, however. Here's what I did: + +* Created a new project in the [Google Cloud Console](https://console.cloud.google.com/) +* Went to APIs & Services -> Library and searched for Gmail, then enabled that API +* Went to APIs & Services -> Credentials and created a new OAuth client ID + * Application type: Web application + * Authorized redirect URIs: http://localhost:8080/ (the trailing slash seems important) +* Went to APIs & Services -> OAuth consent screen and made the app external, which allowed me to connect my personal Gmail to it once I explicitly added it as an allowed test user +* Downloaded the credentials JSON file and saved it as `credentials.json` in the root of my project + +Unfortunately, the Gmail tool doesn't have a way of paginating through lots of results, so I copied and modified it; you'll find my version in [gmail.py](gmail.py). + +## First pass: get the LLM to categorize each email + +In [summarize.py](summarize.py) you can see my first attempt, where I run through every message in a search matching "your flight itinerary" and try to get the LLM to categorize it, spitting out JSON every time. This works! But it's very slow, and it also uses up hella tokens -- it could get expensive! 
+ +## Second attempt: get the LLM to generate an email categorization script + +In [generate.py](generate.py) you can see my second solution: instead of running the LLM on every email, I get it to run through a subset of them. For each email, I give it the body of the email as well as a Python function whose purpose is to detect if an email body is an itinerary (this starts off just being an empty string). + +If the LLM thinks the email is an itinerary, it is instructed to modify the Python function so that the email would be detected. It's also instructed to make sure the previous emails would still be detected. So it iterates, making a progressively more complicated Python function every time, one that can detect more and more itineraries. + +In [sample_generated_code.py](sample_generated_code.py) you can see the output of this process after running through about 100 emails, not all of which were actually itineraries (lots of spam from airlines matches the search). You can see it's slowly iterating towards having a detection block for each individual airline, which is what I imagine I would have come up with as a human anyway, but with a lot more futzing around. + +## Further work + +Some next steps that have occurred to me: + +* Improve the search string to exclude more spam so it gets trained on more actual itineraries (it's reading a lot of spam right now) +* Use a local model to save me money (I've been looking at `codestral`, the latest from Mistral. Meta's `llama3` wasn't able to do it.) +* Explicitly include instructions in the prompt to combine detection blocks when possible. This seems complicated! Not sure if it will be able to do that. 
diff --git a/generate.py b/generate.py index b0f5858..5c78fb8 100644 --- a/generate.py +++ b/generate.py @@ -6,6 +6,7 @@ dotenv.load_dotenv() from llama_index.core import Settings from llama_index.llms.openai import OpenAI from llama_index.llms.gemini import Gemini +from llama_index.llms.ollama import Ollama import tiktoken import json @@ -14,16 +15,18 @@ searcher = GmailSearcher() # if using openAI, this specifies which model to use # and it will use the same model for counting tokens. -# If using Gemini, it will count as if for an openAI model because I'm lazy +# If using non-OpenAI model, it will count as if for an openAI model because I'm lazy #MODEL="gemini-1.5-pro-latest" #MODEL = "gpt-4o" MODEL = "gpt-3.5-turbo" #Settings.llm = OpenAI(model=MODEL) -Settings.llm = Gemini( - model="models/gemini-1.5-pro-latest", - temperature=0.1 -) +# Settings.llm = Gemini( +# model="models/gemini-1.5-pro-latest", +# temperature=0.1 +# ) + +Settings.llm = Ollama(model="llama3", request_timeout=30.0) # some emails have attachments and are enormous and hard to parse # so we slice everything down to 128k tokens or less. diff --git a/sample_generated_code.py b/sample_generated_code.py index c617418..e9c5263 100644 --- a/sample_generated_code.py +++ b/sample_generated_code.py @@ -1,90 +1,173 @@ import re from typing import Dict, List, Any -def extract_itinerary(email_body: str) -> List[Dict[str, Any]]: - """Extracts flight itineraries from email body. + +def extract_itinerary_details(email_body: str) -> List[Dict[str, Any]]: + """Extracts itinerary details from an email body. + + Args: + email_body: The email body text. + Returns: - A list of dictionaries, each containing the origin and destination of a flight. 
+ A list of dictionaries, where each dictionary represents an itinerary + and contains the following keys: + - isItinerary: True if the email is an itinerary, False otherwise + - origin: The origin of the flight + - destination: The destination of the flight """ itineraries = [] - # Example: "Confirmation code: WQISDB" - confirmation_code_match = re.search(r'Confirmation code:\s*(\w+)', email_body) - if confirmation_code_match: - # Example: "3:59 PM\n BZE\n Belize City, Belize" - origin_match = re.search(r'(\d{1,2}:\d{2} [AP]M)\s+(\w{3})\s+([\w\s,]+)', email_body) - # Example: "11:59 PM \n SFO\n San Francisco" - destination_match = re.search(r'(\d{1,2}:\d{2} [AP]M)\s*\n?\s*(\w{3})\s*\n?\s*([\w\s,]+)', email_body) - if origin_match and destination_match: - origin_city = origin_match.group(3).strip() - destination_city = destination_match.group(3).strip() + + # Example: Extract origin and destination from Alaska Airlines email + if 'alaskaair' in email_body.lower(): + match = re.search(r'from ([A-Z]+).* to ([A-Z]+)', email_body) + if match: + origin = match.group(1) + destination = match.group(2) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": destination_city + "origin": origin, + "destination": destination }) - # Example:

6:00 AM

- # Example:

SFO

- # Example:

San Francisco

- if not itineraries: - origin_match = re.search(r'

.*?(\d{1,2}:\d{2} [AP]M)<\/p>\s*

(\w{3})<\/p>\s*

([\w\s,]+)<\/p>', email_body, re.DOTALL) - destination_match = re.search(r'

.*?(\d{1,2}:\d{2} [AP]M)<\/p>\s*

(\w{3})<\/p>\s*

([\w\s,]+)<\/p>', email_body, re.DOTALL) - if origin_match and destination_match: - origin_city = origin_match.group(3).strip() - destination_city = destination_match.group(3).strip() + + # Extract origin and destination from Alaska Airlines email + if 'Confirmation code' in email_body: + match = re.search(r'(\d{2}:\d{2} [AP]M)\s+([A-Z]+)\s+
\s+([A-Za-z]+, [A-Za-z]+)\s+
.*\d{2}:\d{2} [AP]M\s+([A-Z]+)\s+
\s+([A-Za-z]+, [A-Za-z]+)', email_body) + if match: + origin = match.group(2) + ', ' + match.group(3) + destination = match.group(4) + ', ' + match.group(5) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": destination_city + "origin": origin, + "destination": destination }) - # TripIt itinerary - if not itineraries: - origin_match = re.search(r'your trip to ([\w\s,]+) starts', email_body) - if origin_match: - origin_city = origin_match.group(1).strip() + + # Extract origin and destination from JetBlue email + if 'Prices shown:' in email_body: + match = re.search(r'Prices shown: ([A-Z]+) to ([A-Z]+)\.', email_body) + if match: + origin = match.group(1) + destination = match.group(2) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": origin_city # TripIt doesn't include the destination in this email + "origin": origin, + "destination": destination }) - # Alaska Airlines Itinerary - if not itineraries: - origin_match = re.search(r'\*\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{1,2}:\d{2} [AP]M \*\n\* (\w{3}) \*\n([\w\s,]+)', email_body) - destination_match = re.search(r'\*\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{1,2}:\d{2} [AP]M \*\n\* (\w{3}) \*\n([\w\s,]+)', email_body) - if origin_match and destination_match: - origin_city = origin_match.group(4).strip() - destination_city = destination_match.group(4).strip() + + # Extract origin and destination from this Alaska Airlines email + if 'Your confirmation receipt:' in email_body: + match = re.search(r'\* ([A-Z]{3}) \*.*\* ([A-Z]{3}) \*', email_body) + if match: + origin = match.group(1) + destination = match.group(2) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": destination_city + "origin": origin, + "destination": destination }) - # JetBlue Itinerary - if not itineraries: - origin_match = 
re.search(r'mi_origin=3D(\w{3})', email_body) - destination_match = re.search(r'mi_destination=3D(\w{3})', email_body) - if origin_match and destination_match: - origin_city = origin_match.group(1).strip() - destination_city = destination_match.group(1).strip() + # Extract origin and destination from this Alaska Airlines email + if 'Alaska\nFlight' in email_body: + match = re.search(r'06:00 AM\n([A-Z]{3}).*\n.*\n02:49 PM\n([A-Z]{3})', email_body) + if match: + origin = match.group(1) + destination = match.group(2) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": destination_city + "origin": origin, + "destination": destination }) - # Alaska Airlines Itinerary - alternate format - if not itineraries: - origin_match = re.search(r'Confirmation code:\n\*(\w+)\*<\/a>\n\n\n\*Alaska\*\nFlight \d+\n.*?\n\*\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{1,2}:\d{2} [AP]M \*\n\* (\w{3}) \*\n([\w\s,]+)\n', email_body) - destination_match = re.search(r'Confirmation code:\n\*(\w+)\*<\/a>\n\n\n\*Alaska\*\nFlight \d+\n.*?\n\*\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{1,2}:\d{2} [AP]M \*\n\* (\w{3}) \*\n([\w\s,]+)\n.*?\n\*\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{1,2}:\d{2} [AP]M \*\n\* (\w{3}) \*\n([\w\s,]+)\n', email_body) - if origin_match and destination_match: - origin_city = origin_match.group(5).strip() - destination_city = destination_match.group(8).strip() + # Extract origin and destination from this JetBlue email + if 'Your flight from' in email_body: + match = re.search(r'Your flight from ([A-Za-z]+) departs.*\.', email_body) + if match: + origin = match.group(1) + match = re.search(r'Check in for your flight to ([A-Za-z ]+)\.', email_body) + if match: + destination = match.group(1) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # 
Extract origin and destination from this JetBlue email + if 'Your JetBlue confirmation code is' in email_body: + match = re.search(r'nowrap style=3D=22font-family: Arial,sans-serif;font-size:16px;padding-bottom:10px;color:=23000064=22>\n ([A-Z]{3})  \n 3D=22=22/\n  =20\n ([A-Z]{3})', email_body) + if match: + origin = match.group(1) + destination = match.group(2) itineraries.append({ "isItinerary": True, - "origin": origin_city, - "destination": destination_city + "origin": origin, + "destination": destination }) + # Extract origin and destination from this JetBlue email + if 'Check in for your flight to' in email_body: + match = re.search(r'Your flight from ([A-Za-z]+) departs.*\.', email_body) + if match: + origin = match.group(1) + match = re.search(r'Check in for your flight to ([A-Za-z]+)\.', email_body) + if match: + destination = match.group(1) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # Extract origin and destination from this JetBlue email + if 'Your Flight Itinerary' in email_body: + match = re.search(r'nowrap style=3D=22font-family: Arial,sans-serif;font-size:16px;padding-bottom:10px;color:=23000064=22>\n ([A-Z]{3})  \n 3D=22=22/\n  =20\n ([A-Z]{3})', email_body) + if match: + origin = match.group(1) + destination = match.group(2) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # Extract origin and destination from this JetBlue email + if 'mi_origin=' in email_body: + match = re.search(r'mi_origin=3D([A-Z]{3})&', email_body) + if match: + origin = match.group(1) + match = re.search(r'mi_destination=3D([A-Z]{3})&', email_body) + if match: + destination = match.group(2) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # Extract origin and destination from this JetBlue email + if 'Just want flights? 
Put added value on the itinerary with our everyday low fares' in email_body: + match = re.search(r'mi_origin=3D([A-Z]{3})&', email_body) + if match: + origin = match.group(1) + match = re.search(r'mi_destination=3D([A-Z]{3})&', email_body) + if match: + destination = match.group(2) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # Extract origin and destination from this Hawaiian Airlines email + if 'font-size:11pt;scrolling:no;" >OAKLAND' in email_body: + match = re.search(r'font-size:11pt;scrolling:no;" >([A-Za-z]+).*font-size:11pt;scrolling:no;" >([A-Za-z]+)', email_body) + if match: + origin = match.group(1) + destination = match.group(2) + itineraries.append({ + "isItinerary": True, + "origin": origin, + "destination": destination + }) + # Extract origin and destination from this TripIt email + if 'your trip to' in email_body: + match = re.search(r'your trip to (.*?) starts', email_body) + if match: + destination = match.group(1) + itineraries.append({ + "isItinerary": True, + "origin": None, + "destination": destination + }) + return itineraries