From a21a33263c1fed45470ccf677f329e8ee782abf2 Mon Sep 17 00:00:00 2001
From: Adrian Lyjak
Date: Sat, 27 Sep 2025 13:13:31 -0400
Subject: [PATCH] Update all of the templates to remove test-proj, and migrate from vibe-llama templates

---
 .gitignore                   |   5 ++
 pyproject.toml               |  38 ++++++++++++
 src/web_scraping/__init__.py |   0
 src/web_scraping/workflow.py | 132 +++++++++++++++++++++++++++++++++++
 tests/test_placeholder.py    |  12 ++++
 5 files changed, 187 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 pyproject.toml
 create mode 100644 src/web_scraping/__init__.py
 create mode 100644 src/web_scraping/workflow.py
 create mode 100644 tests/test_placeholder.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..827c8a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+workflows.db
+.venv
+.env
+package-lock.json
+node_modules
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..bb3cef5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "web-scraping"
+version = "0.1.0"
+description = "A workflow that, given several urls, scrapes and summarizes their content."
+requires-python = ">=3.10"
+readme = "README.md"
+dependencies = [
+    "llama-index-workflows>=2.5.0,<3.0.0",
+    "llama-index-llms-google-genai"
+]
+
+[dependency-groups]
+dev = [
+    "hatch>=1.14.2",
+    "pytest>=8.4.2",
+    "ruff>=0.13.2",
+    "ty>=0.0.1a21",
+]
+
+[tool.hatch.envs.default.scripts]
+format = "ruff format ."
+format-check = "ruff format --check ."
+lint = "ruff check --fix ."
+lint-check = ["ruff check ."]
+typecheck = "ty check src"
+test = "pytest"
+all-check = ["format-check", "lint-check", "test"]
+all-fix = ["format", "lint", "test"]
+
+[tool.llamadeploy]
+env_files = [".env"]
+
+[tool.llamadeploy.workflows]
+default = "web_scraping.workflow:workflow"
diff --git a/src/web_scraping/__init__.py b/src/web_scraping/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/web_scraping/workflow.py b/src/web_scraping/workflow.py
new file mode 100644
index 0000000..d932c89
--- /dev/null
+++ b/src/web_scraping/workflow.py
@@ -0,0 +1,132 @@
+"""Workflow that summarizes the content of several URLs with Gemini.
+
+Each URL passed to the workflow is summarized by a Gemini model configured with
+the URL-context tool; the per-URL summaries are concatenated into one Markdown
+document that becomes the workflow result.
+"""
+
+from llama_index.llms.google_genai import GoogleGenAI
+from llama_index.core.llms import ChatMessage
+from google.genai.types import Tool, GenerateContentConfig, UrlContext
+from typing import Annotated
+from pydantic import BaseModel
+
+from workflows import Workflow, step, Context
+from workflows.events import Event, StartEvent, StopEvent
+from workflows.resource import Resource
+
+
+model_id = "gemini-2.5-flash"
+
+url_context_tool = Tool(url_context=UrlContext())
+
+config = GenerateContentConfig(
+    tools=[url_context_tool],
+    response_modalities=["TEXT"],
+)
+
+
+class URLState(BaseModel):
+    """State shared by the steps of a single workflow run."""
+
+    # Number of URLs submitted to the run, i.e. how many summaries to wait for.
+    processed_urls: int = 0
+    # Markdown document accumulated from the individual summaries.
+    final_content: str = ""
+
+
+async def get_llm(*args, **kwargs) -> GoogleGenAI:
+    """Build the Gemini client injected into the steps as a resource."""
+    return GoogleGenAI(model=model_id, generation_config=config)
+
+
+class URLReadEvent(Event):
+    """Request to summarize a single URL."""
+
+    url: str
+
+
+class URLContentEvent(Event):
+    """Summary produced for a single URL."""
+
+    content: str
+
+
+class WebScrapeWorkflow(Workflow):
+    """Fan out one summarization request per URL and join the results."""
+
+    @step
+    async def process_urls(
+        self, ev: StartEvent, ctx: Context[URLState]
+    ) -> URLReadEvent | StopEvent | None:
+        """Record how many URLs were submitted and emit one event per URL."""
+        urls = ev.urls
+        if not urls:
+            # Without any URLContentEvent, finalize() never runs and the run
+            # would wait until its timeout (forever with timeout=None).
+            return StopEvent(result="")
+        async with ctx.store.edit_state() as state:
+            state.processed_urls = len(urls)
+        for url in urls:
+            ctx.send_event(URLReadEvent(url=url))
+        return None
+
+    @step
+    async def get_url_content(
+        self,
+        ev: URLReadEvent,
+        llm: Annotated[GoogleGenAI, Resource(get_llm)],
+        ctx: Context[URLState],
+    ) -> URLContentEvent:
+        """Ask the model to summarize ``ev.url`` and store the summary."""
+        prompt = f"Can you please summarize the context of this URL: {ev.url}"
+        # Use the async API: a blocking chat() call would stall the event loop
+        # and serialize the per-URL requests.
+        response = await llm.achat([ChatMessage(role="user", content=prompt)])
+        summary = response.message.content or ""
+        async with ctx.store.edit_state() as state:
+            state.final_content += f"### Summary for {ev.url}\n\n{summary}\n\n"
+        return URLContentEvent(content=summary)
+
+    @step
+    async def finalize(
+        self, ev: URLContentEvent, ctx: Context[URLState]
+    ) -> StopEvent | None:
+        """Stop the run once a summary has arrived for every submitted URL."""
+        state = await ctx.store.get_state()
+        events = ctx.collect_events(ev, [URLContentEvent] * state.processed_urls)
+        if events:
+            return StopEvent(result=state.final_content)
+        return None
+
+
+workflow = WebScrapeWorkflow(timeout=None)
+
+
+async def main(urls: list[str]) -> None:
+    """Run the workflow on ``urls`` and print the combined summaries."""
+    w = WebScrapeWorkflow(timeout=300)
+    result = await w.run(urls=urls)
+    print(str(result))
+
+
+if __name__ == "__main__":
+    import os
+    import asyncio
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--url",
+        help="URLs whose content needs to be summarised",
+        required=True,
+        action="append",
+    )
+    args = parser.parse_args()
+
+    if not os.getenv("GOOGLE_API_KEY"):
+        raise ValueError(
+            "You need to set GOOGLE_API_KEY in your environment before using this workflow"
+        )
+
+    asyncio.run(main(args.url))
diff --git a/tests/test_placeholder.py b/tests/test_placeholder.py
new file mode 100644
index 0000000..3384ea0
--- /dev/null
+++ b/tests/test_placeholder.py
@@ -0,0 +1,12 @@
+"""Placeholder test file.
+
+Replace this with actual tests for your project.
+"""
+
+
+def test_placeholder() -> None:
+    """Placeholder test that always passes.
+
+    Remove this test once you add real tests to your project.
+    """
+    assert True