mirror of
https://github.com/run-llama/template-workflow-web-scraping.git
synced 2026-06-30 22:07:56 -04:00
Update all of the templates to remove test-proj, and migrate from vibe-llama templates
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
workflows.db
|
||||
.venv
|
||||
.env
|
||||
package-lock.json
|
||||
node_modules
|
||||
@@ -0,0 +1,38 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "web-scraping"
|
||||
version = "0.1.0"
|
||||
description = "A workflow that, given several URLs, scrapes and summarizes their content."
|
||||
requires-python = ">=3.10"
|
||||
readme = "README.md"
|
||||
dependencies = [
|
||||
"llama-index-workflows>=2.5.0,<3.0.0",
|
||||
"llama-index-llms-google-genai"
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"hatch>=1.14.2",
|
||||
"pytest>=8.4.2",
|
||||
"ruff>=0.13.2",
|
||||
"ty>=0.0.1a21",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.default.scripts]
|
||||
format = "ruff format ."
|
||||
format-check = "ruff format --check ."
|
||||
lint = "ruff check --fix ."
|
||||
lint-check = ["ruff check ."]
|
||||
typecheck = "ty check src"
|
||||
test = "pytest"
|
||||
all-check = ["format-check", "lint-check", "test"]
|
||||
all-fix = ["format", "lint", "test"]
|
||||
|
||||
[tool.llamadeploy]
|
||||
env_files = [".env"]
|
||||
|
||||
[tool.llamadeploy.workflows]
|
||||
default = "web_scraping.workflow:workflow"
|
||||
@@ -0,0 +1,108 @@
|
||||
from llama_index.llms.google_genai import GoogleGenAI
|
||||
from llama_index.core.llms import ChatMessage
|
||||
from google.genai.types import Tool, GenerateContentConfig, UrlContext
|
||||
from typing import Annotated
|
||||
from pydantic import BaseModel
|
||||
|
||||
from workflows import Workflow, step, Context
|
||||
from workflows.events import Event, StartEvent, StopEvent
|
||||
from workflows.resource import Resource
|
||||
|
||||
|
||||
# Identifier of the Gemini model handed to GoogleGenAI in get_llm().
model_id = "gemini-2.5-flash"

# Tool wrapping a default UrlContext; it is the only tool registered below.
url_context_tool = Tool(url_context=UrlContext())

# Generation settings shared by every LLM instance: text-only responses with
# the URL-context tool attached.
config = GenerateContentConfig(
    response_modalities=["TEXT"],
    tools=[url_context_tool],
)
|
||||
|
||||
|
||||
class URLState(BaseModel):
    """State shared between the steps of ``WebScrapeWorkflow``."""

    # Number of URLs submitted to the run; set once by the first step and
    # used by the last step to know how many results to wait for.
    processed_urls: int = 0

    # Text accumulated across all per-URL summaries; returned as the result.
    final_content: str = ""
|
||||
|
||||
|
||||
async def get_llm(*args, **kwargs) -> GoogleGenAI:
    """Build the LLM resource injected into workflow steps.

    Any positional or keyword arguments are accepted and ignored; the client
    is always created from the module-level ``model_id`` and ``config``.
    """
    client = GoogleGenAI(generation_config=config, model=model_id)
    return client
|
||||
|
||||
|
||||
class URLReadEvent(Event):
    """Request to summarize a single URL."""

    # The address whose content should be summarized.
    url: str
|
||||
|
||||
|
||||
class URLContentEvent(Event):
    """Result of summarizing a single URL."""

    # Summary text produced by the LLM (empty string when none was returned).
    content: str
|
||||
|
||||
|
||||
class WebScrapeWorkflow(Workflow):
    """Summarize a list of URLs, one LLM request per URL.

    Steps:
        1. ``process_urls`` records how many URLs were submitted and emits
           one ``URLReadEvent`` per URL.
        2. ``get_url_content`` asks the LLM to summarize a single URL and
           appends the summary to the shared state.
        3. ``finalize`` waits until a ``URLContentEvent`` has arrived for
           every submitted URL, then stops the workflow with the
           accumulated text as its result.
    """

    @step
    async def process_urls(
        self, ev: StartEvent, ctx: Context[URLState]
    ) -> URLReadEvent | StopEvent | None:
        """Fan out one ``URLReadEvent`` per entry of ``ev.urls``.

        Returns:
            A ``StopEvent`` with an empty result when no URLs were given,
            otherwise ``None`` (the per-URL events are sent through ``ctx``).
        """
        urls = ev.urls
        async with ctx.store.edit_state() as state:
            state.processed_urls = len(urls)

        if not urls:
            # With nothing to fan out, no URLContentEvent would ever reach
            # finalize() and the run would never stop; end it right away.
            return StopEvent(result="")

        for url in urls:
            ctx.send_event(URLReadEvent(url=url))
        return None

    @step
    async def get_url_content(
        self,
        ev: URLReadEvent,
        llm: Annotated[GoogleGenAI, Resource(get_llm)],
        ctx: Context[URLState],
    ) -> URLContentEvent:
        """Ask the LLM for a summary of ``ev.url`` and record it in the state.

        Returns:
            A ``URLContentEvent`` carrying the summary text (empty string if
            the model returned no content).
        """
        prompt = ChatMessage(
            role="user",
            content=f"Can you please summarize the context of this URL: {ev.url}",
        )
        # Use the async client call so this coroutine does not block the
        # event loop while the request is in flight.
        response = await llm.achat([prompt])

        # Normalize once so the state and the event agree, and a missing
        # response is not rendered as the literal text "None".
        summary = response.message.content or ""

        async with ctx.store.edit_state() as state:
            # Real newlines (not the two characters backslash + n) so each
            # heading and summary lands on its own paragraph.
            state.final_content += f"### Summary for {ev.url}\n\n{summary}\n\n"

        return URLContentEvent(content=summary)

    @step
    async def finalize(
        self, ev: URLContentEvent, ctx: Context[URLState]
    ) -> StopEvent | None:
        """Stop the workflow once every URL has produced a result.

        Returns:
            ``None`` while results are still outstanding; a ``StopEvent``
            holding the accumulated ``final_content`` once
            ``processed_urls`` events have been collected.
        """
        state = await ctx.store.get_state()
        expected = [URLContentEvent] * state.processed_urls
        collected = ctx.collect_events(ev, expected)
        if not collected:
            return None
        return StopEvent(result=state.final_content)
|
||||
|
||||
|
||||
# Module-level instance with no timeout; this is the object named by the
# "web_scraping.workflow:workflow" entry in the project's llamadeploy config.
workflow = WebScrapeWorkflow(timeout=None)
|
||||
|
||||
|
||||
async def main(urls: list[str]):
    """Run a fresh workflow (300 second timeout) on ``urls`` and print the result."""
    runner = WebScrapeWorkflow(timeout=300)
    summary = await runner.run(urls=urls)
    print(str(summary))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: summarize every URL passed via repeated --url.
    import asyncio
    import os
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument(
        "--url",
        action="append",
        required=True,
        help="URLs whose content needs to be summarised",
    )
    parsed = cli.parse_args()

    # Refuse to start without an API key in the environment.
    api_key = os.getenv("GOOGLE_API_KEY", None)
    if not api_key:
        raise ValueError(
            "You need to set GOOGLE_API_KEY in your environment before using this workflow"
        )

    asyncio.run(main(parsed.url))
|
||||
@@ -0,0 +1,12 @@
|
||||
"""Placeholder test file.
|
||||
|
||||
Replace this with actual tests for your project.
|
||||
"""
|
||||
|
||||
|
||||
def test_placeholder() -> None:
    """Trivially passing test that keeps the suite non-empty.

    Delete it as soon as the project has real tests.
    """
    assert True
|
||||
Reference in New Issue
Block a user