Release 0.5.20 (#671 )

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
feat: [server] Add Human in the Loop example with FastAPI integration (#630 )
2026-07-02 19:14:28 -04:00 · 2025-06-02 18:02:05 +07:00 · 2025-06-02 17:47:04 +07:00 · 2025-05-31 14:08:29 +07:00 · 2025-05-30 09:52:17 +07:00 · 2025-05-29 21:05:53 +07:00
56 changed files with 1026 additions and 289 deletions
@@ -1,5 +1,11 @@
 # create-llama

+## 0.5.20
+
+### Patch Changes
+
+- 3ff0a18: fix: default header padding
+
 ## 0.5.19

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "create-llama",
-  "version": "0.5.19",
+  "version": "0.5.20",
  "description": "Create LlamaIndex-powered apps with one command",
  "keywords": [
    "rag",
@@ -4,7 +4,7 @@ import { Sparkles, Star } from "lucide-react";

 export default function Header() {
  return (
-    <div className="flex items-center justify-between px-4 pt-2">
+    <div className="flex items-center justify-between p-2 px-4">
      <div className="flex items-center gap-2">
        <Sparkles className="size-4" />
        <h1 className="font-semibold">LlamaIndex App</h1>
@@ -1,5 +1,19 @@
 # @llamaindex/server

+## 0.2.6
+
+### Patch Changes
+
+- 3ff0a18: fix: default header padding
+- df10474: fix: missing cursor pointer for button
+- 087c961: Support zod and chat-ui hooks for custom components
+
+## 0.2.5
+
+### Patch Changes
+
+- 058b376: Fix generate script for ejected project
+
 ## 0.2.4

 ### Patch Changes
@@ -4,7 +4,7 @@ import { Sparkles, Star } from "lucide-react";

 export default function Header() {
  return (
-    <div className="flex items-center justify-between px-4 pt-2">
+    <div className="flex items-center justify-between p-2 px-4">
      <div className="flex items-center gap-2">
        <Sparkles className="size-4" />
        <h1 className="font-semibold">LlamaIndex App</h1>
@@ -32,7 +32,10 @@ export default function CustomChatMessages({
            <ChatMessage.Actions />
          </ChatMessage>
        ))}
-        <ChatMessages.Empty />
+        <ChatMessages.Empty
+          heading="Hello there!"
+          subheading="I'm here to help you with your questions."
+        />
        <ChatMessages.Loading />
      </ChatMessages.List>
      <ChatStarter />
@@ -1,7 +1,7 @@
 "use client";

 import {
-  getChatUIAnnotation,
+  getAnnotationData,
  JSONValue,
  MessageAnnotation,
  MessageAnnotationType,
@@ -25,9 +25,8 @@ export const DynamicEvents = ({
  componentDefs: ComponentDef[];
  appendError: (error: string) => void;
 }) => {
-  const {
-    message: { annotations },
-  } = useChatMessage();
+  const { message } = useChatMessage();
+  const annotations = message.annotations;

  const shownWarningsRef = useRef<Set<string>>(new Set()); // track warnings
  const [hasErrors, setHasErrors] = useState(false);
@@ -43,15 +42,16 @@ export const DynamicEvents = ({

    const availableComponents = new Set(componentDefs.map((comp) => comp.type));

-    annotations.forEach((annotation: MessageAnnotation) => {
+    annotations.forEach((item: JSONValue) => {
+      const annotation = item as MessageAnnotation;
      const type = annotation.type;
-      if (!type) return; // skip if annotation doesn't have a type
+      if (!type) return; // Skip if annotation doesn't have a type

-      const events = getChatUIAnnotation(annotations, type);
+      const events = getAnnotationData<JSONValue>(message, type);

      // Skip if it's a built-in component or if we've already shown the warning
      if (
-        BUILT_IN_CHATUI_COMPONENTS.includes(type) ||
+        BUILT_IN_CHATUI_COMPONENTS.includes(type as MessageAnnotationType) ||
        shownWarningsRef.current.has(type)
      ) {
        return;
@@ -69,7 +69,7 @@ export const DynamicEvents = ({

  const components: EventComponent[] = componentDefs
    .map((comp) => {
-      const events = getChatUIAnnotation(annotations, comp.type) as JSONValue[]; // get all event data by type
+      const events = getAnnotationData<JSONValue>(message, comp.type);
      if (!events?.length) return null;
      return { ...comp, events };
    })
@@ -67,6 +67,9 @@ export const SOURCE_MAP: Record<string, () => Promise<any>> = {
    import("../../../toggle-group"),
  [`${SHADCN_IMPORT_PREFIX}/tooltip`]: () => import("../../../tooltip"),

+  ///// CHAT_UI GENERAL  /////
+  [`@llamaindex/chat-ui`]: () => import("@llamaindex/chat-ui"),
+
  ///// WIDGETS FROM CHAT_UI /////
  [`@llamaindex/chat-ui/widgets`]: () => import("@llamaindex/chat-ui/widgets"),

@@ -76,6 +79,9 @@ export const SOURCE_MAP: Record<string, () => Promise<any>> = {
  ///// UTILS /////
  [`@/components/lib/utils`]: () => import("../../../lib/utils"),
  [`@/lib/utils`]: () => import("../../../lib/utils"), // for v0 compatibility
+
+  ///// ZOD /////
+  [`zod`]: () => import("zod"),
 };

 // parse imports from code to get Function constructor arguments and component name
@@ -122,7 +128,7 @@ export async function parseImports(code: string) {
  const importPromises = imports.map(async ({ name, source }) => {
    if (!(source in SOURCE_MAP)) {
      throw new Error(
-        `Fail to import ${name} from ${source}. Reason: Module not found. \nCurrently we only support importing UI components from Shadcn components, widgets from "llamaindex/chat-ui/widgets" and icons from "lucide-react"`,
+        `Fail to import ${name} from ${source}. Reason: Module not found. \nCurrently we only support importing UI components from Shadcn components, widgets and hooks from "llamaindex/chat-ui", icons from "lucide-react" and zod for data validation.`,
      );
    }
    try {
@@ -1,7 +1,9 @@
 "use client";

-import { SourceData } from "@llamaindex/chat-ui";
-import { Markdown as MarkdownUI } from "@llamaindex/chat-ui/widgets";
+import {
+  Markdown as MarkdownUI,
+  SourceData,
+} from "@llamaindex/chat-ui/widgets";
 import { getConfig } from "../../lib/utils";
 const preprocessMedia = (content: string) => {
  // Remove `sandbox:` from the beginning of the URL before rendering markdown
@@ -4,7 +4,7 @@ import { Sparkles, Star } from "lucide-react";

 export function DefaultHeader() {
  return (
-    <div className="flex items-center justify-between px-4 pt-2">
+    <div className="flex items-center justify-between p-2 px-4">
      <div className="flex items-center gap-2">
        <Sparkles className="size-4" />
        <h1 className="font-semibold">LlamaIndex App</h1>
@@ -2,8 +2,7 @@

 import {
  Message,
-  MessageAnnotation,
-  getChatUIAnnotation,
+  getAnnotationData,
  useChatMessage,
  useChatUI,
 } from "@llamaindex/chat-ui";
@@ -21,13 +20,10 @@ export function ToolAnnotations() {
    [messages, message],
  );
  // Get the tool data from the message annotations
-  const annotations = message.annotations as MessageAnnotation[] | undefined;
-  const toolData = annotations
-    ? (getChatUIAnnotation(annotations, "tools") as unknown as ToolData[])
-    : null;
-  return toolData?.[0] ? (
-    <ChatTools data={toolData[0]} artifactVersion={artifactVersion} />
-  ) : null;
+  const toolData = getAnnotationData<ToolData>(message, "tools");
+  if (toolData.length === 0) return null;
+
+  return <ChatTools data={toolData[0]} artifactVersion={artifactVersion} />;
 }

 // TODO: Used to render outputs of tools. If needed, add more renderers here.
@@ -83,9 +79,7 @@ function getArtifactVersion(
  if (!messageId) return undefined;
  let versionIndex = 1;
  for (const m of messages) {
-    const toolData = m.annotations
-      ? (getChatUIAnnotation(m.annotations, "tools") as unknown as ToolData[])
-      : null;
+    const toolData = getAnnotationData<ToolData>(m, "tools");

    if (toolData?.some((t) => t.toolCall.name === "artifact")) {
      if ("id" in m && m.id === messageId) {
@@ -91,6 +91,13 @@
  ::file-selector-button {
    border-color: var(--color-gray-200, currentColor);
  }
+
+  /* Tailwind v4 removed the pointer cursor from buttons and uses the default cursor instead */
+  /* https://github.com/shadcn-ui/ui/issues/6843#issuecomment-2696947980 */
+  button:not([disabled]),
+  [role="button"]:not([disabled]) {
+    cursor: pointer;
+  }
 }

@layer base {
@@ -1,7 +1,7 @@
 {
  "name": "@llamaindex/server",
  "description": "LlamaIndex Server",
-  "version": "0.2.4",
+  "version": "0.2.6",
  "type": "module",
  "main": "./dist/index.cjs",
  "module": "./dist/index.js",
@@ -65,7 +65,7 @@
    "@babel/traverse": "^7.27.0",
    "@babel/types": "^7.27.0",
    "@hookform/resolvers": "^5.0.1",
-    "@llamaindex/chat-ui": "0.4.6",
+    "@llamaindex/chat-ui": "0.4.9",
    "@radix-ui/react-accordion": "^1.2.3",
    "@radix-ui/react-alert-dialog": "^1.1.7",
    "@radix-ui/react-aspect-ratio": "^1.1.3",
@@ -12,7 +12,7 @@
    "format": "prettier --ignore-unknown --cache --check .",
    "format:write": "prettier --ignore-unknown --write .",
    "typecheck": "tsc --noEmit",
-    "generate": "tsx app\\api\\chat\\generate.ts"
+    "generate": "tsx app/api/chat/generate.ts"
  },
  "devDependencies": {
    "@eslint/eslintrc": "^3",
@@ -41,7 +41,7 @@
    "@babel/traverse": "^7.27.0",
    "@babel/types": "^7.27.0",
    "@hookform/resolvers": "^5.0.1",
-    "@llamaindex/chat-ui": "0.4.5",
+    "@llamaindex/chat-ui": "0.4.9",
    "@llamaindex/env": "~0.1.30",
    "@llamaindex/openai": "~0.4.0",
    "@llamaindex/readers": "~3.1.4",
@@ -181,8 +181,8 @@ importers:
        specifier: ^5.0.1
        version: 5.0.1(react-hook-form@7.56.1(react@19.1.0))
      '@llamaindex/chat-ui':
-        specifier: 0.4.6
-        version: 0.4.6(@babel/runtime@7.27.0)(@codemirror/autocomplete@6.18.6)(@codemirror/language@6.11.0)(@codemirror/lint@6.8.5)(@codemirror/search@6.5.10)(@codemirror/state@6.5.2)(@codemirror/theme-one-dark@6.1.2)(@codemirror/view@6.36.7)(@types/react-dom@19.1.2(@types/react@19.1.2))(@types/react@19.1.2)(codemirror@6.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+        specifier: 0.4.9
+        version: 0.4.9(@babel/runtime@7.27.0)(@codemirror/autocomplete@6.18.6)(@codemirror/language@6.11.0)(@codemirror/lint@6.8.5)(@codemirror/search@6.5.10)(@codemirror/state@6.5.2)(@codemirror/theme-one-dark@6.1.2)(@codemirror/view@6.36.7)(@types/react-dom@19.1.2(@types/react@19.1.2))(@types/react@19.1.2)(codemirror@6.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
      '@llamaindex/env':
        specifier: ~0.1.30
        version: 0.1.30
@@ -1189,8 +1189,8 @@ packages:
      zod:
        optional: true

-  '@llamaindex/chat-ui@0.4.6':
-    resolution: {integrity: sha512-XvJEv/rv//8vY9Z4RosbmTyPDQFyVaWlQFe0zrJ4inz+aYqHhYtEiSCmQGgPQG+NqWStlTwpOpCye1jy4mWciQ==}
+  '@llamaindex/chat-ui@0.4.9':
+    resolution: {integrity: sha512-KEdydC+aJ22VK/TltxIHlMWbWLfh6I0YkyVd1D/CS3FRfLt8l9jfQ/YjY10MiEd8oc1fFfk6ek/FhVWe9Szstg==}
    peerDependencies:
      react: ^18.2.0 || ^19.0.0 || ^19.0.0-rc

@@ -7219,7 +7219,7 @@ snapshots:
      p-retry: 6.2.1
      zod: 3.24.3

-  '@llamaindex/chat-ui@0.4.6(@babel/runtime@7.27.0)(@codemirror/autocomplete@6.18.6)(@codemirror/language@6.11.0)(@codemirror/lint@6.8.5)(@codemirror/search@6.5.10)(@codemirror/state@6.5.2)(@codemirror/theme-one-dark@6.1.2)(@codemirror/view@6.36.7)(@types/react-dom@19.1.2(@types/react@19.1.2))(@types/react@19.1.2)(codemirror@6.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
+  '@llamaindex/chat-ui@0.4.9(@babel/runtime@7.27.0)(@codemirror/autocomplete@6.18.6)(@codemirror/language@6.11.0)(@codemirror/lint@6.8.5)(@codemirror/search@6.5.10)(@codemirror/state@6.5.2)(@codemirror/theme-one-dark@6.1.2)(@codemirror/view@6.36.7)(@types/react-dom@19.1.2(@types/react@19.1.2))(@types/react@19.1.2)(codemirror@6.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
    dependencies:
      '@codemirror/lang-css': 6.3.1
      '@codemirror/lang-html': 6.4.9
@@ -1,5 +1,23 @@
 # @create-llama/llama-index-server

+## 0.1.20
+
+### Patch Changes
+
+- 087c961: Add support for human-in-the-loop
+- 087c961: Refactor models.py into a separate module
+- Updated dependencies [3ff0a18]
+- Updated dependencies [df10474]
+- Updated dependencies [087c961]
+  - @llamaindex/server@0.2.6
+
+## 0.1.19
+
+### Patch Changes
+
+- Updated dependencies [058b376]
+  - @llamaindex/server@0.2.5
+
 ## 0.1.18

 ### Patch Changes
@@ -8,6 +8,7 @@ LlamaIndexServer is a FastAPI-based application that allows you to quickly launc
 - Built on FastAPI for high performance and easy API development
 - Optional built-in chat UI with extendable UI components
 - Prebuilt development code
+- Human-in-the-loop (HITL) support. See the [Human-in-the-loop](https://github.com/run-llama/create-llama/blob/main/python/llama-index-server/examples/hitl/README.md) documentation for more details.

 ## Installation

@@ -16,7 +16,8 @@ from llama_index.core.workflow import (
    Workflow,
    step,
 )
-from llama_index.server.api.models import (
+from llama_index.server.api.utils import get_last_artifact
+from llama_index.server.models import (
    Artifact,
    ArtifactEvent,
    ArtifactType,
@@ -24,7 +25,6 @@ from llama_index.server.api.models import (
    CodeArtifactData,
    UIEvent,
 )
-from llama_index.server.api.utils import get_last_artifact


 class Requirement(BaseModel):
@@ -16,7 +16,8 @@ from llama_index.core.workflow import (
    Workflow,
    step,
 )
-from llama_index.server.api.models import (
+from llama_index.server.api.utils import get_last_artifact
+from llama_index.server.models import (
    Artifact,
    ArtifactEvent,
    ArtifactType,
@@ -24,7 +25,6 @@ from llama_index.server.api.models import (
    DocumentArtifactData,
    UIEvent,
 )
-from llama_index.server.api.utils import get_last_artifact


 class DocumentRequirement(BaseModel):
@@ -4,7 +4,7 @@ import { Sparkles, Star } from "lucide-react";

 export default function Header() {
  return (
-    <div className="flex items-center justify-between px-4 pt-2">
+    <div className="flex items-center justify-between p-2 px-4">
      <div className="flex items-center gap-2">
        <Sparkles className="size-4" />
        <h1 className="font-semibold">Artifact Workflow</h1>
@@ -7,7 +7,7 @@ from examples.artifact.code_workflow import ArtifactWorkflow
 from llama_index.core.workflow import Workflow
 from llama_index.llms.openai import OpenAI
 from llama_index.server import LlamaIndexServer, UIConfig
-from llama_index.server.api.models import ChatRequest
+from llama_index.server.models import ChatRequest


 def create_workflow(chat_request: ChatRequest) -> Workflow:
@@ -0,0 +1,74 @@
+# Human in the Loop
+
+This example shows how to use the LlamaIndexServer with a human in the loop.
+
+## AgentWorkflow
+
+```bash
+uv run -- agent_workflow.py
+```
+
+## Custom Workflow
+
+```bash
+uv run -- custom_workflow.py
+```
+
+## How does it work?
+The human-in-the-loop approach used here is based on a simple idea: the workflow pauses and waits for a human response before proceeding to the next step.
+
+To do this, you will need to implement two custom events:
+- [HumanInputEvent](../../llama_index/server/models/hitl.py#L10): This event is used to request input from the user.
+- [HumanResponseEvent](../../llama_index/server/models/hitl.py#L43): This event is sent to the workflow to resume execution with input from the user.
+
+In this example, we have implemented these two custom events:  
+
+- [CLIHumanInputEvent](events.py#L20) – to request input from the user for CLI command execution.
+- [CLIHumanResponseEvent](events.py#L8) – to resume the workflow with the response from the user.
+
+We also have a custom component, [cli_human_input.tsx](./components/cli_human_input.tsx), which displays a card where the user can update the command and choose whether to execute it or cancel the execution.
+
+To make the [AgentWorkflow](agent_workflow.py) work, we use the `wait_for_event()` method to wait for the human response when a tool is called.
+
+Example:
+```python
+async def cli_executor(ctx: Context, command: str) -> str:
+    """
+    This tool carefully waits for user confirmation before executing a command.
+    """
+    confirmation = await ctx.wait_for_event(
+        CLIHumanResponseEvent,
+        waiter_event=CLIHumanInputEvent(
+            data=CLICommand(command=command),
+        ),
+    )
+    if confirmation.execute:
+        # Execute the command
+        ...
+    else:
+        # Cancel the command
+        ...
+
+```
+
+And for the [Custom Workflow](custom_workflow.py), we can define a step that sends the `CLIHumanInputEvent` and another step that waits for the `CLIHumanResponseEvent`.
+
+Example:
+```python
+@step
+async def request_input(self, ctx: Context, ev: StartEvent) -> CLIHumanInputEvent:
+    ...
+    return CLIHumanInputEvent(
+        data=CLICommand(command=command),
+        response_event_type=CLIHumanResponseEvent,
+    )
+
+@step
+async def handle_human_response(self, ctx: Context, ev: CLIHumanResponseEvent) -> StopEvent:
+    if ev.execute:
+        # Execute the command
+        ...
+    else:
+        # Cancel the command
+        ...
+```
@@ -0,0 +1,60 @@
+import subprocess
+
+from events import CLICommand, CLIHumanInputEvent, CLIHumanResponseEvent
+from fastapi import FastAPI
+
+from llama_index.core.agent.workflow import AgentWorkflow
+from llama_index.core.workflow import Context
+from llama_index.llms.openai import OpenAI
+from llama_index.server import LlamaIndexServer, UIConfig
+
+
+# Tool function registered with the AgentWorkflow in create_workflow().
+# It suspends on ctx.wait_for_event() until a CLIHumanResponseEvent arrives and
+# then either runs the command or reports that execution was cancelled.
+# NOTE(review): the docstring below is presumably surfaced to the LLM as the
+# tool description -- confirm before rewording it.
+async def cli_executor(ctx: Context, command: str) -> str:
+    """
+    This tool carefully waits for user confirmation before executing a command.
+    """
+    # The CLIHumanInputEvent carrying the proposed command is passed as the
+    # waiter event; execution resumes once a CLIHumanResponseEvent is received.
+    confirmation = await ctx.wait_for_event(
+        CLIHumanResponseEvent,
+        waiter_event=CLIHumanInputEvent(
+            data=CLICommand(command=command),
+        ),
+    )
+    if confirmation.execute:
+        # Runs the command carried by the response (confirmation.command), not the
+        # `command` argument. check_output raises CalledProcessError when the
+        # command exits with a non-zero status.
+        return subprocess.check_output(confirmation.command, shell=True).decode("utf-8")
+    else:
+        return "Command execution cancelled."
+
+
+# Builds the AgentWorkflow for this example: a single tool (cli_executor), an
+# OpenAI "gpt-4.1-mini" LLM, and a system prompt.
+def create_workflow() -> AgentWorkflow:
+    return AgentWorkflow.from_tools_or_functions(
+        tools_or_functions=[cli_executor],
+        llm=OpenAI(model="gpt-4.1-mini"),
+        # The prompt tells the model not to ask for confirmation itself;
+        # cli_executor waits for the human response before running anything.
+        system_prompt="""
+        You are a helpful assistant that help the user execute commands.
+        You can execute commands using the cli_executor tool, don't need to ask for confirmation for triggering the tool.
+        """,
+    )
+
+
+# Creates the server application for the AgentWorkflow example.
+# create_workflow is passed as the workflow factory, next-question suggestions
+# are turned off, and the UI is configured with two starter questions.
+def create_app() -> FastAPI:
+    app = LlamaIndexServer(
+        workflow_factory=create_workflow,
+        suggest_next_questions=False,
+        ui_config=UIConfig(
+            starter_questions=[
+                "List all files in the current directory",
+                "Fetch changes from the remote repository",
+            ],
+            # presumably the directory holding custom UI components such as
+            # cli_human_input.tsx -- TODO confirm against UIConfig
+            component_dir="components",
+        ),
+    )
+    return app
+
+
+# Module-level application instance; the "agent_workflow:app" import string
+# passed to uvicorn below refers to this name.
+app = create_app()
+
+
+if __name__ == "__main__":
+    # uvicorn is imported only when the file is executed as a script.
+    import uvicorn
+
+    # Serve on port 8000 with auto-reload enabled.
+    uvicorn.run("agent_workflow:app", port=8000, reload=True)
@@ -0,0 +1,96 @@
+import { JSONValue, useChatUI } from "@llamaindex/chat-ui";
+import React, { FC, useState } from "react";
+import { Button } from "@/components/ui/button";
+import { Card, CardContent, CardFooter } from "@/components/ui/card";
+import { z } from "zod";
+
+// This schema is equivalent to the CLICommand model defined in events.py
+const CLIInputEventSchema = z.object({
+  command: z.string(),
+});
+type CLIInputEvent = z.infer<typeof CLIInputEventSchema>;
+
+
+/**
+ * Card that asks the user whether a proposed CLI command should be executed.
+ *
+ * `events` is validated against CLIInputEventSchema; the last entry that parses
+ * successfully supplies the command to display. Clicking "Yes" or "No" appends a
+ * user message carrying a `human_response` annotation and hides the buttons.
+ */
+const CLIHumanInput: FC<{
+  events: JSONValue[];
+}> = ({ events }) => {
+  // Keep only events matching the schema and take the most recent one
+  // (undefined when nothing parses).
+  const inputEvent = (events || [])
+    .map((ev) => {
+      const parseResult = CLIInputEventSchema.safeParse(ev);
+      return parseResult.success ? parseResult.data : null;
+    })
+    .filter((ev): ev is CLIInputEvent => ev !== null)
+    .at(-1);
+
+  const { append } = useChatUI();
+  // null = no answer yet, true = confirmed, false = cancelled
+  const [confirmedValue, setConfirmedValue] = useState<boolean | null>(null);
+  const [editableCommand, setEditableCommand] = useState<string | undefined>(
+    inputEvent?.command,
+  );
+
+  // Update editableCommand if inputEvent changes (e.g. new event comes in)
+  React.useEffect(() => {
+    setEditableCommand(inputEvent?.command);
+  }, [inputEvent?.command]);
+
+  // Reply "Yes": sends execute=true together with the command currently held
+  // in editableCommand.
+  const handleConfirm = () => {
+    append({
+      content: "Yes",
+      role: "user",
+      annotations: [
+        {
+          type: "human_response",
+          data: {
+            execute: true,
+            command: editableCommand, // Use editable command
+          },
+        },
+      ],
+    });
+    setConfirmedValue(true);
+  };
+
+  // Reply "No": sends execute=false together with the originally proposed command.
+  const handleCancel = () => {
+    append({
+      content: "No",
+      role: "user",
+      annotations: [
+        {
+          type: "human_response",
+          data: {
+            execute: false,
+            command: inputEvent?.command,
+          },
+        },
+      ],
+    });
+    setConfirmedValue(false);
+  };
+
+  // NOTE(review): the <input> below is rendered with `disabled`, so the user
+  // cannot type into it even though an onChange handler is wired up -- confirm
+  // whether editing the command is meant to be possible.
+  // The Yes/No buttons are shown only while confirmedValue is still null.
+  return (
+    <Card className="my-4">
+      <CardContent className="pt-6">
+        <p className="text-sm text-gray-700">
+          Do you want to execute the following command?
+        </p>
+        <input
+          disabled
+          type="text"
+          value={editableCommand || ""}
+          onChange={(e) => setEditableCommand(e.target.value)}
+          className="bg-gray-100 rounded p-3 my-2 text-xs font-mono text-gray-800 overflow-x-auto w-full border border-gray-300"
+        />
+      </CardContent>
+      {confirmedValue === null ? (
+        <CardFooter className="flex justify-end gap-2">
+          <>
+            <Button onClick={handleConfirm}>Yes</Button>
+            <Button onClick={handleCancel}>No</Button>
+          </>
+        </CardFooter>
+      ) : null}
+    </Card>
+  );
+};
+
+export default CLIHumanInput;
@@ -0,0 +1,109 @@
+import platform
+import subprocess
+from typing import Any
+
+from events import CLICommand, CLIHumanInputEvent, CLIHumanResponseEvent
+from fastapi import FastAPI
+
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.settings import Settings
+from llama_index.core.workflow import (
+    Context,
+    StartEvent,
+    StopEvent,
+    Workflow,
+    step,
+)
+from llama_index.server import LlamaIndexServer, UIConfig
+
+
+class CLIWorkflow(Workflow):
+    """
+    A workflow that can execute command-line tools, with a human in the loop for confirmation.
+
+    Steps:
+      1. `start` asks the LLM from `Settings.llm` to turn the user's request into a
+         single CLI command and returns a `CLIHumanInputEvent` carrying it.
+      2. `handle_human_response` receives the `CLIHumanResponseEvent` and either runs
+         the command in a shell or stops without running anything.
+    """
+
+    # Prompt used to generate the command; {cli_language} and {user_request} are
+    # filled in by start().
+    default_prompt = PromptTemplate(
+        template="""
+        You are a helpful assistant who can write CLI commands to execute using {cli_language}.
+        Your task is to analyze the user's request and write a CLI command to execute.
+
+        ## User Request
+        {user_request}
+
+        Don't be verbose, only respond with the CLI command without any other text.
+        """
+    )
+
+    def __init__(self, **kwargs: Any) -> None:
+        # A HITL workflow must disable the timeout; otherwise we get a timeout error
+        # from the callback. This overrides any `timeout` passed by the caller.
+        kwargs["timeout"] = None
+        super().__init__(**kwargs)
+
+    @step
+    async def start(self, ctx: Context, ev: StartEvent) -> CLIHumanInputEvent:
+        user_msg = ev.user_msg
+        if user_msg is None:
+            raise ValueError("Missing user_msg in StartEvent")
+        # Keep the original request in the workflow context (not read back in this file).
+        await ctx.set("user_msg", user_msg)
+        # Pick the shell language from the host OS, then ask the LLM to generate a CLI command
+        os_name = platform.system()
+        if os_name == "Linux" or os_name == "Darwin":
+            cli_language = "bash"
+        else:
+            cli_language = "cmd"
+        prompt = self.default_prompt.format(
+            user_request=user_msg, cli_language=cli_language
+        )
+        llm = Settings.llm
+        if llm is None:
+            raise ValueError("Missing LLM in Settings")
+        response = await llm.acomplete(prompt, formatted=True)
+        command = response.text.strip()
+        if command == "":
+            raise ValueError("Couldn't generate a command")
+        # Send the command to the user for confirmation
+        await ctx.set("command", command)
+        return CLIHumanInputEvent(  # type: ignore
+            data=CLICommand(command=command),
+            response_event_type=CLIHumanResponseEvent,
+        )
+
+    @step
+    async def handle_human_response(
+        self,
+        ctx: Context,
+        ev: CLIHumanResponseEvent,  # This event is sent by LlamaIndexServer when the user responds
+    ) -> StopEvent:
+        # If we have human response, check the confirmation and execute the command
+        if ev.execute:
+            command = ev.command or ""
+            if command == "":
+                # NOTE(review): the message names "CLIExecutionEvent", but the event
+                # handled here is CLIHumanResponseEvent -- confirm the intended wording.
+                raise ValueError("Missing command in CLIExecutionEvent")
+            # Run through the shell, capturing stdout/stderr as text; the result is
+            # stdout, or stderr when stdout is empty.
+            res = subprocess.run(command, shell=True, capture_output=True, text=True)
+            return StopEvent(result=res.stdout or res.stderr)
+        else:
+            # The user declined: finish without running anything.
+            return StopEvent(result=None)
+
+
+# Creates the server application for the custom-workflow example.
+# The workflow factory returns a new CLIWorkflow on each call, next-question
+# suggestions are turned off, and the UI is configured with two starter questions.
+def create_app() -> FastAPI:
+    app = LlamaIndexServer(
+        workflow_factory=lambda: CLIWorkflow(),
+        suggest_next_questions=False,
+        ui_config=UIConfig(
+            starter_questions=[
+                "List all files in the current directory",
+                "Fetch changes from the remote repository",
+            ],
+            # presumably the directory holding custom UI components such as
+            # cli_human_input.tsx -- TODO confirm against UIConfig
+            component_dir="components",
+        ),
+    )
+    return app
+
+
+# Module-level application instance; the "custom_workflow:app" import string
+# passed to uvicorn below refers to this name.
+app = create_app()
+
+
+if __name__ == "__main__":
+    # uvicorn is imported only when the file is executed as a script.
+    import uvicorn
+
+    # Serve on port 8000 with auto-reload enabled.
+    uvicorn.run("custom_workflow:app", port=8000, reload=True)
@@ -0,0 +1,34 @@
+from typing import Type
+
+from pydantic import BaseModel, Field
+
+from llama_index.server.models import HumanInputEvent, HumanResponseEvent
+
+
+# Response event that resumes the workflow after the user answers a
+# CLIHumanInputEvent. The UI component sends `execute` and `command` as the data
+# of a "human_response" annotation; presumably the server builds this event from
+# that data -- TODO confirm.
+class CLIHumanResponseEvent(HumanResponseEvent):
+    execute: bool = Field(
+        description="True if the human wants to execute the command, False otherwise."
+    )
+    command: str = Field(description="The command to execute.")
+
+
+# Payload carried in CLIHumanInputEvent.data: the command proposed for execution.
+class CLICommand(BaseModel):
+    command: str = Field(description="The command to execute.")
+
+
+# We need an event that extends from HumanInputEvent for HITL feature
+# NOTE(review): the docstring below refers to "CLIInputRequiredEvent", which is not
+# this class's name (CLIHumanInputEvent). It is left untouched here because the
+# docstring may be consumed at runtime (e.g. schema/UI generation) -- confirm and rename.
+class CLIHumanInputEvent(HumanInputEvent):
+    """
+    CLIInputRequiredEvent is sent when the agent needs permission from the user to execute the CLI command or not.
+    Render this event by showing the command and a boolean button to execute the command or not.
+    """
+
+    event_type: str = (
+        "cli_human_input"  # used by UI to render with appropriate component
+    )
+    response_event_type: Type = (
+        CLIHumanResponseEvent  # used by workflow to resume with the correct event
+    )
+    data: CLICommand = Field(  # the data sent to the UI for rendering
+        description="The command to execute.",
+    )
@@ -2,13 +2,14 @@ import os
 from typing import List, Optional

 from fastapi import FastAPI
+
 from llama_index.core.agent.workflow import AgentWorkflow
 from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
 from llama_index.core.settings import Settings
 from llama_index.core.tools import QueryEngineTool, ToolMetadata
 from llama_index.llms.openai import OpenAI
 from llama_index.server import LlamaIndexServer, UIConfig
-from llama_index.server.api.models import ChatRequest
+from llama_index.server.models import ChatRequest
 from llama_index.server.services.llamacloud import LlamaCloudIndex, get_index
 from llama_index.server.tools.index.citation import (
    CITATION_SYSTEM_PROMPT,
@@ -3,7 +3,7 @@ from typing import Optional
 from llama_index.core.agent.workflow import AgentWorkflow
 from llama_index.core.settings import Settings
 from llama_index.llms.openai import OpenAI
-from llama_index.server.api.models import ChatRequest
+from llama_index.server.models import ChatRequest


 def create_workflow(chat_request: Optional[ChatRequest] = None) -> AgentWorkflow:
@@ -1,4 +1,4 @@
-from .api.models import UIEvent
+from .models.ui import UIEvent
 from .server import LlamaIndexServer, UIConfig

 __all__ = ["LlamaIndexServer", "UIConfig", "UIEvent"]
@@ -3,7 +3,7 @@ from typing import Any

 from llama_index.core.agent.workflow.workflow_events import ToolCall, ToolCallResult
 from llama_index.server.api.callbacks.base import EventCallback
-from llama_index.server.api.models import AgentRunEvent
+from llama_index.server.models.ui import AgentRunEvent

 logger = logging.getLogger("uvicorn")

@@ -4,7 +4,7 @@ from typing import Any, List, Optional
 from llama_index.core.agent.workflow.workflow_events import ToolCallResult
 from llama_index.core.schema import NodeWithScore
 from llama_index.server.api.callbacks.base import EventCallback
-from llama_index.server.api.models import SourceNodesEvent
+from llama_index.server.models.source_nodes import SourceNodesEvent

 logger = logging.getLogger(__name__)

@@ -2,7 +2,7 @@ import logging
 from typing import Any, Optional

 from llama_index.server.api.callbacks.base import EventCallback
-from llama_index.server.api.models import ChatRequest
+from llama_index.server.models.chat import ChatRequest
 from llama_index.server.services.suggest_next_question import (
    SuggestNextQuestionsService,
 )
@@ -1,196 +1,2 @@
-import logging
-import os
-from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
-
-from pydantic import BaseModel, field_validator
-
-from llama_index.core.schema import NodeWithScore
-from llama_index.core.types import ChatMessage, MessageRole
-from llama_index.core.workflow import Event
-from llama_index.server.settings import server_settings
-from llama_index.server.utils import llamacloud
-
-logger = logging.getLogger("uvicorn")
-
-
-class ChatAPIMessage(BaseModel):
-    role: MessageRole
-    content: str
-    annotations: Optional[List[Any]] = None
-
-    def to_llamaindex_message(self) -> ChatMessage:
-        return ChatMessage(role=self.role, content=self.content)
-
-
-class ChatRequest(BaseModel):
-    messages: List[ChatAPIMessage]
-    data: Optional[Any] = None
-
-    @field_validator("messages")
-    def validate_messages(cls, v: List[ChatAPIMessage]) -> List[ChatAPIMessage]:
-        if v[-1].role != MessageRole.USER:
-            raise ValueError("Last message must be from user")
-        return v
-
-
-class AgentRunEventType(Enum):
-    TEXT = "text"
-    PROGRESS = "progress"
-
-
-class AgentRunEvent(Event):
-    name: str
-    msg: str
-    event_type: AgentRunEventType = AgentRunEventType.TEXT
-    data: Optional[dict] = None
-
-    def to_response(self) -> dict:
-        return {
-            "type": "agent",
-            "data": {
-                "agent": self.name,
-                "type": self.event_type.value,
-                "text": self.msg,
-                "data": self.data,
-            },
-        }
-
-
-class SourceNodesEvent(Event):
-    nodes: List[NodeWithScore]
-
-    def to_response(self) -> dict:
-        return {
-            "type": "sources",
-            "data": {
-                "nodes": [
-                    SourceNodes.from_source_node(node).model_dump()
-                    for node in self.nodes
-                ]
-            },
-        }
-
-
-class SourceNodes(BaseModel):
-    id: str
-    metadata: Dict[str, Any]
-    score: Optional[float]
-    text: str
-    url: Optional[str]
-
-    @classmethod
-    def from_source_node(cls, source_node: NodeWithScore) -> "SourceNodes":
-        metadata = source_node.node.metadata
-        url = cls.get_url_from_metadata(metadata)
-
-        return cls(
-            id=source_node.node.node_id,
-            metadata=metadata,
-            score=source_node.score,
-            text=source_node.node.text,  # type: ignore
-            url=url,
-        )
-
-    @classmethod
-    def get_url_from_metadata(
-        cls,
-        metadata: Dict[str, Any],
-        data_dir: Optional[str] = None,
-    ) -> Optional[str]:
-        url_prefix = server_settings.file_server_url_prefix
-        if data_dir is None:
-            data_dir = "data"
-        file_name = metadata.get("file_name")
-
-        if file_name and url_prefix:
-            if llamacloud.is_llamacloud_file(metadata):
-                file_name = llamacloud.get_local_file_name(metadata)
-                return f"{url_prefix}/output/llamacloud/{file_name}"
-            is_private = metadata.get("private", "false") == "true"
-            if is_private:
-                # file is a private upload
-                return f"{url_prefix}/output/uploaded/{file_name}"
-            # file is from calling the 'generate' script
-            # Get the relative path of file_path to data_dir
-            file_path = metadata.get("file_path")
-            data_dir = os.path.abspath(data_dir)
-            if file_path and data_dir:
-                relative_path = os.path.relpath(file_path, data_dir)
-                return f"{url_prefix}/data/{relative_path}"
-        # fallback to URL in metadata (e.g. for websites)
-        return metadata.get("URL")
-
-    @classmethod
-    def from_source_nodes(
-        cls, source_nodes: List[NodeWithScore]
-    ) -> List["SourceNodes"]:
-        return [cls.from_source_node(node) for node in source_nodes]
-
-
-class ComponentDefinition(BaseModel):
-    type: str
-    code: str
-    filename: str
-
-
-class UIEvent(Event):
-    type: str
-    data: BaseModel
-
-    def to_response(self) -> dict:
-        return {
-            "type": self.type,
-            "data": self.data.model_dump(),
-        }
-
-
-class ArtifactType(str, Enum):
-    CODE = "code"
-    DOCUMENT = "document"
-
-
-class CodeArtifactData(BaseModel):
-    file_name: str
-    code: str
-    language: str
-
-
-class DocumentArtifactData(BaseModel):
-    title: str
-    content: str
-    type: Literal["markdown", "html"]
-
-
-class Artifact(BaseModel):
-    created_at: Optional[int] = None
-    type: ArtifactType
-    data: Union[CodeArtifactData, DocumentArtifactData]
-
-    @classmethod
-    def from_message(cls, message: ChatAPIMessage) -> Optional["Artifact"]:
-        if not message.annotations or not isinstance(message.annotations, list):
-            return None
-
-        for annotation in message.annotations:
-            if isinstance(annotation, dict) and annotation.get("type") == "artifact":
-                try:
-                    artifact = cls.model_validate(annotation.get("data"))
-                    return artifact
-                except Exception as e:
-                    logger.warning(
-                        f"Failed to parse artifact from annotation: {annotation}. Error: {e}"
-                    )
-
-        return None
-
-
-class ArtifactEvent(Event):
-    type: str = "artifact"
-    data: Artifact
-
-    def to_response(self) -> dict:
-        return {
-            "type": self.type,
-            "data": self.data.model_dump(),
-        }
+# TODO: Kept for backward compatibility; remove this re-export in a future minor release
+from llama_index.server.models import *  # noqa
@@ -11,7 +11,10 @@ from llama_index.core.agent.workflow.workflow_events import (
    AgentSetup,
    AgentStream,
 )
-from llama_index.core.workflow import StopEvent, Workflow
+from llama_index.core.workflow import (
+    StopEvent,
+    Workflow,
+)
 from llama_index.server.api.callbacks import (
    AgentCallTool,
    EventCallback,
@@ -20,9 +23,11 @@ from llama_index.server.api.callbacks import (
    SuggestNextQuestions,
 )
 from llama_index.server.api.callbacks.stream_handler import StreamHandler
-from llama_index.server.api.models import ChatRequest
 from llama_index.server.api.utils.vercel_stream import VercelStreamResponse
+from llama_index.server.models.chat import ChatRequest
+from llama_index.server.models.hitl import HumanInputEvent
 from llama_index.server.services.llamacloud import LlamaCloudFileService
+from llama_index.server.services.workflow import HITLWorkflowService


 def chat_router(
@@ -38,7 +43,8 @@ def chat_router(
        background_tasks: BackgroundTasks,
    ) -> StreamingResponse:
        try:
-            user_message = request.messages[-1].to_llamaindex_message()
+            last_message = request.messages[-1]
+            user_message = last_message.to_llamaindex_message()
            chat_history = [
                message.to_llamaindex_message() for message in request.messages[:-1]
            ]
@@ -48,10 +54,21 @@ def chat_router(
                workflow = workflow_factory(chat_request=request)
            else:
                workflow = workflow_factory()
-            workflow_handler = workflow.run(
-                user_msg=user_message.content,
-                chat_history=chat_history,
-            )
+
+            # Check if we should resume a chat with a human response
+            human_response = last_message.human_response
+            if human_response:
+                ctx = await HITLWorkflowService.load_context(
+                    id=request.id,
+                    workflow=workflow,
+                    data=human_response,
+                )
+                workflow_handler = workflow.run(ctx=ctx)
+            else:
+                workflow_handler = workflow.run(
+                    user_msg=user_message.content,
+                    chat_history=chat_history,
+                )

            callbacks: list[EventCallback] = [
                AgentCallTool(),
@@ -66,7 +83,11 @@ def chat_router(
            )

            return VercelStreamResponse(
-                content_generator=_stream_content(stream_handler, request, logger),
+                content_generator=_stream_content(
+                    stream_handler,
+                    logger,
+                    request.id,
+                ),
            )
        except Exception as e:
            logger.error(e)
@@ -99,8 +120,8 @@ def chat_router(

 async def _stream_content(
    handler: StreamHandler,
-    request: ChatRequest,
    logger: logging.Logger,
+    chat_id: str,
 ) -> AsyncGenerator[str, None]:
    async def _text_stream(
        event: Union[AgentStream, StopEvent],
@@ -126,6 +147,19 @@ async def _stream_content(
                async for chunk in _text_stream(event):
                    handler.accumulate_text(chunk)
                    yield VercelStreamResponse.convert_text(chunk)
+            elif isinstance(event, HumanInputEvent):
+                ctx = handler.workflow_handler.ctx
+                if ctx is None:
+                    raise RuntimeError("Context is None")
+                # Save the context with the HITL event
+                await HITLWorkflowService.save_context(
+                    id=chat_id,
+                    ctx=ctx,
+                    resume_event_type=event.response_event_type,
+                )
+                yield VercelStreamResponse.convert_data(event.to_response())
+                # Break to stop the stream
+                break
            elif isinstance(event, dict):
                yield VercelStreamResponse.convert_data(event)
            elif hasattr(event, "to_response"):
@@ -2,7 +2,8 @@ import logging
 from typing import List

 from fastapi import APIRouter
-from llama_index.server.api.models import ComponentDefinition
+
+from llama_index.server.models.ui import ComponentDefinition
 from llama_index.server.services.custom_ui import CustomUI


@@ -1,6 +1,7 @@
 from typing import List, Optional

-from llama_index.server.api.models import Artifact, ChatRequest
+from llama_index.server.models.artifacts import Artifact
+from llama_index.server.models.chat import ChatRequest


 def get_artifacts(chat_request: ChatRequest) -> List[Artifact]:
@@ -0,0 +1,34 @@
+from llama_index.server.models.artifacts import (
+    Artifact,
+    ArtifactEvent,
+    ArtifactType,
+    CodeArtifactData,
+    DocumentArtifactData,
+)
+from llama_index.server.models.chat import ChatAPIMessage, ChatRequest
+from llama_index.server.models.hitl import HumanInputEvent, HumanResponseEvent
+from llama_index.server.models.source_nodes import SourceNodes, SourceNodesEvent
+from llama_index.server.models.ui import (
+    AgentRunEvent,
+    AgentRunEventType,
+    ComponentDefinition,
+    UIEvent,
+)
+
+__all__ = [
+    "Artifact",
+    "ArtifactEvent",
+    "ArtifactType",
+    "DocumentArtifactData",
+    "CodeArtifactData",
+    "ChatAPIMessage",
+    "ChatRequest",
+    "UIEvent",
+    "ComponentDefinition",
+    "AgentRunEvent",
+    "AgentRunEventType",
+    "SourceNodes",
+    "SourceNodesEvent",
+    "HumanInputEvent",
+    "HumanResponseEvent",
+]
@@ -0,0 +1,60 @@
+import logging
+from enum import Enum
+from typing import Literal, Optional, Union
+
+from llama_index.core.workflow.events import Event
+from llama_index.server.models.chat import ChatAPIMessage
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class ArtifactType(str, Enum):
+    # Discriminator stored in Artifact.type. The str mixin makes each member
+    # compare equal to its plain string value.
+    # NOTE(review): presumably CODE pairs with CodeArtifactData and DOCUMENT with
+    # DocumentArtifactData; nothing in this module enforces that pairing.
+    CODE = "code"
+    DOCUMENT = "document"
+
+
+class CodeArtifactData(BaseModel):
+    # Payload model for a code artifact; all three fields are required strings.
+    file_name: str
+    code: str
+    language: str
+
+
+class DocumentArtifactData(BaseModel):
+    # Payload model for a document artifact.
+    title: str
+    content: str
+    # Only these two literal values pass validation.
+    type: Literal["markdown", "html"]
+
+
+class Artifact(BaseModel):
+    # Optional creation marker (unit is not specified in this module).
+    created_at: Optional[int] = None
+    type: ArtifactType
+    data: Union[CodeArtifactData, DocumentArtifactData]
+
+    @classmethod
+    def from_message(cls, message: ChatAPIMessage) -> Optional["Artifact"]:
+        """
+        Return the first artifact that can be parsed from a message's annotations.
+
+        Only dict annotations whose "type" is "artifact" are considered. An
+        annotation that fails validation is logged as a warning and skipped, so
+        a later valid annotation can still be returned.
+
+        Returns:
+            The parsed artifact, or None when the message has no annotations
+            list or no annotation validates.
+        """
+        annotations = message.annotations
+        if not annotations or not isinstance(annotations, list):
+            return None
+
+        for annotation in annotations:
+            if not isinstance(annotation, dict):
+                continue
+            if annotation.get("type") != "artifact":
+                continue
+            try:
+                return cls.model_validate(annotation.get("data"))
+            except Exception as e:
+                logger.warning(
+                    f"Failed to parse artifact from annotation: {annotation}. Error: {e}"
+                )
+
+        return None
+
+
+class ArtifactEvent(Event):
+    type: str = "artifact"
+    data: Artifact
+
+    def to_response(self) -> dict:
+        """Return the event as a dict with its type and the dumped artifact."""
+        event_type = self.type
+        payload = self.data.model_dump()
+        return {"type": event_type, "data": payload}
@@ -0,0 +1,44 @@
+import re
+from typing import Any, List, Optional
+
+from pydantic import BaseModel, field_validator
+
+from llama_index.core.types import ChatMessage, MessageRole
+
+
+class ChatAPIMessage(BaseModel):
+    role: MessageRole
+    content: str
+    annotations: Optional[List[Any]] = None
+
+    def to_llamaindex_message(self) -> ChatMessage:
+        """Build a ChatMessage carrying this message's role and content."""
+        return ChatMessage(content=self.content, role=self.role)
+
+    @property
+    def human_response(self) -> Optional[Any]:
+        """
+        Data of the first "human_response" annotation, if any.
+
+        Returns the annotation's "data" value (an empty dict when the key is
+        absent), or None when no dict annotation has that type.
+        """
+        for annotation in self.annotations or []:
+            if not isinstance(annotation, dict):
+                continue
+            if annotation.get("type") == "human_response":
+                return annotation.get("data", {})
+        return None
+
+
+class ChatRequest(BaseModel):
+    # Chat session identifier, constant for the same chat session
+    # (see https://ai-sdk.dev/docs/reference/ai-sdk-ui/use-chat#id).
+    id: str
+    messages: List[ChatAPIMessage]
+    data: Optional[Any] = None
+
+    @field_validator("messages")
+    @classmethod
+    def validate_messages(cls, v: List[ChatAPIMessage]) -> List[ChatAPIMessage]:
+        """
+        Require a non-empty message list whose last message is from the user.
+
+        Raises:
+            ValueError: if the list is empty or the last message is not a user message.
+        """
+        # Reject an empty list explicitly: v[-1] would raise IndexError, which
+        # pydantic does not turn into a validation error.
+        if not v:
+            raise ValueError("Messages must not be empty")
+        if v[-1].role != MessageRole.USER:
+            raise ValueError("Last message must be from user")
+        return v
+
+    @field_validator("id")
+    @classmethod
+    def validate_id(cls, v: str) -> str:
+        """
+        Allow only letters, digits, underscores and hyphens in the id.
+
+        Raises:
+            ValueError: if any other character is present.
+        """
+        if re.search(r"[^a-zA-Z0-9_-]", v):
+            raise ValueError("ID contains special characters")
+        return v
@@ -0,0 +1,51 @@
+from typing import Any, Dict, Type, Union
+
+from llama_index.core.workflow.events import (
+    HumanResponseEvent as FrameworkHumanResponseEvent,
+)
+from llama_index.core.workflow.events import InputRequiredEvent
+from pydantic import BaseModel, Field
+
+
+class HumanResponseEvent(FrameworkHumanResponseEvent):
+    """
+    Use this event to send a response from a human.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        # Callers may omit "response"; synthesize one from the "data" kwarg so
+        # the base event always receives a value for it.
+        if "response" not in kwargs:
+            kwargs = {
+                **kwargs,
+                "response": f"Human response with data: {kwargs.get('data', {})}",
+            }
+        super().__init__(**kwargs)
+
+
+class HumanInputEvent(InputRequiredEvent):
+    """
+    Use this event to request input from a human.
+    It will block the workflow execution until the human responds.
+    """
+
+    response_event_type: Type[HumanResponseEvent] = Field(
+        description="The type of event that the workflow is waiting for.",
+    )
+    event_type: str = Field(
+        description="An identifier for the UI component that will be used to render the input.",
+    )
+    data: Union[Dict[str, Any], BaseModel] = Field(
+        description="The data to be sent to the UI component that will be used to render the input.",
+    )
+
+    def __init__(self, **kwargs: Any) -> None:
+        # Supply a default "prefix" for InputRequiredEvent when the caller gave none.
+        if "prefix" not in kwargs:
+            event_type = kwargs.get("event_type", None)
+            data = kwargs.get("data", None)
+            kwargs["prefix"] = f"Need input for {event_type} with data: {data}"
+        super().__init__(**kwargs)
+
+    def to_response(self) -> dict:
+        """Return the event as a dict; a model payload is dumped, a dict is passed through."""
+        if isinstance(self.data, dict):
+            payload = self.data
+        else:
+            payload = self.data.model_dump()
+        return {"type": self.event_type, "data": payload}
@@ -0,0 +1,49 @@
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+from llama_index.core.schema import NodeWithScore
+from llama_index.core.workflow.events import Event
+from llama_index.server.utils.chat_file import get_file_url_from_metadata
+
+
+class SourceNodesEvent(Event):
+    nodes: List[NodeWithScore]
+
+    def to_response(self) -> dict:
+        """Return a "sources" payload with every node converted and dumped to a dict."""
+        serialized_nodes = []
+        for node in self.nodes:
+            serialized_nodes.append(SourceNodes.from_source_node(node).model_dump())
+        return {"type": "sources", "data": {"nodes": serialized_nodes}}
+
+
+class SourceNodes(BaseModel):
+    id: str
+    metadata: Dict[str, Any]
+    score: Optional[float]
+    text: str
+    url: Optional[str]
+
+    @classmethod
+    def from_source_node(cls, source_node: NodeWithScore) -> "SourceNodes":
+        """Build a SourceNodes entry from one scored node, deriving its URL from the metadata."""
+        node = source_node.node
+        node_metadata = node.metadata
+        file_url = get_file_url_from_metadata(node_metadata)
+
+        return cls(
+            id=node.node_id,
+            metadata=node_metadata,
+            score=source_node.score,
+            text=node.text,  # type: ignore
+            url=file_url,
+        )
+
+    @classmethod
+    def from_source_nodes(
+        cls, source_nodes: List[NodeWithScore]
+    ) -> List["SourceNodes"]:
+        """Convert each scored node in order with from_source_node."""
+        converted: List["SourceNodes"] = []
+        for scored_node in source_nodes:
+            converted.append(cls.from_source_node(scored_node))
+        return converted
@@ -0,0 +1,49 @@
+import logging
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+from llama_index.core.workflow import Event
+
+logger = logging.getLogger("uvicorn")
+
+
+class AgentRunEventType(Enum):
+    # Kind of an AgentRunEvent; the member value is what to_response() emits
+    # under the inner "type" key.
+    TEXT = "text"
+    PROGRESS = "progress"
+
+
+class AgentRunEvent(Event):
+    name: str
+    msg: str
+    event_type: AgentRunEventType = AgentRunEventType.TEXT
+    data: Optional[dict] = None
+
+    def to_response(self) -> dict:
+        """Return an "agent" payload carrying the agent name, event kind, text and extra data."""
+        details = {
+            "agent": self.name,
+            "type": self.event_type.value,
+            "text": self.msg,
+            "data": self.data,
+        }
+        return {"type": "agent", "data": details}
+
+
+class ComponentDefinition(BaseModel):
+    # Describes one custom UI component; all three fields are required strings.
+    # NOTE(review): presumably `type` is the event type the component renders and
+    # `code` is the source read from `filename` - confirm against CustomUI.
+    type: str
+    code: str
+    filename: str
+
+
+class UIEvent(Event):
+    type: str
+    data: BaseModel
+
+    def to_response(self) -> dict:
+        """Return the event as a dict with its type and the dumped data model."""
+        event_type = self.type
+        payload = self.data.model_dump()
+        return {"type": event_type, "data": payload}
@@ -2,7 +2,7 @@ import logging
 import os
 from typing import List, Optional

-from llama_index.server.api.models import ComponentDefinition
+from llama_index.server.models.ui import ComponentDefinition


 class CustomUI:
@@ -11,7 +11,7 @@ from llama_cloud import ManagedIngestionStatus, PipelineFileCreateCustomMetadata
 from pydantic import BaseModel

 from llama_index.core.schema import NodeWithScore
-from llama_index.server.api.models import SourceNodes
+from llama_index.server.models.source_nodes import SourceNodes
 from llama_index.server.services.llamacloud.index import get_client
 from llama_index.server.utils import llamacloud

@@ -3,14 +3,15 @@ import os
 from typing import TYPE_CHECKING, Any, Optional

 from llama_cloud import PipelineType
+from pydantic import BaseModel, Field, field_validator
+
 from llama_index.core.callbacks import CallbackManager
 from llama_index.core.ingestion.api_utils import (
    get_client as llama_cloud_get_client,
 )
 from llama_index.core.settings import Settings
 from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
-from llama_index.server.api.models import ChatRequest
-from pydantic import BaseModel, Field, field_validator
+from llama_index.server.models.chat import ChatRequest

 if TYPE_CHECKING:
    from llama_cloud.client import LlamaCloud
@@ -5,7 +5,7 @@ from typing import List, Optional, Union

 from llama_index.core.prompts import PromptTemplate
 from llama_index.core.settings import Settings
-from llama_index.server.api.models import ChatAPIMessage
+from llama_index.server.models.chat import ChatAPIMessage
 from llama_index.server.prompts import SUGGEST_NEXT_QUESTION_PROMPT

 logger = logging.getLogger("uvicorn")
@@ -0,0 +1,106 @@
+import json
+import logging
+from pathlib import Path
+from typing import Type
+
+from llama_index.core.workflow import (
+    Context,
+    JsonSerializer,
+    Workflow,
+)
+from llama_index.server.models.hitl import HumanResponseEvent
+from llama_index.server.utils.class_meta_serialization import (
+    type_from_identifier,
+    type_identifier,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class HITLWorkflowService:
+    """
+    A service for helping pause and resume a HITL workflow.
+    """
+
+    # A key in context that stores the HITL event type
+    HITL_CONTEXT_KEY = "human_response_type"
+
+    @staticmethod
+    def get_storage_path(id: str) -> Path:
+        """
+        Return the checkpoint file path for an id, creating the directory if needed.
+
+        The path is "output/checkpoints/<id>.json", relative to the current
+        working directory.
+        """
+        storage_dir = Path("output") / "checkpoints"
+        # exist_ok=True already tolerates an existing directory, so a separate
+        # exists() check is unnecessary.
+        storage_dir.mkdir(parents=True, exist_ok=True)
+        return storage_dir / f"{id}.json"
+
+    @classmethod
+    async def save_context(
+        cls,
+        id: str,
+        ctx: Context,
+        resume_event_type: Type[HumanResponseEvent],
+    ) -> None:
+        """
+        Save the workflow context as a JSON checkpoint file for the given id.
+
+        The identifier of the resume event type is stored in the context first,
+        so that load_context can rebuild the matching event later.
+
+        Args:
+            id: The id to save the context to.
+            ctx: The context to save.
+            resume_event_type: The event type the workflow waits for to resume.
+        """
+        await ctx.set(
+            key=cls.HITL_CONTEXT_KEY,
+            value=type_identifier(resume_event_type),
+        )
+
+        ctx_data = ctx.to_dict(serializer=JsonSerializer())
+        with open(cls.get_storage_path(id), "w") as f:
+            json.dump(ctx_data, f)
+
+    @classmethod
+    async def load_context(
+        cls,
+        id: str,
+        workflow: Workflow,
+        data: dict,
+    ) -> Context:
+        """
+        Restore the checkpointed context for an id and queue the resume event.
+
+        Args:
+            id: The id the context was saved under.
+            workflow: The workflow the restored context is bound to.
+            data: Keyword arguments used to construct the resume event.
+
+        Returns:
+            The restored context, with the resume event already sent to it.
+
+        Raises:
+            FileNotFoundError: if no checkpoint file exists for the id.
+            ValueError: if the checkpoint is not valid JSON or the resume event
+                cannot be constructed.
+        """
+        file_path = cls.get_storage_path(id)
+        if not file_path.exists():
+            raise FileNotFoundError(f"No checkpoint found for id: {id}")
+        try:
+            with open(file_path, "r") as f:
+                ctx_data = json.load(f)
+        except json.JSONDecodeError as e:
+            # Chain the cause so the decode position stays in the traceback.
+            raise ValueError(f"Invalid checkpoint data for id {id}: {e}") from e
+        ctx = Context.from_dict(
+            workflow=workflow,
+            data=ctx_data,
+            serializer=JsonSerializer(),
+        )
+        resume_event = await cls._construct_resume_event(ctx, data)
+        ctx.send_event(resume_event)
+        return ctx
+
+    @classmethod
+    async def _construct_resume_event(
+        cls, context: Context, data: dict
+    ) -> HumanResponseEvent:
+        """
+        Build the resume event whose type identifier is stored in the context.
+
+        Raises:
+            ValueError: if no type is stored, the stored type is not a
+                HumanResponseEvent subclass, or the event cannot be built from data.
+        """
+        event_type_str = await context.get(cls.HITL_CONTEXT_KEY)
+        if not event_type_str:
+            raise ValueError(
+                "Cannot resume the workflow because there is no resume event type in the context"
+            )
+        resume_event_type = type_from_identifier(event_type_str)
+        # issubclass() raises TypeError for non-class objects, so check that first
+        # to keep the failure a ValueError.
+        if not isinstance(resume_event_type, type) or not issubclass(
+            resume_event_type, HumanResponseEvent
+        ):
+            raise ValueError(
+                f"Cannot resume the workflow because the resume event type {resume_event_type} is not a HumanResponseEvent"
+            )
+        try:
+            return resume_event_type(**data)
+        except Exception as e:
+            raise ValueError(
+                f"Error constructing resume event: {e}. "
+                f"Make sure the provided data is valid for the event type {resume_event_type}"
+            ) from e
@@ -14,7 +14,7 @@ from llama_index.core.tools import (
    ToolSelection,
 )
 from llama_index.core.workflow import Context
-from llama_index.server.api.models import AgentRunEvent, AgentRunEventType
+from llama_index.server.models.ui import AgentRunEvent, AgentRunEventType
 from llama_index.core.agent.workflow.workflow_events import ToolCall, ToolCallResult

 logger = logging.getLogger("uvicorn")
@@ -0,0 +1,36 @@
+import os
+from typing import Any, Dict, Optional
+
+from llama_index.server.settings import server_settings
+from llama_index.server.utils import llamacloud
+
+
+def get_file_url_from_metadata(
+    metadata: Dict[str, Any],
+    data_dir: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Get the URL of a file from the source node metadata.
+
+    Args:
+        metadata: The source node metadata. The keys read are "file_name",
+            "private", "file_path" and "URL".
+        data_dir: Directory that "file_path" is made relative to; defaults to "data".
+
+    Returns:
+        A URL under the configured file server prefix when the metadata names a
+        file, otherwise the "URL" metadata value (which may be None).
+    """
+    url_prefix = server_settings.file_server_url_prefix
+    if data_dir is None:
+        data_dir = "data"
+    file_name = metadata.get("file_name")
+
+    if file_name and url_prefix:
+        if llamacloud.is_llamacloud_file(metadata):
+            file_name = llamacloud.get_local_file_name(metadata)
+            return f"{url_prefix}/output/llamacloud/{file_name}"
+        is_private = metadata.get("private", "false") == "true"
+        if is_private:
+            # file is a private upload
+            return f"{url_prefix}/output/uploaded/{file_name}"
+        # file is from calling the 'generate' script
+        # Get the relative path of file_path to data_dir
+        file_path = metadata.get("file_path")
+        if file_path:
+            relative_path = os.path.relpath(file_path, os.path.abspath(data_dir))
+            # relpath joins with the OS separator; URLs always use forward slashes.
+            relative_path = relative_path.replace(os.sep, "/")
+            return f"{url_prefix}/data/{relative_path}"
+    # fallback to URL in metadata (e.g. for websites)
+    return metadata.get("URL")
@@ -0,0 +1,30 @@
+# Helper functions for serializing and deserializing class metadata.
+import importlib
+from typing import Type
+
+
+def type_identifier(type: Type) -> str:
+    """
+    Build the dotted "<module>.<qualified name>" identifier of a type.
+    """
+    return "{}.{}".format(type.__module__, type.__qualname__)
+
+
+def type_from_identifier(identifier: str) -> Type:
+    """
+    Get the type from an identifier.
+
+    The identifier is "<module>.<qualified name>". Qualified names of nested
+    classes contain dots themselves, so the module/name boundary is found by
+    trying the longest module path first and falling back to shorter ones.
+
+    Raises:
+        ValueError: if the identifier is empty or contains no dot.
+        ImportError: if the module part cannot be imported.
+        RuntimeError: if the name cannot be resolved for any other reason.
+    """
+    if not identifier or "." not in identifier:
+        raise ValueError(f"Invalid type identifier format: {identifier}")
+    parts = identifier.split(".")
+    module = identifier.rsplit(".", 1)[0]
+    # Remember only the first failure: it belongs to the most specific split
+    # and decides which exception type is reported.
+    first_error = None
+    for split_at in range(len(parts) - 1, 0, -1):
+        try:
+            resolved = importlib.import_module(".".join(parts[:split_at]))
+            for attribute in parts[split_at:]:
+                resolved = getattr(resolved, attribute)
+            return resolved
+        except Exception as e:
+            if first_error is None:
+                first_error = e
+    if isinstance(first_error, ImportError):
+        raise ImportError(
+            f"Failed to import module '{module}': {first_error}"
+        ) from first_error
+    raise RuntimeError(
+        f"Failed to resolve type from identifier '{identifier}': {first_error}"
+    ) from first_error
@@ -1,7 +1,7 @@
 {
  "name": "@create-llama/llama-index-server",
  "private": true,
-  "version": "0.1.18",
+  "version": "0.1.20",
  "type": "module",
  "scripts": {
    "prebuild": "uv run -- scripts/frontend.py --mode copy",
@@ -1,6 +1,6 @@
 [project]
 name = "llama-index-server"
-version = "0.1.18"
+version = "0.1.20"
 description = "llama-index fastapi server"
 readme = "README.md"
 license = "MIT"
@@ -1,4 +1,5 @@
 import logging
+from typing import AsyncGenerator, Callable
 from unittest.mock import AsyncMock, MagicMock

 import pytest
@@ -7,31 +8,32 @@ from httpx import ASGITransport, AsyncClient

 from llama_index.core.workflow import StopEvent, Workflow
 from llama_index.core.workflow.handler import WorkflowHandler
-from llama_index.server.api.models import ChatAPIMessage, ChatRequest
 from llama_index.server.api.routers.chat import chat_router
+from llama_index.server.models.chat import ChatAPIMessage, ChatRequest, MessageRole


@pytest.fixture()
-def logger():
+def logger() -> logging.Logger:
    return logging.getLogger("test")


@pytest.fixture()
-def chat_request():
+def chat_request() -> ChatRequest:
    """Create a simple chat request with one user message."""
    return ChatRequest(
-        messages=[ChatAPIMessage(role="user", content="Hello, how are you?")]
+        id="test",
+        messages=[ChatAPIMessage(role=MessageRole.USER, content="Hello, how are you?")],
    )


@pytest.fixture()
-def mock_workflow():
+def mock_workflow() -> MagicMock:
    """Create a mock workflow that returns a simple response."""
    workflow = MagicMock(spec=Workflow)
    handler = AsyncMock(spec=WorkflowHandler)

    # Setup the handler to stream a simple response event
-    async def mock_stream_events():
+    async def mock_stream_events() -> AsyncGenerator[StopEvent, None]:
        yield StopEvent(result="I'm doing well, thank you for asking!")

    handler.stream_events.return_value = mock_stream_events()
@@ -41,17 +43,21 @@ def mock_workflow():


@pytest.fixture()
-def workflow_factory(mock_workflow):
+def workflow_factory(mock_workflow: MagicMock) -> Callable[[], MagicMock]:
    """Create a factory function that returns our mock workflow."""

-    def factory(verbose=False):
+    def factory(verbose: bool = False) -> MagicMock:
        return mock_workflow

    return factory


@pytest.mark.asyncio()
-async def test_chat_router(chat_request, workflow_factory, logger):
+async def test_chat_router(
+    chat_request: ChatRequest,
+    workflow_factory: Callable[[], MagicMock],
+    logger: logging.Logger,
+) -> None:
    """Test that the chat router handles a request correctly."""
    # Create a FastAPI app and mount our router
    app = FastAPI()
@@ -90,14 +96,14 @@ async def test_chat_router(chat_request, workflow_factory, logger):


@pytest.mark.asyncio()
-async def test_chat_with_agent_workflow(logger):
+async def test_chat_with_agent_workflow(logger: logging.Logger) -> None:
    """Test that the chat router works with a workflow that mimics an agent workflow."""
    # Create a simple workflow that mimics an agent workflow
    mock_workflow = MagicMock(spec=Workflow)
    handler = AsyncMock(spec=WorkflowHandler)

    # Setup the handler to stream a simple response about weather
-    async def mock_stream_events():
+    async def mock_stream_events() -> AsyncGenerator[StopEvent, None]:
        yield StopEvent(
            result="The weather in New York is sunny. I used the weather tool to get this information."
        )
@@ -106,7 +112,7 @@ async def test_chat_with_agent_workflow(logger):
    mock_workflow.run.return_value = handler

    # Create a factory function that returns our mock workflow
-    def workflow_factory(verbose=False):
+    def workflow_factory(verbose: bool = False) -> MagicMock:
        return mock_workflow

    # Create a FastAPI app and mount our router
@@ -116,9 +122,12 @@ async def test_chat_with_agent_workflow(logger):

    # Create a chat request asking about weather
    chat_request = ChatRequest(
+        id="test",
        messages=[
-            ChatAPIMessage(role="user", content="What's the weather in New York?")
-        ]
+            ChatAPIMessage(
+                role=MessageRole.USER, content="What's the weather in New York?"
+            )
+        ],
    )

    # Make a request to the chat endpoint
@@ -9,9 +9,9 @@ from llama_index.core.agent.workflow.workflow_events import AgentStream
 from llama_index.core.types import MessageRole
 from llama_index.core.workflow import StopEvent
 from llama_index.core.workflow.handler import WorkflowHandler
-from llama_index.server.api.models import ChatAPIMessage, ChatRequest
 from llama_index.server.api.routers.chat import _stream_content
 from llama_index.server.api.utils.vercel_stream import VercelStreamResponse
+from llama_index.server.models.chat import ChatAPIMessage, ChatRequest


@pytest.fixture()
@@ -22,7 +22,8 @@ def logger() -> logging.Logger:
@pytest.fixture()
 def chat_request() -> ChatRequest:
    return ChatRequest(
-        messages=[ChatAPIMessage(role=MessageRole.USER, content="test message")]
+        id="test",
+        messages=[ChatAPIMessage(role=MessageRole.USER, content="test message")],
    )


@@ -50,7 +51,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -75,7 +76,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -99,7 +100,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -124,7 +125,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -148,7 +149,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -171,7 +172,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -196,7 +197,7 @@ class TestEventStream:
        result = [
            chunk
            async for chunk in _stream_content(
-                mock_workflow_handler, chat_request, logger
+                mock_workflow_handler, logger, chat_request.id
            )
        ]

@@ -1936,7 +1936,7 @@ wheels = [

 [[package]]
 name = "llama-index-server"
-version = "0.1.17"
+version = "0.1.19"
 source = { editable = "." }
 dependencies = [
    { name = "cachetools" },
Author	SHA1	Message	Date
github-actions[bot]	bc56fa3c5f	Release 0.5.20 (#671 ) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>	2025-06-02 18:02:05 +07:00
Huu Le	087c96164d	feat: [server] Add Human in the Loop example with FastAPI integration (#630 )	2025-06-02 17:47:04 +07:00
Thuc Pham	3ff0a18876	fix: default header padding (#672 )	2025-05-31 14:08:29 +07:00
Thuc Pham	df1047480a	fix: missing cursor pointer for button (#670 )	2025-05-30 09:52:17 +07:00
Marcus Schiesser	8d89223a08	chore: fill empty chat message default	2025-05-29 21:05:53 +07:00
github-actions[bot]	49a944182f	Release 0.2.5 (#669 ) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>	2025-05-29 13:06:58 +07:00
Marcus Schiesser	058b3762c1	fix: update generate script path for ejected project (#668 )	2025-05-29 12:21:17 +07:00
Thuc Pham	4c8579b04f	use eject file in linux (#663 )	2025-05-29 09:15:52 +07:00