cr

2026-07-01 16:00:21 -04:00 · 2025-03-10 18:07:50 -07:00
parent f3d2f05243
commit 5a858d1ecd
3 changed files with 65 additions and 52 deletions
@@ -45,7 +45,7 @@ In this example, the reflection agent uses another LLM to judge its output. The
 Installation:

 ```
-pip install langgraph-reflection langchain
+pip install langgraph-reflection langchain openevals
 ```

 Example usage:
@@ -56,15 +56,20 @@ assistant_graph = ...
 # Define the judge function that evaluates responses
 def judge_response(state, config):
    """Evaluate the assistant's response using a separate judge model."""
-    judge_model = init_chat_model(...).bind_tools([Finish])
-    response = judge_model.invoke([...])
-    
-    # If the judge called Finish, response is approved
-    if len(response.tool_calls) == 1:
+    evaluator = create_llm_as_judge(
+        prompt=critique_prompt,
+        model="openai:o3-mini",
+        feedback_key="pass",
+    )
+    eval_result = evaluator(outputs=state["messages"][-1].content, inputs=None)
+
+    if eval_result["score"]:
+        print("✅ Response approved by judge")
        return
    else:
-        # Return judge's critique as a new user message
-        return {"messages": [{"role": "user", "content": response.content}]}
+        # Otherwise, return the judge's critique as a new user message
+        print("⚠️ Judge requested improvements")
+        return {"messages": [{"role": "user", "content": eval_result["comment"]}]}

 # Create graphs with reflection
 judge_graph = StateGraph(MessagesState).add_node(judge_response)...
@@ -20,35 +20,36 @@ from langgraph_reflection import create_reflection_graph

 def analyze_with_pyright(code_string: str) -> dict:
    """Analyze Python code using Pyright for static type checking and errors.
-    
+
    Args:
        code_string: The Python code to analyze as a string
-        
+
    Returns:
        dict: The Pyright analysis results
    """
-    with tempfile.NamedTemporaryFile(suffix='.py', mode='w', delete=False) as temp:
+    with tempfile.NamedTemporaryFile(suffix=".py", mode="w", delete=False) as temp:
        temp.write(code_string)
        temp_path = temp.name
-    
+
    try:
        result = subprocess.run(
            [
                "pyright",
                "--outputjson",
-                "--level", "error",  # Only report errors, not warnings
-                temp_path
+                "--level",
+                "error",  # Only report errors, not warnings
+                temp_path,
            ],
            capture_output=True,
-            text=True
+            text=True,
        )
-        
+
        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            return {
                "error": "Failed to parse Pyright output",
-                "raw_output": result.stdout
+                "raw_output": result.stdout,
            }
    finally:
        os.unlink(temp_path)
@@ -56,10 +57,10 @@ def analyze_with_pyright(code_string: str) -> dict:

 def call_model(state: dict) -> dict:
    """Process the user query with a Claude 3 Sonnet model.
-    
+
    Args:
        state: The current conversation state
-        
+
    Returns:
        dict: Updated state with model response
    """
@@ -70,11 +71,13 @@ def call_model(state: dict) -> dict:
 # Define type classes for code extraction
 class ExtractPythonCode(TypedDict):
    """Type class for extracting Python code. The python_code field is the code to be extracted."""
+
    python_code: str


 class NoCode(TypedDict):
    """Type class for indicating no code was found."""
+
    no_code: bool


@@ -90,35 +93,39 @@ If there is no code to extract - call NoCode."""

 def try_running(state: dict) -> dict | None:
    """Attempt to run and analyze the extracted Python code.
-    
+
    Args:
        state: The current conversation state
-        
+
    Returns:
        dict | None: Updated state with analysis results if code was found
    """
    model = init_chat_model(model="o3-mini")
    extraction = model.bind_tools([ExtractPythonCode, NoCode])
-    er = extraction.invoke([{"role": "system", "content": SYSTEM_PROMPT}] + state['messages'])
+    er = extraction.invoke(
+        [{"role": "system", "content": SYSTEM_PROMPT}] + state["messages"]
+    )
    if len(er.tool_calls) == 0:
        return None
    tc = er.tool_calls[0]
-    if tc['name'] != "ExtractPythonCode":
+    if tc["name"] != "ExtractPythonCode":
        return None
-        
-    result = analyze_with_pyright(tc['args']['python_code'])
+
+    result = analyze_with_pyright(tc["args"]["python_code"])
    print(result)
-    explanation = result['generalDiagnostics']
-    
-    if result['summary']['errorCount']:
+    explanation = result["generalDiagnostics"]
+
+    if result["summary"]["errorCount"]:
        return {
-            "messages": [{
-                "role": "user",
-                "content": f"I ran pyright and found this: {explanation}\n\n"
-                          "Try to fix it. Make sure to regenerate the entire code snippet. "
-                          "If you are not sure what is wrong, or think there is a mistake, "
-                          "you can ask me a question rather than generating code"
-            }]
+            "messages": [
+                {
+                    "role": "user",
+                    "content": f"I ran pyright and found this: {explanation}\n\n"
+                    "Try to fix it. Make sure to regenerate the entire code snippet. "
+                    "If you are not sure what is wrong, or think there is a mistake, "
+                    "you can ask me a question rather than generating code",
+                }
+            ]
        }


@@ -146,7 +153,6 @@ def create_graphs():
    return create_reflection_graph(assistant_graph, judge_graph).compile()


-
 if __name__ == "__main__":
    """Run an example query through the reflection system."""
    example_query = [
@@ -159,4 +165,4 @@ if __name__ == "__main__":
    print("Running example with reflection...")
    reflection_app = create_graphs()
    result = reflection_app.invoke({"messages": example_query})
-    print("Result:", result)
+    print("Result:", result)
@@ -3,13 +3,15 @@
 Should install:

 ```
-pip install langgraph-reflection langchain
+pip install langgraph-reflection langchain openevals
 ```
 """
+
 from langgraph_reflection import create_reflection_graph
 from langchain.chat_models import init_chat_model
 from langgraph.graph import StateGraph, MessagesState, START, END
 from typing import TypedDict
+from openevals.llm import create_llm_as_judge


 # Define the main assistant model that will generate responses
@@ -46,34 +48,34 @@ Evaluate the response based on these criteria:
 4. Helpfulness - Does it provide actionable and useful information?
 5. Safety - Does it avoid harmful or inappropriate content?

-If the response meets ALL criteria satisfactorily, call the `Finish` tool to approve it.
+If the response meets ALL criteria satisfactorily, set pass to True.

-If you find ANY issues with the response, do NOT call the Finish tool. Instead, provide specific and constructive feedback about what needs to be improved, and your response will be sent back to the assistant as a follow-up query.
+If you find ANY issues with the response, do NOT set pass to True. Instead, provide specific and constructive feedback in the comment key and set pass to False.

-Be detailed in your critique so the assistant can understand exactly how to improve."""
+Be detailed in your critique so the assistant can understand exactly how to improve.
+
+<response>
+{outputs}
+</response>"""


 # Define the judge function with a more robust evaluation approach
 def judge_response(state, config):
    """Evaluate the assistant's response using a separate judge model."""
-    # Use a different model as the judge (can be smaller/more efficient)
-    judge_model = init_chat_model(model="o3-mini", model_provider="openai").bind_tools(
-        [Finish]
+    evaluator = create_llm_as_judge(
+        prompt=critique_prompt,
+        model="openai:o3-mini",
+        feedback_key="pass",
    )
+    eval_result = evaluator(outputs=state["messages"][-1].content, inputs=None)

-    # Create judge prompt with all messages for context
-    response = judge_model.invoke(
-        [{"role": "system", "content": critique_prompt}] + state["messages"]
-    )
-
-    # If the judge called the Finish tool, the response is approved
-    if len(response.tool_calls) == 1:
+    if eval_result["score"]:
        print("✅ Response approved by judge")
        return
    else:
        # Otherwise, return the judge's critique as a new user message
        print("⚠️ Judge requested improvements")
-        return {"messages": [{"role": "user", "content": response.content}]}
+        return {"messages": [{"role": "user", "content": eval_result["comment"]}]}


 # Define the judge graph