This commit is contained in:
Harrison Chase
2025-03-10 18:07:50 -07:00
parent f3d2f05243
commit 5a858d1ecd
3 changed files with 65 additions and 52 deletions
+13 -8
View File
@@ -45,7 +45,7 @@ In this example, the reflection agent uses another LLM to judge its output. The
Installation:
```
pip install langgraph-reflection langchain
pip install langgraph-reflection langchain openevals
```
Example usage:
@@ -56,15 +56,20 @@ assistant_graph = ...
# Define the judge function that evaluates responses
def judge_response(state, config):
"""Evaluate the assistant's response using a separate judge model."""
judge_model = init_chat_model(...).bind_tools([Finish])
response = judge_model.invoke([...])
# If the judge called Finish, response is approved
if len(response.tool_calls) == 1:
evaluator = create_llm_as_judge(
prompt=critique_prompt,
model="openai:o3-mini",
feedback_key="pass",
)
eval_result = evaluator(outputs=state["messages"][-1].content, inputs=None)
if eval_result["score"]:
print("✅ Response approved by judge")
return
else:
# Return judge's critique as a new user message
return {"messages": [{"role": "user", "content": response.content}]}
# Otherwise, return the judge's critique as a new user message
print("⚠️ Judge requested improvements")
return {"messages": [{"role": "user", "content": eval_result["comment"]}]}
# Create graphs with reflection
judge_graph = StateGraph(MessagesState).add_node(judge_response)...
+35 -29
View File
@@ -20,35 +20,36 @@ from langgraph_reflection import create_reflection_graph
def analyze_with_pyright(code_string: str) -> dict:
    """Analyze Python code using Pyright for static type checking and errors.

    The code is written to a temporary ``.py`` file, ``pyright --outputjson
    --level error`` is run on that file, and the temporary file is removed
    afterwards whether or not the analysis succeeded.

    Args:
        code_string: The Python code to analyze as a string

    Returns:
        dict: The parsed Pyright JSON report, or a dict with the keys
            ``"error"`` and ``"raw_output"`` when Pyright's stdout is not
            valid JSON.

    Raises:
        FileNotFoundError: If the ``pyright`` executable cannot be launched.
    """
    # mkstemp (rather than NamedTemporaryFile(delete=False)) lets the write
    # live inside the try/finally, so a failed write cannot leak the file.
    fd, temp_path = tempfile.mkstemp(suffix=".py")
    try:
        # Explicit UTF-8: the code may contain non-ASCII characters that the
        # platform default encoding cannot represent.
        with os.fdopen(fd, "w", encoding="utf-8") as temp:
            temp.write(code_string)

        result = subprocess.run(
            [
                "pyright",
                "--outputjson",
                "--level",
                "error",  # Only report errors, not warnings
                temp_path,
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",  # decode the JSON report independently of the locale
        )

        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            return {
                "error": "Failed to parse Pyright output",
                "raw_output": result.stdout,
            }
    finally:
        os.unlink(temp_path)
@@ -56,10 +57,10 @@ def analyze_with_pyright(code_string: str) -> dict:
def call_model(state: dict) -> dict:
"""Process the user query with a Claude 3 Sonnet model.
Args:
state: The current conversation state
Returns:
dict: Updated state with model response
"""
@@ -70,11 +71,13 @@ def call_model(state: dict) -> dict:
# Define type classes for code extraction
# Tool schema offered to the extraction model via bind_tools in try_running.
# When the model's first tool call has this name, its python_code argument is
# passed to analyze_with_pyright.
class ExtractPythonCode(TypedDict):
    """Type class for extracting Python code. The python_code field is the code to be extracted."""

    # The extracted Python source, as a single string.
    python_code: str
# Tool schema offered to the extraction model alongside ExtractPythonCode.
# try_running returns None whenever the first tool call is not
# ExtractPythonCode, so choosing this tool skips the Pyright check.
class NoCode(TypedDict):
    """Type class for indicating no code was found."""

    # NOTE(review): this value is not read anywhere in the visible code;
    # presumably only the choice of tool matters — confirm.
    no_code: bool
@@ -90,35 +93,39 @@ If there is no code to extract - call NoCode."""
def try_running(state: dict) -> dict | None:
    """Attempt to run and analyze the extracted Python code.

    An ``o3-mini`` model bound to the ``ExtractPythonCode`` and ``NoCode``
    tools is asked to pull code out of ``state["messages"]``. If its first
    tool call is ``ExtractPythonCode``, the code is checked with
    ``analyze_with_pyright`` and, when errors are reported, a follow-up user
    message asking for a fix is returned.

    Args:
        state: The current conversation state; ``state["messages"]`` is
            appended to the system prompt for the extraction call.

    Returns:
        dict | None: ``{"messages": [...]}`` holding one user message with the
            Pyright diagnostics when the error count is non-zero. ``None``
            when no tool was called, the first tool call is not
            ``ExtractPythonCode``, the Pyright report has no usable
            ``summary``, or no errors were found.
    """
    model = init_chat_model(model="o3-mini")
    extraction = model.bind_tools([ExtractPythonCode, NoCode])
    er = extraction.invoke(
        [{"role": "system", "content": SYSTEM_PROMPT}] + state["messages"]
    )
    if not er.tool_calls:
        return None
    tc = er.tool_calls[0]
    if tc["name"] != "ExtractPythonCode":
        return None

    result = analyze_with_pyright(tc["args"]["python_code"])
    print(result)

    # analyze_with_pyright can return {"error": ..., "raw_output": ...} when
    # the Pyright output is not valid JSON; that dict has neither "summary"
    # nor "generalDiagnostics", so look both up defensively instead of
    # raising KeyError.
    summary = result.get("summary")
    if not isinstance(summary, dict) or not summary.get("errorCount"):
        return None

    explanation = result.get("generalDiagnostics", [])
    return {
        "messages": [
            {
                "role": "user",
                "content": f"I ran pyright and found this: {explanation}\n\n"
                "Try to fix it. Make sure to regenerate the entire code snippet. "
                "If you are not sure what is wrong, or think there is a mistake, "
                "you can ask me a question rather than generating code",
            }
        ]
    }
@@ -146,7 +153,6 @@ def create_graphs():
return create_reflection_graph(assistant_graph, judge_graph).compile()
if __name__ == "__main__":
"""Run an example query through the reflection system."""
example_query = [
@@ -159,4 +165,4 @@ if __name__ == "__main__":
print("Running example with reflection...")
reflection_app = create_graphs()
result = reflection_app.invoke({"messages": example_query})
print("Result:", result)
print("Result:", result)
+17 -15
View File
@@ -3,13 +3,15 @@
Should install:
```
pip install langgraph-reflection langchain
pip install langgraph-reflection langchain openevals
```
"""
from langgraph_reflection import create_reflection_graph
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, MessagesState, START, END
from typing import TypedDict
from openevals.llm import create_llm_as_judge
# Define the main assistant model that will generate responses
@@ -46,34 +48,34 @@ Evaluate the response based on these criteria:
4. Helpfulness - Does it provide actionable and useful information?
5. Safety - Does it avoid harmful or inappropriate content?
If the response meets ALL criteria satisfactorily, call the `Finish` tool to approve it.
If the response meets ALL criteria satisfactorily, set pass to True.
If you find ANY issues with the response, do NOT call the Finish tool. Instead, provide specific and constructive feedback about what needs to be improved, and your response will be sent back to the assistant as a follow-up query.
If you find ANY issues with the response, do NOT set pass to True. Instead, provide specific and constructive feedback in the comment key and set pass to False.
Be detailed in your critique so the assistant can understand exactly how to improve."""
Be detailed in your critique so the assistant can understand exactly how to improve.
<response>
{outputs}
</response>"""
# Define the judge function with a more robust evaluation approach
def judge_response(state, config):
    """Grade the latest assistant message with an LLM-as-judge evaluator.

    An evaluator is built from ``critique_prompt`` using the
    ``openai:o3-mini`` model and the feedback key ``"pass"``, then applied to
    the content of the last message in ``state["messages"]``.

    Args:
        state: Conversation state; only ``state["messages"][-1].content`` is
            read.
        config: Accepted as part of the signature; not used in the body.

    Returns:
        ``None`` when the evaluator's ``"score"`` is truthy (response
        approved); otherwise ``{"messages": [...]}`` with a single user
        message whose content is the evaluator's ``"comment"``.
    """
    judge = create_llm_as_judge(
        prompt=critique_prompt,
        model="openai:o3-mini",
        feedback_key="pass",
    )
    latest_reply = state["messages"][-1].content
    verdict = judge(outputs=latest_reply, inputs=None)

    # Approved: return nothing, so no critique message is added.
    if verdict["score"]:
        print("✅ Response approved by judge")
        return None

    # Rejected: hand the judge's critique back as a new user message.
    print("⚠️ Judge requested improvements")
    critique = {"role": "user", "content": verdict["comment"]}
    return {"messages": [critique]}
# Define the judge graph