mirror of
https://github.com/langchain-ai/company-researcher.git
synced 2026-07-01 21:54:03 -04:00
expose raw search results in the output
This commit is contained in:
@@ -12,6 +12,9 @@ class Configuration:
|
||||
max_search_queries: int = 3 # Max search queries per company
|
||||
max_search_results: int = 3 # Max search results per query
|
||||
max_reflection_steps: int = 0 # Max reflection steps
|
||||
include_search_results: bool = (
|
||||
False # Whether to include search results in the output
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_runnable_config(
|
||||
|
||||
+11
-4
@@ -11,7 +11,7 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from agent.configuration import Configuration
|
||||
from agent.state import InputState, OutputState, OverallState
|
||||
from agent.utils import deduplicate_and_format_sources, format_all_notes
|
||||
from agent.utils import deduplicate_sources, format_sources, format_all_notes
|
||||
from agent.prompts import (
|
||||
EXTRACTION_PROMPT,
|
||||
REFLECTION_PROMPT,
|
||||
@@ -120,8 +120,9 @@ async def research_company(
|
||||
search_docs = await asyncio.gather(*search_tasks)
|
||||
|
||||
# Deduplicate and format sources
|
||||
source_str = deduplicate_and_format_sources(
|
||||
search_docs, max_tokens_per_source=1000, include_raw_content=True
|
||||
deduplicated_search_docs = deduplicate_sources(search_docs)
|
||||
source_str = format_sources(
|
||||
deduplicated_search_docs, max_tokens_per_source=1000, include_raw_content=True
|
||||
)
|
||||
|
||||
# Generate structured notes relevant to the extraction schema
|
||||
@@ -132,7 +133,13 @@ async def research_company(
|
||||
user_notes=state.user_notes,
|
||||
)
|
||||
result = await claude_3_5_sonnet.ainvoke(p)
|
||||
return {"completed_notes": [str(result.content)]}
|
||||
state_update = {
|
||||
"completed_notes": [str(result.content)],
|
||||
}
|
||||
if configurable.include_search_results:
|
||||
state_update["search_results"] = deduplicated_search_docs
|
||||
|
||||
return state_update
|
||||
|
||||
|
||||
def gather_notes_extract_schema(state: OverallState) -> dict[str, Any]:
|
||||
|
||||
@@ -68,6 +68,9 @@ class OverallState:
|
||||
search_queries: list[str] = field(default=None)
|
||||
"List of generated search queries to find relevant information"
|
||||
|
||||
search_results: list[dict] = field(default=None)
|
||||
"List of search results"
|
||||
|
||||
completed_notes: Annotated[list, operator.add] = field(default_factory=list)
|
||||
"Notes from completed research related to the schema"
|
||||
|
||||
@@ -99,3 +102,6 @@ class OutputState:
|
||||
based on the user's query and the graph's execution.
|
||||
This is the primary output of the enrichment process.
|
||||
"""
|
||||
|
||||
search_results: list[dict] = field(default=None)
|
||||
"List of search results"
|
||||
|
||||
+29
-10
@@ -1,10 +1,6 @@
|
||||
def deduplicate_and_format_sources(
|
||||
search_response, max_tokens_per_source, include_raw_content=True
|
||||
):
|
||||
def deduplicate_sources(search_response: dict | list[dict]) -> list[dict]:
|
||||
"""
|
||||
Takes either a single search response or list of responses from Tavily API and formats them.
|
||||
Limits the raw_content to approximately max_tokens_per_source.
|
||||
include_raw_content specifies whether to include the raw_content from Tavily in the formatted string.
|
||||
Takes either a single search response or list of responses from Tavily API and de-duplicates them based on the URL.
|
||||
|
||||
Args:
|
||||
search_response: Either:
|
||||
@@ -30,14 +26,37 @@ def deduplicate_and_format_sources(
|
||||
)
|
||||
|
||||
# Deduplicate by URL
|
||||
unique_sources = {}
|
||||
unique_urls = set()
|
||||
unique_sources_list = []
|
||||
for source in sources_list:
|
||||
if source["url"] not in unique_sources:
|
||||
unique_sources[source["url"]] = source
|
||||
if source["url"] not in unique_urls:
|
||||
unique_urls.add(source["url"])
|
||||
unique_sources_list.append(source)
|
||||
|
||||
return unique_sources_list
|
||||
|
||||
|
||||
def format_sources(
|
||||
sources_list: list[dict],
|
||||
include_raw_content: bool = True,
|
||||
max_tokens_per_source: int = 1000,
|
||||
) -> str:
|
||||
"""
|
||||
Takes a list of unique results from Tavily API and formats them.
|
||||
Limits the raw_content to approximately max_tokens_per_source.
|
||||
include_raw_content specifies whether to include the raw_content from Tavily in the formatted string.
|
||||
|
||||
Args:
|
||||
sources_list: list of unique results from Tavily API
|
||||
max_tokens_per_source: int, maximum number of tokens per each search result to include in the formatted string
|
||||
include_raw_content: bool, whether to include the raw_content from Tavily in the formatted string
|
||||
|
||||
Returns:
|
||||
str: Formatted string with deduplicated sources
|
||||
"""
|
||||
# Format output
|
||||
formatted_text = "Sources:\n\n"
|
||||
for i, source in enumerate(unique_sources.values(), 1):
|
||||
for source in sources_list:
|
||||
formatted_text += f"Source {source['title']}:\n===\n"
|
||||
formatted_text += f"URL: {source['url']}\n===\n"
|
||||
formatted_text += (
|
||||
|
||||
Reference in New Issue
Block a user