mirror of
https://github.com/run-llama/semtools.git
synced 2026-07-01 21:34:14 -04:00
feat: move separate commands to a unified interface
This commit is contained in:
+3
-18
@@ -11,24 +11,9 @@ keywords = ["semantic-search", "document-parsing", "cli", "pdf", "search"]
|
||||
readme = "README.md"
|
||||
|
||||
[[bin]]
|
||||
name = "parse"
|
||||
path = "src/bin/parse.rs"
|
||||
required-features = ["parse"]
|
||||
|
||||
[[bin]]
|
||||
name = "search"
|
||||
path = "src/bin/search.rs"
|
||||
required-features = ["search"]
|
||||
|
||||
[[bin]]
|
||||
name = "workspace"
|
||||
path = "src/bin/workspace.rs"
|
||||
required-features = ["workspace", "search"]
|
||||
|
||||
[[bin]]
|
||||
name = "ask"
|
||||
path = "src/bin/ask.rs"
|
||||
required-features = ["ask", "search"]
|
||||
name = "semtools"
|
||||
path = "src/bin/semtools.rs"
|
||||
required-features = ["ask", "search", "workspace", "parse"]
|
||||
|
||||
[dependencies]
|
||||
# Common dependencies
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use std::path::Path;
|
||||
|
||||
use semtools::{LlamaParseBackend, SemtoolsConfig};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about = "A CLI tool for parsing documents using various backends", long_about = None)]
|
||||
struct Args {
|
||||
/// Path to the config file. Defaults to ~/.semtools_config.json
|
||||
#[clap(short = 'c', long)]
|
||||
config: Option<String>,
|
||||
|
||||
/// The backend type to use for parsing. Defaults to `llama-parse`
|
||||
#[clap(short, long, default_value = "llama-parse")]
|
||||
backend: String,
|
||||
|
||||
/// Files to parse
|
||||
#[clap(required = true)]
|
||||
files: Vec<String>,
|
||||
|
||||
/// Verbose output while parsing
|
||||
#[clap(short, long)]
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
// Get config file path
|
||||
let config_path = args
|
||||
.config
|
||||
.unwrap_or_else(SemtoolsConfig::default_config_path);
|
||||
|
||||
// Load configuration
|
||||
let semtools_config = SemtoolsConfig::from_config_file(&config_path)?;
|
||||
let parse_config = semtools_config.parse.unwrap_or_default();
|
||||
|
||||
// Validate that files exist
|
||||
for file in &args.files {
|
||||
if !Path::new(file).exists() {
|
||||
eprintln!("Warning: File does not exist: {file}");
|
||||
}
|
||||
}
|
||||
|
||||
// Create backend and process files
|
||||
match args.backend.as_str() {
|
||||
"llama-parse" => {
|
||||
let backend = LlamaParseBackend::new(parse_config, args.verbose)?;
|
||||
let results = backend.parse(args.files).await?;
|
||||
|
||||
// Output the paths to parsed files, one per line
|
||||
for result_path in results {
|
||||
println!("{result_path}");
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
eprintln!(
|
||||
"Error: Unknown backend '{}'. Supported backends: llama-parse",
|
||||
args.backend
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use semtools::cmds::ask::ask_cmd;
|
||||
use semtools::cmds::parse::parse_cmd;
|
||||
use semtools::cmds::search::search_cmd;
|
||||
use semtools::cmds::workspace::{workspace_prune_cmd, workspace_status_cmd, workspace_use_cmd};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
struct SemtoolsArgs {
|
||||
#[command(subcommand)]
|
||||
cmd: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum WorkspaceCommands {
|
||||
/// Use or create a workspace (prints export command to run)
|
||||
Use { name: String },
|
||||
/// Show active workspace and basic stats
|
||||
Status,
|
||||
/// Remove stale or missing files from store
|
||||
Prune {},
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Commands {
|
||||
#[cfg(feature = "parse")]
|
||||
/// A CLI tool for parsing documents using various backends
|
||||
Parse {
|
||||
/// Path to the config file. Defaults to ~/.semtools_config.json
|
||||
#[clap(short = 'c', long)]
|
||||
config: Option<String>,
|
||||
|
||||
/// The backend type to use for parsing. Defaults to `llama-parse`
|
||||
#[clap(short, long, default_value = "llama-parse")]
|
||||
backend: String,
|
||||
|
||||
/// Files to parse
|
||||
#[clap(required = true)]
|
||||
files: Vec<String>,
|
||||
|
||||
/// Verbose output while parsing
|
||||
#[clap(short, long)]
|
||||
verbose: bool,
|
||||
},
|
||||
#[cfg(feature = "search")]
|
||||
/// A CLI tool for fast semantic keyword search
|
||||
Search {
|
||||
/// Query to search for (positional argument)
|
||||
query: String,
|
||||
|
||||
/// Files to search (positional arguments, optional if using stdin)
|
||||
#[arg(help = "Files to search, optional if using stdin")]
|
||||
files: Vec<String>,
|
||||
|
||||
/// How many lines before/after to return as context
|
||||
#[arg(short = 'n', long = "n-lines", alias = "context", default_value_t = 3)]
|
||||
n_lines: usize,
|
||||
|
||||
/// The top-k files or texts to return (ignored if max_distance is set)
|
||||
#[arg(long, default_value_t = 3)]
|
||||
top_k: usize,
|
||||
|
||||
/// Return all results with distance below this threshold (0.0+)
|
||||
#[arg(short = 'm', long = "max-distance", alias = "threshold")]
|
||||
max_distance: Option<f64>,
|
||||
|
||||
/// Perform case-insensitive search (default is false)
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
ignore_case: bool,
|
||||
|
||||
/// Output results in JSON format
|
||||
#[clap(short, long)]
|
||||
json: bool,
|
||||
},
|
||||
#[cfg(feature = "ask")]
|
||||
/// A CLI tool for document-based question-answering
|
||||
Ask {
|
||||
/// Query to prompt the agent with
|
||||
query: String,
|
||||
|
||||
/// Files to search (positional arguments, optional if using stdin)
|
||||
#[arg(help = "Files to search, optional if using stdin")]
|
||||
files: Vec<String>,
|
||||
|
||||
/// Path to the config file. Defaults to ~/.semtools_config.json
|
||||
#[clap(short = 'c', long)]
|
||||
config: Option<String>,
|
||||
|
||||
/// OpenAI API key (overrides config file and env var)
|
||||
#[clap(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// OpenAI base URL (overrides config file)
|
||||
#[clap(long)]
|
||||
base_url: Option<String>,
|
||||
|
||||
/// Model to use for the agent (overrides config file)
|
||||
#[clap(short, long)]
|
||||
model: Option<String>,
|
||||
|
||||
/// API mode to use: 'chat' or 'responses' (overrides config file)
|
||||
#[clap(long)]
|
||||
api_mode: Option<String>,
|
||||
|
||||
/// Output results in JSON or text format
|
||||
#[clap(short, long)]
|
||||
json: bool,
|
||||
},
|
||||
#[cfg(feature = "workspace")]
|
||||
/// Manage semtools workspaces
|
||||
Workspace {
|
||||
/// Output results in JSON format
|
||||
#[clap(short, long, global = true)]
|
||||
json: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: WorkspaceCommands,
|
||||
},
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let args = SemtoolsArgs::parse();
|
||||
match args.cmd {
|
||||
Commands::Ask {
|
||||
query,
|
||||
files,
|
||||
config,
|
||||
api_key,
|
||||
base_url,
|
||||
model,
|
||||
api_mode,
|
||||
json,
|
||||
} => {
|
||||
ask_cmd(
|
||||
query, files, config, api_key, base_url, model, api_mode, json,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Commands::Parse {
|
||||
config,
|
||||
backend,
|
||||
files,
|
||||
verbose,
|
||||
} => {
|
||||
parse_cmd(config, backend, files, verbose).await?;
|
||||
}
|
||||
Commands::Search {
|
||||
query,
|
||||
files,
|
||||
n_lines,
|
||||
top_k,
|
||||
max_distance,
|
||||
ignore_case,
|
||||
json,
|
||||
} => {
|
||||
search_cmd(
|
||||
query,
|
||||
files,
|
||||
n_lines,
|
||||
top_k,
|
||||
max_distance,
|
||||
ignore_case,
|
||||
json,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Commands::Workspace { json, command } => match command {
|
||||
WorkspaceCommands::Use { name } => {
|
||||
workspace_use_cmd(name, json).await?;
|
||||
}
|
||||
WorkspaceCommands::Prune {} => {
|
||||
workspace_prune_cmd(json).await?;
|
||||
}
|
||||
WorkspaceCommands::Status => {
|
||||
workspace_status_cmd(json).await?;
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,200 +0,0 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, Subcommand};
|
||||
|
||||
#[cfg(feature = "workspace")]
|
||||
use semtools::workspace::{Workspace, WorkspaceConfig, store::Store};
|
||||
|
||||
use semtools::json_mode::{PruneOutput, WorkspaceOutput};
|
||||
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
use semtools::json_mode::ErrorOutput;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about = "Manage semtools workspaces", long_about = None)]
|
||||
struct Args {
|
||||
/// Output results in JSON format
|
||||
#[clap(short, long, global = true)]
|
||||
json: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Commands {
|
||||
/// Use or create a workspace (prints export command to run)
|
||||
Use { name: String },
|
||||
/// Show active workspace and basic stats
|
||||
Status,
|
||||
/// Remove stale or missing files from store
|
||||
Prune {},
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
match args.command {
|
||||
Commands::Use { name } => {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
// Initialize new workspace configuration
|
||||
let ws = Workspace {
|
||||
config: WorkspaceConfig {
|
||||
name: name.clone(),
|
||||
root_dir: Workspace::root_path(&name)?,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
ws.save()?;
|
||||
|
||||
if args.json {
|
||||
// Try to get document count from store, or use 0 for new workspace
|
||||
let total_documents = if let Ok(store) = Store::open(&ws.config.root_dir) {
|
||||
if let Ok(stats) = store.get_stats() {
|
||||
stats.total_documents
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let output = WorkspaceOutput {
|
||||
name: ws.config.name.clone(),
|
||||
root_dir: ws.config.root_dir.clone(),
|
||||
total_documents,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else {
|
||||
println!("Workspace '{name}' configured.");
|
||||
println!("To activate it, run:");
|
||||
println!(" export SEMTOOLS_WORKSPACE={name}");
|
||||
println!();
|
||||
println!("Or add this to your shell profile (.bashrc, .zshrc, etc.)");
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if args.json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
}
|
||||
Commands::Status => {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
let _name = Workspace::active().context("No active workspace")?;
|
||||
let ws = Workspace::open()?;
|
||||
|
||||
// Open store and get stats
|
||||
let store = Store::open(&ws.config.root_dir)?;
|
||||
let stats = store.get_stats()?;
|
||||
|
||||
if args.json {
|
||||
let output = WorkspaceOutput {
|
||||
name: ws.config.name.clone(),
|
||||
root_dir: ws.config.root_dir.clone(),
|
||||
total_documents: stats.total_documents,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else {
|
||||
println!("Active workspace: {}", ws.config.name);
|
||||
println!("Root: {}", ws.config.root_dir);
|
||||
println!("Documents: {}", stats.total_documents);
|
||||
if stats.has_index {
|
||||
let index_info = stats.index_type.unwrap_or_else(|| "Unknown".to_string());
|
||||
println!("Index: Yes ({index_info})");
|
||||
} else {
|
||||
println!("Index: No");
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if args.json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
}
|
||||
Commands::Prune {} => {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
let _name = Workspace::active().context("No active workspace")?;
|
||||
let ws = Workspace::open()?;
|
||||
let store = Store::open(&ws.config.root_dir)?;
|
||||
|
||||
// Get all document paths from the workspace
|
||||
let all_paths = store.get_all_document_paths()?;
|
||||
let total_before = all_paths.len();
|
||||
|
||||
// Check which files no longer exist
|
||||
let mut missing_paths = Vec::new();
|
||||
for path in &all_paths {
|
||||
if !std::path::Path::new(path).exists() {
|
||||
missing_paths.push(path.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let files_removed = missing_paths.len();
|
||||
let files_remaining = total_before - files_removed;
|
||||
|
||||
if !missing_paths.is_empty() {
|
||||
// Remove stale documents
|
||||
store.delete_documents(&missing_paths)?;
|
||||
}
|
||||
|
||||
if args.json {
|
||||
let output = PruneOutput {
|
||||
files_removed,
|
||||
files_remaining,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else if missing_paths.is_empty() {
|
||||
println!("No stale documents found. Workspace is clean.");
|
||||
} else {
|
||||
println!("Found {} stale documents:", missing_paths.len());
|
||||
for path in &missing_paths {
|
||||
println!(" - {path}");
|
||||
}
|
||||
println!(
|
||||
"Removed {} stale documents from workspace.",
|
||||
missing_paths.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if args.json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,51 +1,15 @@
|
||||
use anyhow::Result;
|
||||
use async_openai::Client;
|
||||
use async_openai::config::OpenAIConfig;
|
||||
use clap::Parser;
|
||||
use model2vec_rs::model::StaticModel;
|
||||
use std::io::{self, BufRead, IsTerminal};
|
||||
|
||||
use semtools::SemtoolsConfig;
|
||||
use semtools::ask::chat_agent::{ask_agent, ask_agent_with_stdin};
|
||||
use semtools::ask::responses_agent::{ask_agent_responses, ask_agent_responses_with_stdin};
|
||||
use semtools::config::ApiMode;
|
||||
use semtools::json_mode::ErrorOutput;
|
||||
use semtools::search::MODEL_NAME;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about = "A CLI tool for fast semantic keyword search", long_about = None)]
|
||||
struct Args {
|
||||
/// Query to prompt the agent with
|
||||
query: String,
|
||||
|
||||
/// Files to search (positional arguments, optional if using stdin)
|
||||
#[arg(help = "Files to search, optional if using stdin")]
|
||||
files: Vec<String>,
|
||||
|
||||
/// Path to the config file. Defaults to ~/.semtools_config.json
|
||||
#[clap(short = 'c', long)]
|
||||
config: Option<String>,
|
||||
|
||||
/// OpenAI API key (overrides config file and env var)
|
||||
#[clap(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// OpenAI base URL (overrides config file)
|
||||
#[clap(long)]
|
||||
base_url: Option<String>,
|
||||
|
||||
/// Model to use for the agent (overrides config file)
|
||||
#[clap(short, long)]
|
||||
model: Option<String>,
|
||||
|
||||
/// API mode to use: 'chat' or 'responses' (overrides config file)
|
||||
#[clap(long)]
|
||||
api_mode: Option<String>,
|
||||
|
||||
/// Output results in JSON or text format
|
||||
#[clap(short, long)]
|
||||
json: bool,
|
||||
}
|
||||
use crate::SemtoolsConfig;
|
||||
use crate::ask::chat_agent::{ask_agent, ask_agent_with_stdin};
|
||||
use crate::ask::responses_agent::{ask_agent_responses, ask_agent_responses_with_stdin};
|
||||
use crate::config::ApiMode;
|
||||
use crate::json_mode::ErrorOutput;
|
||||
use crate::search::MODEL_NAME;
|
||||
|
||||
fn read_from_stdin() -> Result<Vec<String>> {
|
||||
let stdin = io::stdin();
|
||||
@@ -53,20 +17,24 @@ fn read_from_stdin() -> Result<Vec<String>> {
|
||||
Ok(lines?)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn ask_cmd(
|
||||
query: String,
|
||||
files: Vec<String>,
|
||||
config: Option<String>,
|
||||
api_key: Option<String>,
|
||||
base_url: Option<String>,
|
||||
model: Option<String>,
|
||||
api_mode: Option<String>,
|
||||
json: bool,
|
||||
) -> Result<()> {
|
||||
// Load configuration
|
||||
let config_path = args
|
||||
.config
|
||||
.unwrap_or_else(SemtoolsConfig::default_config_path);
|
||||
let config_path = config.unwrap_or_else(SemtoolsConfig::default_config_path);
|
||||
let semtools_config = SemtoolsConfig::from_config_file(&config_path)?;
|
||||
let ask_config = semtools_config.ask.unwrap_or_default();
|
||||
|
||||
// Resolve API key with priority: CLI arg > config file > env var > error
|
||||
let api_key = args
|
||||
.api_key
|
||||
let api_key = api_key
|
||||
.or(ask_config.api_key)
|
||||
.or_else(|| std::env::var("OPENAI_API_KEY").ok())
|
||||
.ok_or_else(|| {
|
||||
@@ -76,11 +44,10 @@ async fn main() -> Result<()> {
|
||||
})?;
|
||||
|
||||
// Resolve base URL with priority: CLI arg > config file > default
|
||||
let base_url = args.base_url.or(ask_config.base_url);
|
||||
let base_url = base_url.or(ask_config.base_url);
|
||||
|
||||
// Resolve model with priority: CLI arg > config file > default
|
||||
let model_name = args
|
||||
.model
|
||||
let model_name = model
|
||||
.or(ask_config.model)
|
||||
.unwrap_or_else(|| "gpt-4o-mini".to_string());
|
||||
|
||||
@@ -88,7 +55,7 @@ async fn main() -> Result<()> {
|
||||
let max_iterations = ask_config.max_iterations;
|
||||
|
||||
// Resolve API mode with priority: CLI arg > config file > default
|
||||
let api_mode = if let Some(mode_str) = args.api_mode {
|
||||
let api_mode = if let Some(mode_str) = api_mode {
|
||||
match mode_str.to_lowercase().as_str() {
|
||||
"chat" => ApiMode::Chat,
|
||||
"responses" => ApiMode::Responses,
|
||||
@@ -111,7 +78,7 @@ async fn main() -> Result<()> {
|
||||
let client = Client::with_config(openai_config);
|
||||
|
||||
// Check if we have stdin input (no files and stdin is not a terminal)
|
||||
if args.files.is_empty() && !io::stdin().is_terminal() {
|
||||
if files.is_empty() && !io::stdin().is_terminal() {
|
||||
let stdin_lines = read_from_stdin()?;
|
||||
if !stdin_lines.is_empty() {
|
||||
let stdin_content = stdin_lines.join("\n");
|
||||
@@ -119,20 +86,15 @@ async fn main() -> Result<()> {
|
||||
// Run the appropriate agent with stdin content (no tools)
|
||||
let output = match api_mode {
|
||||
ApiMode::Chat => {
|
||||
ask_agent_with_stdin(&stdin_content, &args.query, &client, &model_name).await?
|
||||
ask_agent_with_stdin(&stdin_content, &query, &client, &model_name).await?
|
||||
}
|
||||
ApiMode::Responses => {
|
||||
ask_agent_responses_with_stdin(
|
||||
&stdin_content,
|
||||
&args.query,
|
||||
&client,
|
||||
&model_name,
|
||||
)
|
||||
.await?
|
||||
ask_agent_responses_with_stdin(&stdin_content, &query, &client, &model_name)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("\n{}", json_output);
|
||||
} else {
|
||||
@@ -144,10 +106,10 @@ async fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
// If no stdin, we need files to search through
|
||||
if args.files.is_empty() {
|
||||
if files.is_empty() {
|
||||
let error_msg =
|
||||
"No input provided. Either specify files as arguments or pipe input to stdin.";
|
||||
if args.json {
|
||||
if json {
|
||||
let error_output = ErrorOutput {
|
||||
error: error_msg.to_string(),
|
||||
error_type: "NoInput".to_string(),
|
||||
@@ -172,30 +134,14 @@ async fn main() -> Result<()> {
|
||||
// Run the appropriate agent based on API mode
|
||||
let output = match api_mode {
|
||||
ApiMode::Chat => {
|
||||
ask_agent(
|
||||
args.files,
|
||||
&args.query,
|
||||
&model,
|
||||
&client,
|
||||
&model_name,
|
||||
max_iterations,
|
||||
)
|
||||
.await?
|
||||
ask_agent(files, &query, &model, &client, &model_name, max_iterations).await?
|
||||
}
|
||||
ApiMode::Responses => {
|
||||
ask_agent_responses(
|
||||
args.files,
|
||||
&args.query,
|
||||
&model,
|
||||
&client,
|
||||
&model_name,
|
||||
max_iterations,
|
||||
)
|
||||
.await?
|
||||
ask_agent_responses(files, &query, &model, &client, &model_name, max_iterations).await?
|
||||
}
|
||||
};
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("\n{}", json_output);
|
||||
} else {
|
||||
@@ -0,0 +1,11 @@
|
||||
#[cfg(feature = "ask")]
|
||||
pub mod ask;
|
||||
|
||||
#[cfg(feature = "parse")]
|
||||
pub mod parse;
|
||||
|
||||
#[cfg(feature = "search")]
|
||||
pub mod search;
|
||||
|
||||
#[cfg(feature = "workspace")]
|
||||
pub mod workspace;
|
||||
@@ -0,0 +1,47 @@
|
||||
use anyhow::Result;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{LlamaParseBackend, SemtoolsConfig};
|
||||
|
||||
pub async fn parse_cmd(
|
||||
config: Option<String>,
|
||||
backend: String,
|
||||
files: Vec<String>,
|
||||
verbose: bool,
|
||||
) -> Result<()> {
|
||||
// Get config file path
|
||||
let config_path = config.unwrap_or_else(SemtoolsConfig::default_config_path);
|
||||
|
||||
// Load configuration
|
||||
let semtools_config = SemtoolsConfig::from_config_file(&config_path)?;
|
||||
let parse_config = semtools_config.parse.unwrap_or_default();
|
||||
|
||||
// Validate that files exist
|
||||
for file in &files {
|
||||
if !Path::new(file).exists() {
|
||||
eprintln!("Warning: File does not exist: {file}");
|
||||
}
|
||||
}
|
||||
|
||||
// Create backend and process files
|
||||
match backend.as_str() {
|
||||
"llama-parse" => {
|
||||
let backend = LlamaParseBackend::new(parse_config, verbose)?;
|
||||
let results = backend.parse(files).await?;
|
||||
|
||||
// Output the paths to parsed files, one per line
|
||||
for result_path in results {
|
||||
println!("{result_path}");
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
eprintln!(
|
||||
"Error: Unknown backend '{}'. Supported backends: llama-parse",
|
||||
backend
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,50 +1,18 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use model2vec_rs::model::StaticModel;
|
||||
use std::io::{self, BufRead, IsTerminal};
|
||||
|
||||
#[cfg(feature = "workspace")]
|
||||
use semtools::workspace::{Workspace, store::RankedLine};
|
||||
use crate::workspace::{Workspace, store::RankedLine};
|
||||
|
||||
#[cfg(feature = "workspace")]
|
||||
use semtools::search::search_with_workspace;
|
||||
use crate::search::search_with_workspace;
|
||||
|
||||
use semtools::json_mode::{ErrorOutput, SearchOutput, SearchResultJSON};
|
||||
use semtools::search::{
|
||||
use crate::json_mode::{ErrorOutput, SearchOutput, SearchResultJSON};
|
||||
use crate::search::{
|
||||
Document, MODEL_NAME, SearchConfig, SearchResult, search_documents, search_files,
|
||||
};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about = "A CLI tool for fast semantic keyword search", long_about = None)]
|
||||
struct Args {
|
||||
/// Query to search for (positional argument)
|
||||
query: String,
|
||||
|
||||
/// Files to search (positional arguments, optional if using stdin)
|
||||
#[arg(help = "Files to search, optional if using stdin")]
|
||||
files: Vec<String>,
|
||||
|
||||
/// How many lines before/after to return as context
|
||||
#[arg(short = 'n', long = "n-lines", alias = "context", default_value_t = 3)]
|
||||
n_lines: usize,
|
||||
|
||||
/// The top-k files or texts to return (ignored if max_distance is set)
|
||||
#[arg(long, default_value_t = 3)]
|
||||
top_k: usize,
|
||||
|
||||
/// Return all results with distance below this threshold (0.0+)
|
||||
#[arg(short = 'm', long = "max-distance", alias = "threshold")]
|
||||
max_distance: Option<f64>,
|
||||
|
||||
/// Perform case-insensitive search (default is false)
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
ignore_case: bool,
|
||||
|
||||
/// Output results in JSON format
|
||||
#[clap(short, long)]
|
||||
json: bool,
|
||||
}
|
||||
|
||||
fn read_from_stdin() -> Result<Vec<String>> {
|
||||
let stdin = io::stdin();
|
||||
let lines: Result<Vec<String>, _> = stdin.lock().lines().collect();
|
||||
@@ -141,10 +109,15 @@ fn print_workspace_search_results(ranked_lines: &[RankedLine], n_lines: usize) {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
pub async fn search_cmd(
|
||||
query: String,
|
||||
files: Vec<String>,
|
||||
n_lines: usize,
|
||||
top_k: usize,
|
||||
max_distance: Option<f64>,
|
||||
ignore_case: bool,
|
||||
json: bool,
|
||||
) -> Result<()> {
|
||||
let model = StaticModel::from_pretrained(
|
||||
MODEL_NAME, // "minishlab/potion-multilingual-128M",
|
||||
None, // Optional: Hugging Face API token for private models
|
||||
@@ -152,25 +125,25 @@ async fn main() -> Result<()> {
|
||||
None, // Optional: subfolder if model files are not at the root of the repo/path
|
||||
)?;
|
||||
|
||||
let query = if args.ignore_case {
|
||||
args.query.to_lowercase()
|
||||
let query = if ignore_case {
|
||||
query.to_lowercase()
|
||||
} else {
|
||||
args.query.clone()
|
||||
query.clone()
|
||||
};
|
||||
|
||||
let query_embedding = model.encode_single(&query);
|
||||
let config = SearchConfig {
|
||||
n_lines: args.n_lines,
|
||||
top_k: args.top_k,
|
||||
max_distance: args.max_distance,
|
||||
ignore_case: args.ignore_case,
|
||||
n_lines,
|
||||
top_k,
|
||||
max_distance,
|
||||
ignore_case,
|
||||
};
|
||||
|
||||
// Handle stdin input (non-workspace mode)
|
||||
if args.files.is_empty() && !io::stdin().is_terminal() {
|
||||
if files.is_empty() && !io::stdin().is_terminal() {
|
||||
let stdin_lines = read_from_stdin()?;
|
||||
if !stdin_lines.is_empty() {
|
||||
let lines_for_embedding = if args.ignore_case {
|
||||
let lines_for_embedding = if ignore_case {
|
||||
stdin_lines.iter().map(|s| s.to_lowercase()).collect()
|
||||
} else {
|
||||
stdin_lines.clone()
|
||||
@@ -186,7 +159,7 @@ async fn main() -> Result<()> {
|
||||
|
||||
let search_results = search_documents(&documents, &query_embedding, &config);
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
let output = SearchOutput {
|
||||
results: search_results.iter().map(search_result_to_json).collect(),
|
||||
};
|
||||
@@ -200,10 +173,10 @@ async fn main() -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
if args.files.is_empty() {
|
||||
if files.is_empty() {
|
||||
let error_msg =
|
||||
"No input provided. Either specify files as arguments or pipe input to stdin.";
|
||||
if args.json {
|
||||
if json {
|
||||
let error_output = ErrorOutput {
|
||||
error: error_msg.to_string(),
|
||||
error_type: "NoInput".to_string(),
|
||||
@@ -222,21 +195,21 @@ async fn main() -> Result<()> {
|
||||
if Workspace::active().is_ok() {
|
||||
// Workspace mode: use persisted line embeddings for speed
|
||||
let config = SearchConfig {
|
||||
n_lines: args.n_lines,
|
||||
top_k: args.top_k,
|
||||
max_distance: args.max_distance,
|
||||
ignore_case: args.ignore_case,
|
||||
n_lines,
|
||||
top_k,
|
||||
max_distance,
|
||||
ignore_case,
|
||||
};
|
||||
let ranked_lines = search_with_workspace(&args.files, &query, &model, &config).await?;
|
||||
let ranked_lines = search_with_workspace(&files, &query, &model, &config).await?;
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
// Convert workspace results to SearchResultJSON
|
||||
let results: Vec<SearchResultJSON> = ranked_lines
|
||||
.iter()
|
||||
.map(|ranked_line| {
|
||||
let match_line_number = ranked_line.line_number as usize;
|
||||
let start = match_line_number.saturating_sub(args.n_lines);
|
||||
let end = match_line_number + args.n_lines + 1;
|
||||
let start = match_line_number.saturating_sub(n_lines);
|
||||
let end = match_line_number + n_lines + 1;
|
||||
|
||||
// Read file content for the result
|
||||
let content =
|
||||
@@ -264,12 +237,12 @@ async fn main() -> Result<()> {
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else {
|
||||
print_workspace_search_results(&ranked_lines, args.n_lines);
|
||||
print_workspace_search_results(&ranked_lines, n_lines);
|
||||
}
|
||||
} else {
|
||||
let search_results = search_files(&args.files, &query, &model, &config)?;
|
||||
let search_results = search_files(&files, &query, &model, &config)?;
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
let output = SearchOutput {
|
||||
results: search_results.iter().map(search_result_to_json).collect(),
|
||||
};
|
||||
@@ -283,9 +256,9 @@ async fn main() -> Result<()> {
|
||||
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
let search_results = search_files(&args.files, &query, &model, &config)?;
|
||||
let search_results = search_files(&files, &query, &model, &config)?;
|
||||
|
||||
if args.json {
|
||||
if json {
|
||||
let output = SearchOutput {
|
||||
results: search_results.iter().map(search_result_to_json).collect(),
|
||||
};
|
||||
@@ -0,0 +1,174 @@
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
#[cfg(feature = "workspace")]
|
||||
use crate::workspace::{Workspace, WorkspaceConfig, store::Store};
|
||||
|
||||
use crate::json_mode::{PruneOutput, WorkspaceOutput};
|
||||
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
use crate::json_mode::ErrorOutput;
|
||||
|
||||
pub async fn workspace_use_cmd(name: String, json: bool) -> Result<()> {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
// Initialize new workspace configuration
|
||||
let ws = Workspace {
|
||||
config: WorkspaceConfig {
|
||||
name: name.clone(),
|
||||
root_dir: Workspace::root_path(&name)?,
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
ws.save()?;
|
||||
|
||||
if json {
|
||||
// Try to get document count from store, or use 0 for new workspace
|
||||
let total_documents = if let Ok(store) = Store::open(&ws.config.root_dir) {
|
||||
if let Ok(stats) = store.get_stats() {
|
||||
stats.total_documents
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let output = WorkspaceOutput {
|
||||
name: ws.config.name.clone(),
|
||||
root_dir: ws.config.root_dir.clone(),
|
||||
total_documents,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else {
|
||||
println!("Workspace '{name}' configured.");
|
||||
println!("To activate it, run:");
|
||||
println!(" export SEMTOOLS_WORKSPACE={name}");
|
||||
println!();
|
||||
println!("Or add this to your shell profile (.bashrc, .zshrc, etc.)");
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn workspace_status_cmd(json: bool) -> Result<()> {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
let _name = Workspace::active().context("No active workspace")?;
|
||||
let ws = Workspace::open()?;
|
||||
|
||||
// Open store and get stats
|
||||
let store = Store::open(&ws.config.root_dir)?;
|
||||
let stats = store.get_stats()?;
|
||||
|
||||
if json {
|
||||
let output = WorkspaceOutput {
|
||||
name: ws.config.name.clone(),
|
||||
root_dir: ws.config.root_dir.clone(),
|
||||
total_documents: stats.total_documents,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else {
|
||||
println!("Active workspace: {}", ws.config.name);
|
||||
println!("Root: {}", ws.config.root_dir);
|
||||
println!("Documents: {}", stats.total_documents);
|
||||
if stats.has_index {
|
||||
let index_info = stats.index_type.unwrap_or_else(|| "Unknown".to_string());
|
||||
println!("Index: Yes ({index_info})");
|
||||
} else {
|
||||
println!("Index: No");
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn workspace_prune_cmd(json: bool) -> Result<()> {
|
||||
#[cfg(feature = "workspace")]
|
||||
{
|
||||
let _name = Workspace::active().context("No active workspace")?;
|
||||
let ws = Workspace::open()?;
|
||||
let store = Store::open(&ws.config.root_dir)?;
|
||||
|
||||
// Get all document paths from the workspace
|
||||
let all_paths = store.get_all_document_paths()?;
|
||||
let total_before = all_paths.len();
|
||||
|
||||
// Check which files no longer exist
|
||||
let mut missing_paths = Vec::new();
|
||||
for path in &all_paths {
|
||||
if !std::path::Path::new(path).exists() {
|
||||
missing_paths.push(path.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let files_removed = missing_paths.len();
|
||||
let files_remaining = total_before - files_removed;
|
||||
|
||||
if !missing_paths.is_empty() {
|
||||
// Remove stale documents
|
||||
store.delete_documents(&missing_paths)?;
|
||||
}
|
||||
|
||||
if json {
|
||||
let output = PruneOutput {
|
||||
files_removed,
|
||||
files_remaining,
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&output)?;
|
||||
println!("{}", json_output);
|
||||
} else if missing_paths.is_empty() {
|
||||
println!("No stale documents found. Workspace is clean.");
|
||||
} else {
|
||||
println!("Found {} stale documents:", missing_paths.len());
|
||||
for path in &missing_paths {
|
||||
println!(" - {path}");
|
||||
}
|
||||
println!(
|
||||
"Removed {} stale documents from workspace.",
|
||||
missing_paths.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "workspace"))]
|
||||
{
|
||||
if json {
|
||||
let error_output = ErrorOutput {
|
||||
error: "workspace feature not enabled".to_string(),
|
||||
error_type: "FeatureNotEnabled".to_string(),
|
||||
};
|
||||
let json_output = serde_json::to_string_pretty(&error_output)?;
|
||||
eprintln!("{}", json_output);
|
||||
} else {
|
||||
println!("workspace feature not enabled");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -3,6 +3,7 @@
|
||||
pub mod config;
|
||||
pub use config::{AskConfig, SemtoolsConfig};
|
||||
|
||||
pub mod cmds;
|
||||
pub mod json_mode;
|
||||
|
||||
#[cfg(feature = "parse")]
|
||||
|
||||
Reference in New Issue
Block a user