Cleanup example metadata parsing utilities (#20251) (#20252)

## Which issue does this PR close?  - Closes https://github.com/apache/datafusion/issues/20251. ## Rationale for this change  ## What changes are included in this PR?  ## Are these changes tested?  ## Are there any user-facing changes?
2026-07-01 21:24:06 -04:00 · 2026-02-11 20:13:54 +04:00
parent 6b6f64d7fb
commit 17416bf92f
6 changed files with 56 additions and 36 deletions
@@ -18,9 +18,8 @@
 use std::path::{Path, PathBuf};

 use datafusion::dataframe::DataFrameWriteOptions;
-use datafusion::error::Result;
+use datafusion::error::{DataFusionError, Result};
 use datafusion::prelude::{CsvReadOptions, SessionContext};
-use datafusion_common::DataFusionError;
 use tempfile::TempDir;
 use tokio::fs::create_dir_all;

@@ -18,8 +18,7 @@
 use std::path::PathBuf;

 use arrow_schema::SchemaRef;
-use datafusion::error::Result;
-use datafusion_common::DataFusionError;
+use datafusion::error::{DataFusionError, Result};

 pub mod cars;
 pub mod regex;
@@ -50,10 +49,11 @@ impl ExampleDataset {
    }

    pub fn path_str(&self) -> Result<String> {
-        self.path().to_str().map(String::from).ok_or_else(|| {
+        let path = self.path();
+        path.to_str().map(String::from).ok_or_else(|| {
            DataFusionError::Execution(format!(
                "CSV directory path is not valid UTF-8: {}",
-                self.path().display()
+                path.display()
            ))
        })
    }
@@ -20,10 +20,12 @@
 //! An example group is defined as a directory containing a `main.rs` file
 //! under the examples root. This module is intentionally filesystem-focused
 //! and does not perform any parsing or rendering.
+//! Discovery fails if no valid example groups are found.

 use std::fs;
 use std::path::{Path, PathBuf};

+use datafusion::common::exec_err;
 use datafusion::error::Result;

 /// Discovers all example group directories under the given root.
@@ -35,10 +37,15 @@ pub fn discover_example_groups(root: &Path) -> Result<Vec<PathBuf>> {
        let entry = entry?;
        let path = entry.path();

-        if path.is_dir() && path.join("main.rs").exists() {
+        if path.is_dir() && path.join("main.rs").is_file() {
            groups.push(path);
        }
    }
+
+    if groups.is_empty() {
+        return exec_err!("No example groups found under: {}", root.display());
+    }
+
    groups.sort();
    Ok(groups)
 }
@@ -47,6 +54,8 @@ pub fn discover_example_groups(root: &Path) -> Result<Vec<PathBuf>> {
 mod tests {
    use super::*;

+    use crate::utils::example_metadata::test_utils::assert_exec_err_contains;
+
    use std::fs::{self, File};

    use tempfile::TempDir;
@@ -66,10 +75,29 @@ mod tests {
        fs::create_dir(&group2)?;

        let groups = discover_example_groups(root)?;
-
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0], group1);
+        Ok(())
+    }

+    #[test]
+    fn discover_example_groups_errors_if_main_rs_is_a_directory() -> Result<()> {
+        let tmp = TempDir::new()?;
+        let root = tmp.path();
+        let group = root.join("group");
+        fs::create_dir(&group)?;
+        fs::create_dir(group.join("main.rs"))?;
+
+        let err = discover_example_groups(root).unwrap_err();
+        assert_exec_err_contains(err, "No example groups found");
+        Ok(())
+    }
+
+    #[test]
+    fn discover_example_groups_errors_if_none_found() -> Result<()> {
+        let tmp = TempDir::new()?;
+        let err = discover_example_groups(tmp.path()).unwrap_err();
+        assert_exec_err_contains(err, "No example groups found");
        Ok(())
    }
 }
@@ -25,7 +25,16 @@ use std::path::Path;

 use datafusion::error::{DataFusionError, Result};

-use crate::utils::example_metadata::{parse_main_rs_docs, render::ABBREVIATIONS};
+use crate::utils::example_metadata::parse_main_rs_docs;
+
+/// Well-known abbreviations used to preserve correct capitalization
+/// when generating human-readable documentation titles.
+const ABBREVIATIONS: &[(&str, &str)] = &[
+    ("dataframe", "DataFrame"),
+    ("io", "IO"),
+    ("sql", "SQL"),
+    ("udf", "UDF"),
+];

 /// A group of related examples (e.g. `builtin_functions`, `udf`).
 ///
@@ -21,15 +21,16 @@
 //! and their associated metadata (file name and description), enforcing
 //! a strict ordering and structure to avoid ambiguous documentation.

-use std::path::Path;
-use std::{collections::HashSet, fs};
+use std::{collections::HashSet, fs, path::Path};

-use datafusion_common::{DataFusionError, Result};
+use datafusion::common::exec_err;
+use datafusion::error::Result;
 use nom::{
-    IResult, Parser,
+    Err, IResult, Parser,
    bytes::complete::{tag, take_until, take_while},
    character::complete::multispace0,
    combinator::all_consuming,
+    error::{Error, ErrorKind},
    sequence::{delimited, preceded},
 };

@@ -77,19 +78,13 @@ fn parse_metadata_line(input: &str) -> IResult<&str, (&str, &str)> {
    let content = payload
        .strip_prefix("(")
        .and_then(|s| s.strip_suffix(")"))
-        .ok_or_else(|| {
-            nom::Err::Error(nom::error::Error::new(payload, nom::error::ErrorKind::Tag))
-        })?;
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?;

    let (file, desc) = content
        .strip_prefix("file:")
-        .ok_or_else(|| {
-            nom::Err::Error(nom::error::Error::new(payload, nom::error::ErrorKind::Tag))
-        })?
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?
        .split_once(", desc:")
-        .ok_or_else(|| {
-            nom::Err::Error(nom::error::Error::new(payload, nom::error::ErrorKind::Tag))
-        })?;
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?;

    Ok((rest, (file.trim(), desc.trim())))
 }
@@ -119,18 +114,16 @@ pub fn parse_main_rs_docs(path: &Path) -> Result<Vec<ExampleEntry>> {
            let subcommand = match state {
                ParserState::SeenSubcommand(s) => s,
                ParserState::Idle => {
-                    return Err(DataFusionError::Execution(format!(
+                    return exec_err!(
                        "Metadata without preceding subcommand at {}:{}",
                        path.display(),
                        line_no + 1
-                    )));
+                    );
                }
            };

            if !seen_subcommands.insert(subcommand) {
-                return Err(DataFusionError::Execution(format!(
-                    "Duplicate metadata for subcommand `{subcommand}`"
-                )));
+                return exec_err!("Duplicate metadata for subcommand `{subcommand}`");
            }

            entries.push(ExampleEntry {
@@ -85,15 +85,6 @@ cargo run --example dataframe -- dataframe
 ```
 "#;

-/// Well-known abbreviations used to preserve correct capitalization
-/// when generating human-readable documentation titles.
-pub const ABBREVIATIONS: &[(&str, &str)] = &[
-    ("dataframe", "DataFrame"),
-    ("io", "IO"),
-    ("sql", "SQL"),
-    ("udf", "UDF"),
-];
-
 /// Generates Markdown documentation for DataFusion examples.
 ///
 /// If `group` is `None`, documentation is generated for all example groups.