Update to a new version with better boundaries.

XMLHttpRequest now XML|Http|Request instead of XMLH|ttp|Request.
2024-11-23 07:10:27 +00:00 · 2017-03-27 16:32:18 -07:00 · 2017-03-27 16:32:18 -07:00 · f9b413746f
commit f9b413746f
parent 8af174b8bb
9 changed files with 128 additions and 81 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,7 +1,7 @@
 [package]
 authors = ["Without Boats <woboats@gmail.com>"]
 name = "heck"
-version = "0.1.0"
+version = "0.2.0"
 license = "MIT OR Apache-2.0"
 description = "heck is a case conversion library."
 homepage = "https://github.com/withoutboats/heck"
--- a/README.md
+++ b/README.md
@ -12,13 +12,14 @@ Word boundaries are defined as the "unicode words" defined in the
 `unicode_segmentation` library, as well as within those words in this manner:

 1. All underscore characters are considered word boundaries.
-2. A single uppercase letter (followed by no letters or by lowercase letters)
-is considered to be just after a word boundary.
-3. Multiple consecutive uppercase letters are considered to be between two
-word boundaries.
+2. If an uppercase character is followed by lowercase letters, a word boundary
+is considered to be just prior to that uppercase character.
+3. If multiple uppercase characters are consecutive, they are considered to be
+within a single word, except that the last will be part of the next word if it
+is followed by lowercase characters (see rule 2).

-That is, "HelloWorld" is segmented "Hello World" whereas "HELLOworld" is
-segmented "HELLO world."
+That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is
+segmented `XML|Http|Request`.

 Characters not within words (such as spaces, punctuations, and underscores)
 are not included in the output string except as they are a part of the case
--- a/src/camel.rs
+++ b/src/camel.rs
@ -22,13 +22,7 @@ pub trait CamelCase: ToOwned {

 impl CamelCase for str {
    fn to_camel_case(&self) -> String {
-        ::transform(self, |c, s| s.extend(c.to_uppercase()), |c, s| {
-            if s.len() == 0 {
-                s.extend(c.to_uppercase())
-            } else {
-                s.extend(c.to_lowercase())
-            }
-        })
+        // Capitalize each word; the no-op boundary callback joins words with no separator.
+        ::transform(self, ::capitalize, |_| {})
    }
 }

@ -52,6 +46,7 @@ mod tests {
    t!(test5: "kebab-case" => "KebabCase");
    t!(test6: "SHOUTY_SNAKE_CASE" => "ShoutySnakeCase");
    t!(test7: "snake_case" => "SnakeCase");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "ThisContainsAllKindsOfWordBoundaries");
-    // TODO unicode tests
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "ThisContainsAllKindsOfWordBoundaries");
+    t!(test9: "XΣXΣ baﬄe" => "XσxςBaﬄe");
+    t!(test10: "XMLHttpRequest" => "XmlHttpRequest");
 }
--- a/src/kebab.rs
+++ b/src/kebab.rs
@ -21,10 +21,7 @@ pub trait KebabCase: ToOwned {

 impl KebabCase for str {
    fn to_kebab_case(&self) -> Self::Owned {
-        ::transform(self, |c, s| {
-            s.push('-');
-            s.extend(c.to_lowercase())
-        }, |c, s| s.extend(c.to_lowercase()))
+        // Lowercase each word and push a `-` before every word except the first.
+        ::transform(self, ::lowercase, |s| s.push('-'))
    }
 }

@ -48,5 +45,7 @@ mod tests {
    t!(test5: "kebab-case" => "kebab-case");
    t!(test6: "SHOUTY_SNAKE_CASE" => "shouty-snake-case");
    t!(test7: "snake_case" => "snake-case");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "this-contains-all-kinds-of-word-boundaries");
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "this-contains-all-kinds-of-word-boundaries");
+    t!(test9: "XΣXΣ baﬄe" => "xσxς-baﬄe");
+    t!(test10: "XMLHttpRequest" => "xml-http-request");
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -10,13 +10,21 @@
 //! `unicode_segmentation` library, as well as within those words in this manner:
 //! 
 //! 1. All underscore characters are considered word boundaries.
-//! 2. A single uppercase letter (followed by no letters or by lowercase letters)
-//! is considered to be just after a word boundary.
-//! 3. Multiple consecutive uppercase letters are considered to be between two
-//! word boundaries.
+//! 2. If an uppercase character is followed by lowercase letters, a word boundary
+//! is considered to be just prior to that uppercase character.
+//! 3. If multiple uppercase characters are consecutive, they are considered to be
+//! within a single word, except that the last will be part of the next word if it
+//! is followed by lowercase characters (see rule 2).
 //! 
-//! That is, "HelloWorld" is segmented "Hello World" whereas "HELLOworld" is
-//! segmented "HELLO world."
+//! That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is
+//! segmented `XML|Http|Request`.
+//! 
+//! Characters not within words (such as spaces, punctuation, and underscores)
+//! are not included in the output string except as they are a part of the case
+//! being converted to. Multiple adjacent word boundaries (such as a series of
+//! underscores) are folded into one. ("hello__world" in snake case is therefore
+//! "hello_world", not the exact same string). Leading or trailing word boundary
+//! indicators are dropped, except insofar as CamelCase capitalizes the first word.
 //! 
 //! ### Cases contained in this library:
 //! 
@ -45,50 +53,101 @@ pub use title::TitleCase;

 use unicode_segmentation::UnicodeSegmentation;

-fn transform<F, G>(s: &str, word_boundary: F, not_word_boundary: G) -> String
+/// Splits `s` into words and builds the converted output string from them.
+///
+/// `with_word` is called once per detected word and appends that word's
+/// converted form to the output. `boundary` is called before every word except
+/// the first and appends the separator, if the target case has one.
+///
+/// Words are the `unicode_words` of `s`, further split:
+///
+/// * at underscores (which are dropped),
+/// * before an uppercase character that follows a non-uppercase one,
+/// * before the last character of an uppercase run when that character is
+///   followed by a lowercase one (`XMLHttp` splits as `XML|Http`).
+fn transform<F, G>(s: &str, with_word: F, boundary: G) -> String
 where
-    F: Fn(char, &mut String),
-    G: Fn(char, &mut String),
+    F: Fn(&str, &mut String),
+    G: Fn(&mut String)
 {
+    // Emits the word `$s[$init..$next]` and moves `$init` to `$next`. It mirrors
+    // the inline emit sequences in the loop below, which do not invoke it yet.
+    macro_rules! apply {
+        ($s:ident [ $init:ident .. $next:ident ], $out:ident, $boundary:ident, $with_word:ident, $first_word:ident) => {
+            if !$first_word {
+                $boundary(&mut $out);
+            }
+            $with_word(&$s[$init..$next], &mut $out);
+            $first_word = false;
+            $init = $next;
+        };
+    }
+    
    let mut out = String::new();
-    let mut after_word_boundary = false;
+    let mut first_word = true;

    for word in s.unicode_words() {
-        if out.len() != 0 { after_word_boundary = true; }
-        let mut last_c_was_uppercase = false;
-        let mut multiple_uppercase = false;
+        let mut char_indices = word.char_indices().peekable();
+        let mut init = 0;
+        let mut previous_is_uppercase = false;

-        for c in word.chars() {
+        while let Some((i, c)) = char_indices.next() {
+            // Skip underscore characters
            if c == '_' {
-                after_word_boundary = true;
+                // No word is pending here, so move the word start past the underscore.
+                if init == i { init += 1; }
                continue
            }

-            if c.is_uppercase() {
-                if out.len() != 0 && !last_c_was_uppercase {
-                    after_word_boundary = true;
+            match char_indices.peek() {
+                // The next character is an underscore: the pending word ends with `c`.
+                Some(&(next_i, next)) if next == '_' => {
+                    if !first_word { boundary(&mut out); }
+                    with_word(&word[init..next_i], &mut out);
+                    first_word = false;
+                    init = next_i;
+                    // The underscore ends the word, so the character after it must
+                    // not be treated as continuing an uppercase run. Otherwise
+                    // "A_Bc" would emit an empty word between "A" and "Bc".
+                    previous_is_uppercase = false;
                }

-                if last_c_was_uppercase {
-                    multiple_uppercase = true;
-                }
-                last_c_was_uppercase = true;
-            } else {
-                if multiple_uppercase && !after_word_boundary {
-                    after_word_boundary = true;
+                // `c` is uppercase: when it follows another uppercase character and
+                // precedes a lowercase one, it starts a new word.
+                Some(&(_, next)) if c.is_uppercase() => {
+                    if next.is_lowercase() && previous_is_uppercase {
+                        if !first_word { boundary(&mut out); }
+                        with_word(&word[init..i], &mut out);
+                        first_word = false;
+                        init = i;
+                    }
+                    previous_is_uppercase = true;
                }

-                multiple_uppercase = false;
-                last_c_was_uppercase = false;
+                // `c` is not uppercase: a following uppercase character starts a new word.
+                Some(&(next_i, next)) => {
+                    if next.is_uppercase() {
+                        if !first_word { boundary(&mut out); }
+                        with_word(&word[init..next_i], &mut out);
+                        first_word = false;
+                        init = next_i;
+                    }
+                    previous_is_uppercase = false;
+                }
+
+                // End of this unicode word: flush whatever is still pending.
+                None => {
+                    if !first_word { boundary(&mut out); }
+                    with_word(&word[init..], &mut out);
+                    first_word = false;
+                    break;
+                }
            }
-            if after_word_boundary {
-                word_boundary(c, &mut out);
-            } else {
-                not_word_boundary(c, &mut out);
-            }
-            after_word_boundary = false;
        }
    }

    out
 }
+
+/// Appends the lowercase form of `s` to `out`.
+///
+/// A capital sigma in the last position of `s` is written as the final-form
+/// `ς`; every other character goes through `char::to_lowercase`.
+fn lowercase(s: &str, out: &mut String) {
+    for (start, ch) in s.char_indices() {
+        // The character is the last one when it ends exactly at the end of `s`.
+        let is_last = start + ch.len_utf8() == s.len();
+        if is_last && ch == 'Σ' {
+            out.push('ς');
+        } else {
+            out.extend(ch.to_lowercase());
+        }
+    }
+}
+
+/// Appends the uppercase form of every character of `s` to `out`.
+fn uppercase(s: &str, out: &mut String) {
+    // One character may expand to several when uppercased, hence `flat_map`.
+    out.extend(s.chars().flat_map(char::to_uppercase));
+}
+
+/// Appends `s` to `out` with its first character uppercased and the remainder
+/// passed through `lowercase`. An empty `s` appends nothing.
+fn capitalize(s: &str, out: &mut String) {
+    let mut chars = s.chars();
+    if let Some(first) = chars.next() {
+        out.extend(first.to_uppercase());
+        // Whatever the iterator has not consumed is the tail after the first character.
+        let rest = chars.as_str();
+        if !rest.is_empty() {
+            lowercase(rest, out);
+        }
+    }
+}
--- a/src/mixed.rs
+++ b/src/mixed.rs
@ -22,7 +22,10 @@ pub trait MixedCase: ToOwned {

 impl MixedCase for str {
    fn to_mixed_case(&self) -> String {
-        ::transform(self, |c, s| s.extend(c.to_uppercase()), |c, s| s.extend(c.to_lowercase()))
+        // The first word (nothing written to `out` yet) is lowercased; every
+        // later word is capitalized. The no-op boundary callback adds no separator.
+        ::transform(self, |s, out| {
+            if out.is_empty() { ::lowercase(s, out); }
+            else { ::capitalize(s, out) }
+        }, |_| {})
    }
 }

@ -46,6 +49,8 @@ mod tests {
    t!(test5: "kebab-case" => "kebabCase");
    t!(test6: "SHOUTY_SNAKE_CASE" => "shoutySnakeCase");
    t!(test7: "snake_case" => "snakeCase");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "thisContainsAllKindsOfWordBoundaries");
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "thisContainsAllKindsOfWordBoundaries");
+    t!(test9: "XΣXΣ baﬄe" => "xσxςBaﬄe");
+    t!(test10: "XMLHttpRequest" => "xmlHttpRequest");
    // TODO unicode tests
 }
--- a/src/shouty_snake.rs
+++ b/src/shouty_snake.rs
@ -37,10 +37,7 @@ impl<T: ShoutySnakeCase> ShoutySnekCase for T {

 impl ShoutySnakeCase for str {
    fn to_shouty_snake_case(&self) -> Self::Owned {
-        ::transform(self, |c, s| {
-            s.push('_');
-            s.extend(c.to_uppercase())
-        }, |c, s| s.extend(c.to_uppercase()))
+        // Uppercase each word and push a `_` before every word except the first.
+        ::transform(self, ::uppercase, |s| s.push('_'))
    }
 }

@ -64,6 +61,7 @@ mod tests {
    t!(test5: "kebab-case" => "KEBAB_CASE");
    t!(test6: "SHOUTY_SNAKE_CASE" => "SHOUTY_SNAKE_CASE");
    t!(test7: "snake_case" => "SNAKE_CASE");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "THIS_CONTAINS_ALL_KINDS_OF_WORD_BOUNDARIES");
-    // TODO unicode tests
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "THIS_CONTAINS_ALL_KINDS_OF_WORD_BOUNDARIES");
+    t!(test9: "XΣXΣ baﬄe" => "XΣXΣ_BAFFLE");
+    t!(test10: "XMLHttpRequest" => "XML_HTTP_REQUEST");
 }
--- a/src/snake.rs
+++ b/src/snake.rs
@ -34,10 +34,7 @@ impl<T: SnakeCase> SnekCase for T {

 impl SnakeCase for str {
    fn to_snake_case(&self) -> String {
-        ::transform(self, |c, s| {
-            s.push('_');
-            s.extend(c.to_lowercase())
-        }, |c, s| s.extend(c.to_lowercase()))
+        // Lowercase each word and push a `_` before every word except the first.
+        ::transform(self, ::lowercase, |s| s.push('_'))
    }
 }

@ -56,11 +53,12 @@ mod tests {

    t!(test1: "CamelCase" => "camel_case");
    t!(test2: "This is Human case." => "this_is_human_case");
-    t!(test3: "MixedUp CamelCase, with some Spaces" => "mixed_up_camel_case_with_some_spaces");
+    t!(test3: "MixedUP CamelCase, with some Spaces" => "mixed_up_camel_case_with_some_spaces");
    t!(test4: "mixed_up snake_case with some _spaces" => "mixed_up_snake_case_with_some_spaces");
    t!(test5: "kebab-case" => "kebab_case");
    t!(test6: "SHOUTY_SNAKE_CASE" => "shouty_snake_case");
    t!(test7: "snake_case" => "snake_case");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "this_contains_all_kinds_of_word_boundaries");
-    // TODO unicode tests
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "this_contains_all_kinds_of_word_boundaries");
+    t!(test9: "XΣXΣ baﬄe" => "xσxς_baﬄe");
+    t!(test10: "XMLHttpRequest" => "xml_http_request");
 }
--- a/src/title.rs
+++ b/src/title.rs
@ -22,16 +22,7 @@ pub trait TitleCase: ToOwned {

 impl TitleCase for str {
    fn to_title_case(&self) -> String {
-        ::transform(self, |c, s| {
-            s.push(' ');
-            s.extend(c.to_uppercase())
-        }, |c, s| {
-            if s.len() == 0 {
-                s.extend(c.to_uppercase())
-            } else {
-                s.extend(c.to_lowercase())
-            }
-        })
+        // Capitalize each word and push a space before every word except the first.
+        ::transform(self, ::capitalize, |s| s.push(' '))
    }
 }

@ -55,6 +46,7 @@ mod tests {
    t!(test5: "kebab-case" => "Kebab Case");
    t!(test6: "SHOUTY_SNAKE_CASE" => "Shouty Snake Case");
    t!(test7: "snake_case" => "Snake Case");
-    t!(test8: "this-contains_ ALLkinds OfWord_Boundaries" => "This Contains All Kinds Of Word Boundaries");
-    // TODO unicode tests
+    t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "This Contains All Kinds Of Word Boundaries");
+    t!(test9: "XΣXΣ baﬄe" => "Xσxς Baﬄe");
+    t!(test10: "XMLHttpRequest" => "Xml Http Request");
 }