diff --git a/adv-simd.h b/adv-simd.h
index e86c12ba..54eec0f1 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -494,12 +494,13 @@ inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
 /// \tparam F1 function to process 1 128-bit block
 /// \tparam F4 function to process 4 128-bit blocks
 /// \tparam W word type of the subkey table
-/// \tparam V vector type of the NEON data type
-/// \details AdvancedProcessBlocks128_6x2_NEON processes 4 and 1 NEON SIMD words
+/// \tparam V vector type of the NEON datatype
+/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
 ///   at a time.
 /// \details The subkey type is usually word32 or word64. V is the vector type and it is
 ///   usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
-///   vector type.
+///   vector type. The argument of type V (the parameter named unused) is there
+///   to avoid template argument deduction/substitution failures.
 template <typename F1, typename F4, typename W, typename V>
 inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
             const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
diff --git a/cham-simd.cpp b/cham-simd.cpp
index a503aa4a..5b9f2e85 100644
--- a/cham-simd.cpp
+++ b/cham-simd.cpp
@@ -83,7 +83,7 @@ inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpacklo_epi16(a, b);
     const __m128i r2 = _mm_unpacklo_epi16(c, d);
@@ -102,7 +102,7 @@ inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpacklo_epi16(a, b);
     const __m128i r2 = _mm_unpacklo_epi16(c, d);
@@ -121,7 +121,7 @@ inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpacklo_epi16(a, b);
     const __m128i r2 = _mm_unpacklo_epi16(c, d);
@@ -140,7 +140,7 @@ inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpacklo_epi16(a, b);
     const __m128i r2 = _mm_unpacklo_epi16(c, d);
@@ -159,7 +159,7 @@ inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpackhi_epi16(a, b);
     const __m128i r2 = _mm_unpackhi_epi16(c, d);
@@ -178,7 +178,7 @@ inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpackhi_epi16(a, b);
     const __m128i r2 = _mm_unpackhi_epi16(c, d);
@@ -197,7 +197,7 @@ inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpackhi_epi16(a, b);
     const __m128i r2 = _mm_unpackhi_epi16(c, d);
@@ -216,7 +216,7 @@ inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c
 {
     // The shuffle converts to and from little-endian for SSE. A specialized
     // CHAM implementation can avoid the shuffle by framing the data for
-    // encryption, decrementryption and benchmarks. The library cannot take the
+    // encryption, decryption and benchmarks. The library cannot take the
     // speed-up because of the byte oriented API.
     const __m128i r1 = _mm_unpackhi_epi16(a, b);
     const __m128i r2 = _mm_unpackhi_epi16(c, d);