Bug 1931957 - Update sha2 to 0.10.8 r=supply-chain-reviewers,keeler

This gives a big speed improvement on aarch64:
https://github.com/RustCrypto/hashes/pull/490#issue-1757989147

Differential Revision: https://phabricator.services.mozilla.com/D229400
This commit is contained in:
Jeff Muizelaar 2024-11-18 21:38:19 +00:00
parent 14a17176f8
commit 7cdbac8ef2
16 changed files with 932 additions and 22 deletions

8
Cargo.lock generated
View File

@ -1494,9 +1494,9 @@ dependencies = [
[[package]]
name = "digest"
version = "0.10.6"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
@ -5659,9 +5659,9 @@ dependencies = [
[[package]]
name = "sha2"
version = "0.10.6"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
"cfg-if",
"cpufeatures",

View File

@ -4356,6 +4356,16 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"
delta = "0.10.2 -> 0.10.6"
[[audits.sha2]]
who = "Jeff Muizelaar <jmuizelaar@mozilla.com>"
criteria = "safe-to-deploy"
delta = "0.10.6 -> 0.10.8"
notes = """
The bulk of this is https://github.com/RustCrypto/hashes/pull/490 which adds aarch64 support along with another PR adding longson.
I didn't check the implementation thoroughly but there wasn't anything obviously nefarious. 0.10.8 has been out for more than a year
which suggests no one else has found anything either.
"""
[[audits.sha3]]
who = "Simon Friedberger <simon@mozilla.com>"
criteria = "safe-to-deploy"

View File

@ -1607,6 +1607,11 @@ who = "David Cook <dcook@divviup.org>"
criteria = "safe-to-deploy"
version = "0.9.0"
[[audits.isrg.audits.digest]]
who = "David Cook <dcook@divviup.org>"
criteria = "safe-to-deploy"
delta = "0.10.6 -> 0.10.7"
[[audits.isrg.audits.getrandom]]
who = "Tim Geoghegan <timg@letsencrypt.org>"
criteria = "safe-to-deploy"

View File

@ -1 +1 @@
{"files":{"CHANGELOG.md":"cba0482b4328c05f545e94d6fea5d068b8c2e8c27abec3851b8fb567c6a0f562","Cargo.toml":"be0df25f7235deb18a52323de163e63bd5aefe4ad91ed276022d4757ccddeece","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"9e0dfd2dd4173a530e238cb6adb37aa78c34c6bc7444e0e10c1ab5d8881f63ba","README.md":"edf9f16c57466b06d201b8646182b7332324c7aba28f832dde7f57d03249637d","src/core_api.rs":"b52728aba8a84f980f3f9cc8a94a64d3a97f1eb5f4db144904822c2f8eefb1f8","src/core_api/ct_variable.rs":"703bd62fb693a437e319d1192988bd674f9127a6b76f73b4c58c71afc79bc013","src/core_api/rt_variable.rs":"b57f89bf3991a313e2ddde09c701375e23539e7df74d685a161707ba1fbc99e4","src/core_api/wrapper.rs":"033777bed7d140b158e15d50fda8a6e06557ce89bd0738fcca692be2c39e8b8a","src/core_api/xof_reader.rs":"f33ca7b2c17eb99d84ea460d5567af68690e4fa6c2d94069a5d6748f8c8620eb","src/dev.rs":"cbaeab07489efcadec917d7b7bcf2fdade79e78a4839ab3c3d8ad442f8f82833","src/dev/fixed.rs":"1cbabc651645c1e781d31825791132b4e3741f426e99d7e40988e2a5ee49bddd","src/dev/mac.rs":"e8837d3b99dc8b6ddb398e7fad5731c2ed36931f851ed625d3ae59fb31244165","src/dev/rng.rs":"156f42e9eb8fb2083cd12dc4a9bff9d57a321d33367efe6cd42cdc02c17ed2dc","src/dev/variable.rs":"51939602b43f5a813fc725bc603a34246bbf76facaa7930cb7bf78c283ec94a7","src/dev/xof.rs":"b3971175e50f615247e4158cba87d77c369461eda22751d888725cec45b61985","src/digest.rs":"8beab74640774c9f6811daa6dac9b5a8867f5beeb0b552a9b5ddbc5cfc196ed0","src/lib.rs":"5128199102bf0f7638fba0bbcf42b23822e31065841fb0c4304b64f681fde961","src/mac.rs":"6303caa2c5b76513346c082dd600e007354179ad440fc83dad3d7f4240281803"},"package":"8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"}
{"files":{"CHANGELOG.md":"e89e1b904ddcb022d2413bf6eb0cc14418c0177f669c4da5520c2566ebb3800c","Cargo.toml":"f0cc2fc18c6e7f3533658b9e59d71b55e641e7d8226e5d1b915a5e7c0a1d3f36","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"9e0dfd2dd4173a530e238cb6adb37aa78c34c6bc7444e0e10c1ab5d8881f63ba","README.md":"edf9f16c57466b06d201b8646182b7332324c7aba28f832dde7f57d03249637d","src/core_api.rs":"b52728aba8a84f980f3f9cc8a94a64d3a97f1eb5f4db144904822c2f8eefb1f8","src/core_api/ct_variable.rs":"703bd62fb693a437e319d1192988bd674f9127a6b76f73b4c58c71afc79bc013","src/core_api/rt_variable.rs":"b57f89bf3991a313e2ddde09c701375e23539e7df74d685a161707ba1fbc99e4","src/core_api/wrapper.rs":"033777bed7d140b158e15d50fda8a6e06557ce89bd0738fcca692be2c39e8b8a","src/core_api/xof_reader.rs":"f33ca7b2c17eb99d84ea460d5567af68690e4fa6c2d94069a5d6748f8c8620eb","src/dev.rs":"cbaeab07489efcadec917d7b7bcf2fdade79e78a4839ab3c3d8ad442f8f82833","src/dev/fixed.rs":"1cbabc651645c1e781d31825791132b4e3741f426e99d7e40988e2a5ee49bddd","src/dev/mac.rs":"e8837d3b99dc8b6ddb398e7fad5731c2ed36931f851ed625d3ae59fb31244165","src/dev/rng.rs":"156f42e9eb8fb2083cd12dc4a9bff9d57a321d33367efe6cd42cdc02c17ed2dc","src/dev/variable.rs":"51939602b43f5a813fc725bc603a34246bbf76facaa7930cb7bf78c283ec94a7","src/dev/xof.rs":"b3971175e50f615247e4158cba87d77c369461eda22751d888725cec45b61985","src/digest.rs":"8beab74640774c9f6811daa6dac9b5a8867f5beeb0b552a9b5ddbc5cfc196ed0","src/lib.rs":"5128199102bf0f7638fba0bbcf42b23822e31065841fb0c4304b64f681fde961","src/mac.rs":"6303caa2c5b76513346c082dd600e007354179ad440fc83dad3d7f4240281803"},"package":"9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"}

View File

@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## 0.10.7 (2023-05-19)
### Changed
- Loosen `subtle` version requirement ([#1260])
[#1260]: https://github.com/RustCrypto/traits/pull/1260
## 0.10.6 (2022-11-17)
### Added
- `Mac::verify_reset` and `Mac::verify_slice_reset` methods ([#1154])

View File

@ -12,7 +12,7 @@
[package]
edition = "2018"
name = "digest"
version = "0.10.6"
version = "0.10.7"
authors = ["RustCrypto Developers"]
description = "Traits for cryptographic hash functions and message authentication codes"
documentation = "https://docs.rs/digest"
@ -52,7 +52,7 @@ optional = true
version = "0.1.3"
[dependencies.subtle]
version = "=2.4"
version = "2.4"
optional = true
default-features = false

View File

@ -1 +1 @@
{"files":{"CHANGELOG.md":"604cc546b683e035e1f759479e41401c9035e7da6e07808f2cbd7702a07a5d26","Cargo.toml":"a6172879ad5aa1b7a6593e57db16c3d99c00563076239bf094b0d0f5ed6b30f8","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"b4eb00df6e2a4d22518fcaa6a2b4646f249b3a3c9814509b22bd2091f1392ff1","README.md":"b7af562922e4a631657acf264772d2af2b72a08d9bbc5fbcf56d9324f9027708","benches/mod.rs":"c32d9f91a541821ea988c14eee710963e623ef1edf69b02b41a29bc44e04ba95","src/consts.rs":"2f820349fa7cbf9fecc1d4aabbd1a721bb1badc3f32ef9e903826960b6f42523","src/core_api.rs":"73b160d98bfa6737688875ad73da5e3c2c93582604dc313d208200e12fdab676","src/lib.rs":"a286546dab99a51bdb3a5dc4edbd08fb9a57028cb422151f3a97441d113d7425","src/sha256.rs":"cfc2b62a412112e471781a770793f0ba0466594b2e37001334562f3d95f340ce","src/sha256/aarch64.rs":"02dbac483409a853126fec642f964a464e4372f53da2fa4120b29bed204f72b7","src/sha256/soft.rs":"98e765a8e8dfa0af31f2b76570f212e6b3099522bf300e1554cbbd9fd5d02960","src/sha256/x86.rs":"70f1597f2029522b35bfd026df0a8908f086523ab2a80ba3ef35e6231b56353c","src/sha512.rs":"92c4210a627b78505a195722b2f24bac5e6cfdece6292bf184ba8d42e7e2c35f","src/sha512/soft.rs":"0183ad89418b886859d2afa9bf061bc92759ae337c1d26147b4300042e63ef42","src/sha512/x86.rs":"c7dd8bdf3212e1e8c4cc9cc6b380dc0468f79dcfd0f61a445d0d38cead45a03a","tests/data/sha224.blb":"59b185972521af418fd49a079de3d5f5bed74cd76d80473da51cab3faee6c7d0","tests/data/sha256.blb":"bb096934bb7e43e41ce143d211397afca6fcdfe243a39811688ea31aae6f800a","tests/data/sha384.blb":"e8fe66c07ba336fae2c0aa4c87cb768f41bd4ed318ee1a36fbde0a68581946ec","tests/data/sha512.blb":"1cc0e86571f2f4e3bc81438ce7b6c25c118d2d7437355240113f59cbb782c8d6","tests/data/sha512_224.blb":"b02dd46741db1034112e0888d0cdb233a21b9a82c319456f806bbaae49acf440","tests/data/sha512_256.blb":"95195b758e362d92ff0cebebac4cca696512ea5811b635243bc70e29164e5786","tests/mod.rs":"61be596fd9b45a8db345950ff2ed6f87eaf4d239ac156885f36e819da0597644"},"package":"82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"}
{"files":{"CHANGELOG.md":"b7b0a14409ac2880f86fe50d9584acc81f2346ebcb4e46a9e2235b54ac5b02ef","Cargo.toml":"5fdf94b86fc47d105d2f2cc55c6346d15e7f3d2d7ea92031b1ce2d24276e7778","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"b4eb00df6e2a4d22518fcaa6a2b4646f249b3a3c9814509b22bd2091f1392ff1","README.md":"b7af562922e4a631657acf264772d2af2b72a08d9bbc5fbcf56d9324f9027708","benches/mod.rs":"c32d9f91a541821ea988c14eee710963e623ef1edf69b02b41a29bc44e04ba95","src/consts.rs":"2f820349fa7cbf9fecc1d4aabbd1a721bb1badc3f32ef9e903826960b6f42523","src/core_api.rs":"73b160d98bfa6737688875ad73da5e3c2c93582604dc313d208200e12fdab676","src/lib.rs":"9d0ec0ba86a801bd9b2024f0b84ee322a26c7376a623dd61210e0eb9d6355aa1","src/sha256.rs":"78e84eea5d517554aa5a10860bf2ce5013ca26d529e78643cd59062546e0746f","src/sha256/aarch64.rs":"18121a25867a575fec8ef64da763693ece4e3e3e84da095254b8471234c6f1f8","src/sha256/loongarch64_asm.rs":"79e2d5e3c039581e2319f8789de9ed13a8dd819ebffd13532dbd83448c7ad662","src/sha256/soft.rs":"98e765a8e8dfa0af31f2b76570f212e6b3099522bf300e1554cbbd9fd5d02960","src/sha256/x86.rs":"70f1597f2029522b35bfd026df0a8908f086523ab2a80ba3ef35e6231b56353c","src/sha512.rs":"1b19c23c63e9cfca8b42fd9e108a8570dd03e22a37d4d6f499f2fa5e566cb2de","src/sha512/aarch64.rs":"2ed929329a0fa66180e4726d028713a49f99cc223e635078fc1f3252a44981e0","src/sha512/loongarch64_asm.rs":"58a7b54d95a0e037ba80570d96ffe0dd7c0014c7fcb45b90725e522cc4992d8a","src/sha512/soft.rs":"0183ad89418b886859d2afa9bf061bc92759ae337c1d26147b4300042e63ef42","src/sha512/x86.rs":"c7dd8bdf3212e1e8c4cc9cc6b380dc0468f79dcfd0f61a445d0d38cead45a03a","tests/data/sha224.blb":"59b185972521af418fd49a079de3d5f5bed74cd76d80473da51cab3faee6c7d0","tests/data/sha256.blb":"bb096934bb7e43e41ce143d211397afca6fcdfe243a39811688ea31aae6f800a","tests/data/sha384.blb":"e8fe66c07ba336fae2c0aa4c87cb768f41bd4ed318ee1a36fbde0a68581946ec","tests/data/sha512.blb":"1cc0e86571f2f4e3bc81438ce7b6c25c118d2d7437355240113f59cbb782c8d6","tests/data/sha512_224.blb":"b02dd46741db1034112e0888d0cdb233a21b9a82c319456f806bbaae49acf440","tests/data/sha512_256.blb":"95195b758e362d92ff0cebebac4cca696512ea5811b635243bc70e29164e5786","tests/mod.rs":"61be596fd9b45a8db345950ff2ed6f87eaf4d239ac156885f36e819da0597644"},"package":"793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"}

View File

@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## 0.10.8 (2023-09-26)
### Added
- `asm!`-based backend for LoongArch64 targets gated behind `loongarch64_asm` feature [#507]
[#507]: https://github.com/RustCrypto/hashes/pull/507
## 0.10.7 (2023-06-15)
### Added
- AArch64 Neon-based backend ([#490])
[#490]: https://github.com/RustCrypto/hashes/pull/490
## 0.10.6 (2022-09-16)
### Added
- Feature-gated OID support ([#405])

View File

@ -3,35 +3,51 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "sha2"
version = "0.10.6"
version = "0.10.8"
authors = ["RustCrypto Developers"]
description = "Pure Rust implementation of the SHA-2 hash function family\nincluding SHA-224, SHA-256, SHA-384, and SHA-512.\n"
description = """
Pure Rust implementation of the SHA-2 hash function family
including SHA-224, SHA-256, SHA-384, and SHA-512.
"""
documentation = "https://docs.rs/sha2"
readme = "README.md"
keywords = ["crypto", "sha2", "hash", "digest"]
categories = ["cryptography", "no-std"]
keywords = [
"crypto",
"sha2",
"hash",
"digest",
]
categories = [
"cryptography",
"no-std",
]
license = "MIT OR Apache-2.0"
repository = "https://github.com/RustCrypto/hashes"
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]
rustdoc-args = [
"--cfg",
"docsrs",
]
[dependencies.cfg-if]
version = "1.0"
[dependencies.digest]
version = "0.10.4"
version = "0.10.7"
[dev-dependencies.digest]
version = "0.10.4"
version = "0.10.7"
features = ["dev"]
[dev-dependencies.hex-literal]
@ -43,8 +59,10 @@ asm-aarch64 = ["asm"]
compress = []
default = ["std"]
force-soft = []
loongarch64_asm = []
oid = ["digest/oid"]
std = ["digest/std"]
[target."cfg(any(target_arch = \"aarch64\", target_arch = \"x86_64\", target_arch = \"x86\"))".dependencies.cpufeatures]
version = "0.2"

View File

@ -6,7 +6,8 @@
//! Algorithmically, there are only 2 core algorithms: SHA-256 and SHA-512.
//! All other algorithms are just applications of these with different initial
//! hash values, and truncated to different digest bit lengths. The first two
//! algorithms in the list are based on SHA-256, while the last three on SHA-512.
//! algorithms in the list are based on SHA-256, while the last four are based
//! on SHA-512.
//!
//! # Usage
//!

View File

@ -17,6 +17,9 @@ cfg_if::cfg_if! {
mod soft;
mod aarch64;
use aarch64::compress;
} else if #[cfg(all(feature = "loongarch64_asm", target_arch = "loongarch64"))] {
mod loongarch64_asm;
use loongarch64_asm::compress;
} else {
mod soft;
use soft::compress;

View File

@ -1,15 +1,159 @@
//! SHA-256 `aarch64` backend.
// Implementation adapted from mbedtls.
// TODO: stdarch intrinsics: RustCrypto/hashes#257
use core::arch::{aarch64::*, asm};
use crate::consts::K32;
cpufeatures::new!(sha2_hwcap, "sha2");
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
// after stabilization
if sha2_hwcap::get() {
sha2_asm::compress256(state, blocks);
unsafe { sha256_compress(state, blocks) }
} else {
super::soft::compress(state, blocks);
}
}
#[target_feature(enable = "sha2")]
unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
// SAFETY: Requires the sha2 feature.
// Load state into vectors.
let mut abcd = vld1q_u32(state[0..4].as_ptr());
let mut efgh = vld1q_u32(state[4..8].as_ptr());
// Iterate through the message blocks.
for block in blocks {
// Keep original state values.
let abcd_orig = abcd;
let efgh_orig = efgh;
// Load the message block into vectors, assuming little endianness.
let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
// Rounds 0 to 3
let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
let mut abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds 4 to 7
tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds 8 to 11
tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds 12 to 15
tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
for t in (16..64).step_by(16) {
// Rounds t to t + 3
s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds t + 4 to t + 7
s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds t + 8 to t + 11
s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
// Rounds t + 12 to t + 15
s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
abcd_prev = abcd;
abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
}
// Add the block-specific state to the original state.
abcd = vaddq_u32(abcd, abcd_orig);
efgh = vaddq_u32(efgh, efgh_orig);
}
// Store vectors into state.
vst1q_u32(state[0..4].as_mut_ptr(), abcd);
vst1q_u32(state[4..8].as_mut_ptr(), efgh);
}
// TODO remove these polyfills once SHA2 intrinsics land
#[inline(always)]
unsafe fn vsha256hq_u32(
mut hash_efgh: uint32x4_t,
hash_abcd: uint32x4_t,
wk: uint32x4_t,
) -> uint32x4_t {
asm!(
"SHA256H {:q}, {:q}, {:v}.4S",
inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
options(pure, nomem, nostack, preserves_flags)
);
hash_efgh
}
#[inline(always)]
unsafe fn vsha256h2q_u32(
mut hash_efgh: uint32x4_t,
hash_abcd: uint32x4_t,
wk: uint32x4_t,
) -> uint32x4_t {
asm!(
"SHA256H2 {:q}, {:q}, {:v}.4S",
inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
options(pure, nomem, nostack, preserves_flags)
);
hash_efgh
}
#[inline(always)]
unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
asm!(
"SHA256SU0 {:v}.4S, {:v}.4S",
inout(vreg) w0_3, in(vreg) w4_7,
options(pure, nomem, nostack, preserves_flags)
);
w0_3
}
#[inline(always)]
unsafe fn vsha256su1q_u32(
mut tw0_3: uint32x4_t,
w8_11: uint32x4_t,
w12_15: uint32x4_t,
) -> uint32x4_t {
asm!(
"SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
options(pure, nomem, nostack, preserves_flags)
);
tw0_3
}

View File

@ -0,0 +1,227 @@
//! LoongArch64 assembly backend
macro_rules! c {
($($l:expr)*) => {
concat!($($l ,)*)
};
}
macro_rules! rounda {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
"ld.w $a5, $a1, (" $i " * 4);"
"revb.2h $a5, $a5;"
"rotri.w $a5, $a5, 16;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
macro_rules! roundb {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
"ld.w $a4, $sp, (((" $i " - 15) & 0xF) * 4);"
"ld.w $a5, $sp, (((" $i " - 16) & 0xF) * 4);"
"ld.w $a6, $sp, (((" $i " - 7) & 0xF) * 4);"
"add.w $a5, $a5, $a6;"
"rotri.w $a6, $a4, 18;"
"srli.w $a7, $a4, 3;"
"rotri.w $a4, $a4, 7;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.w $a5, $a5, $a4;"
"ld.w $a4, $sp, (((" $i " - 2) & 0xF) * 4);"
"rotri.w $a6, $a4, 19;"
"srli.w $a7, $a4, 10;"
"rotri.w $a4, $a4, 17;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.w $a5, $a5, $a4;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
macro_rules! roundtail {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
// Part 0
"rotri.w $a6, " $e ", 11;"
"rotri.w $a7, " $e ", 25;"
"rotri.w $a4, " $e ", 6;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"xor $a6, " $g ", " $f ";"
"ld.w $a7, $a3, " $i " * 4;"
"and $a6, $a6, " $e ";"
"xor $a6, $a6, " $g ";"
"add.w $a4, $a4, $a6;"
"add.w $a4, $a4, $a7;"
"add.w " $h ", " $h ", $a5;"
"add.w " $h ", " $h ", $a4;"
// Part 1
"add.w " $d ", " $d ", " $h ";"
// Part 2
"rotri.w $a6, " $a ", 13;"
"rotri.w $a7, " $a ", 22;"
"rotri.w $a4, " $a ", 2;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.w " $h ", " $h ", $a4;"
"or $a4, " $c ", " $b ";"
"and $a6, " $c ", " $b ";"
"and $a4, $a4, " $a ";"
"or $a4, $a4, $a6;"
"add.w " $h ", " $h ", $a4;"
"st.w $a5, $sp, ((" $i " & 0xF) * 4);"
)
};
}
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
if blocks.is_empty() {
return;
}
unsafe {
core::arch::asm!(
// Allocate scratch stack space
"addi.d $sp, $sp, -64;",
// Load state
"ld.w $t0, $a0, 0",
"ld.w $t1, $a0, 4",
"ld.w $t2, $a0, 8",
"ld.w $t3, $a0, 12",
"ld.w $t4, $a0, 16",
"ld.w $t5, $a0, 20",
"ld.w $t6, $a0, 24",
"ld.w $t7, $a0, 28",
"42:",
// Do 64 rounds of hashing
rounda!( 0, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 1, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!( 2, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!( 3, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!( 7, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
rounda!( 8, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 9, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!(10, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!(11, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!(15, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(16, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(17, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(18, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(19, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(23, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(24, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(25, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(26, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(27, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
// Update state registers
"ld.w $a4, $a0, 0", // a
"ld.w $a5, $a0, 4", // b
"ld.w $a6, $a0, 8", // c
"ld.w $a7, $a0, 12", // d
"add.w $t0, $t0, $a4",
"add.w $t1, $t1, $a5",
"add.w $t2, $t2, $a6",
"add.w $t3, $t3, $a7",
"ld.w $a4, $a0, 16", // e
"ld.w $a5, $a0, 20", // f
"ld.w $a6, $a0, 24", // g
"ld.w $a7, $a0, 28", // h
"add.w $t4, $t4, $a4",
"add.w $t5, $t5, $a5",
"add.w $t6, $t6, $a6",
"add.w $t7, $t7, $a7",
// Save updated state
"st.w $t0, $a0, 0",
"st.w $t1, $a0, 4",
"st.w $t2, $a0, 8",
"st.w $t3, $a0, 12",
"st.w $t4, $a0, 16",
"st.w $t5, $a0, 20",
"st.w $t6, $a0, 24",
"st.w $t7, $a0, 28",
// Looping over blocks
"addi.d $a1, $a1, 64",
"addi.d $a2, $a2, -1",
"bnez $a2, 42b",
// Restore stack register
"addi.d $sp, $sp, 64",
in("$a0") state,
inout("$a1") blocks.as_ptr() => _,
inout("$a2") blocks.len() => _,
in("$a3") crate::consts::K32.as_ptr(),
// Clobbers
out("$a4") _,
out("$a5") _,
out("$a6") _,
out("$a7") _,
out("$t0") _,
out("$t1") _,
out("$t2") _,
out("$t3") _,
out("$t4") _,
out("$t5") _,
out("$t6") _,
out("$t7") _,
options(preserves_flags),
);
}
}

View File

@ -15,6 +15,13 @@ cfg_if::cfg_if! {
}
mod x86;
use x86::compress;
} else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
mod soft;
mod aarch64;
use aarch64::compress;
} else if #[cfg(all(feature = "loongarch64_asm", target_arch = "loongarch64"))] {
mod loongarch64_asm;
use loongarch64_asm::compress;
} else {
mod soft;
use soft::compress;

View File

@ -0,0 +1,235 @@
// Implementation adapted from mbedtls.
use core::arch::{aarch64::*, asm};
use crate::consts::K64;
cpufeatures::new!(sha3_hwcap, "sha3");
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
// after stabilization
if sha3_hwcap::get() {
unsafe { sha512_compress(state, blocks) }
} else {
super::soft::compress(state, blocks);
}
}
#[target_feature(enable = "sha3")]
unsafe fn sha512_compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
// SAFETY: Requires the sha3 feature.
// Load state into vectors.
let mut ab = vld1q_u64(state[0..2].as_ptr());
let mut cd = vld1q_u64(state[2..4].as_ptr());
let mut ef = vld1q_u64(state[4..6].as_ptr());
let mut gh = vld1q_u64(state[6..8].as_ptr());
// Iterate through the message blocks.
for block in blocks {
// Keep original state values.
let ab_orig = ab;
let cd_orig = cd;
let ef_orig = ef;
let gh_orig = gh;
// Load the message block into vectors, assuming little endianness.
let mut s0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[0..16].as_ptr())));
let mut s1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[16..32].as_ptr())));
let mut s2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[32..48].as_ptr())));
let mut s3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[48..64].as_ptr())));
let mut s4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[64..80].as_ptr())));
let mut s5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[80..96].as_ptr())));
let mut s6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[96..112].as_ptr())));
let mut s7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[112..128].as_ptr())));
// Rounds 0 and 1
let mut initial_sum = vaddq_u64(s0, vld1q_u64(&K64[0]));
let mut sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
let mut intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds 2 and 3
initial_sum = vaddq_u64(s1, vld1q_u64(&K64[2]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds 4 and 5
initial_sum = vaddq_u64(s2, vld1q_u64(&K64[4]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds 6 and 7
initial_sum = vaddq_u64(s3, vld1q_u64(&K64[6]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
// Rounds 8 and 9
initial_sum = vaddq_u64(s4, vld1q_u64(&K64[8]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds 10 and 11
initial_sum = vaddq_u64(s5, vld1q_u64(&K64[10]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds 12 and 13
initial_sum = vaddq_u64(s6, vld1q_u64(&K64[12]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds 14 and 15
initial_sum = vaddq_u64(s7, vld1q_u64(&K64[14]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
for t in (16..80).step_by(16) {
// Rounds t and t + 1
s0 = vsha512su1q_u64(vsha512su0q_u64(s0, s1), s7, vextq_u64(s4, s5, 1));
initial_sum = vaddq_u64(s0, vld1q_u64(&K64[t]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds t + 2 and t + 3
s1 = vsha512su1q_u64(vsha512su0q_u64(s1, s2), s0, vextq_u64(s5, s6, 1));
initial_sum = vaddq_u64(s1, vld1q_u64(&K64[t + 2]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds t + 4 and t + 5
s2 = vsha512su1q_u64(vsha512su0q_u64(s2, s3), s1, vextq_u64(s6, s7, 1));
initial_sum = vaddq_u64(s2, vld1q_u64(&K64[t + 4]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds t + 6 and t + 7
s3 = vsha512su1q_u64(vsha512su0q_u64(s3, s4), s2, vextq_u64(s7, s0, 1));
initial_sum = vaddq_u64(s3, vld1q_u64(&K64[t + 6]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
// Rounds t + 8 and t + 9
s4 = vsha512su1q_u64(vsha512su0q_u64(s4, s5), s3, vextq_u64(s0, s1, 1));
initial_sum = vaddq_u64(s4, vld1q_u64(&K64[t + 8]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds t + 10 and t + 11
s5 = vsha512su1q_u64(vsha512su0q_u64(s5, s6), s4, vextq_u64(s1, s2, 1));
initial_sum = vaddq_u64(s5, vld1q_u64(&K64[t + 10]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds t + 12 and t + 13
s6 = vsha512su1q_u64(vsha512su0q_u64(s6, s7), s5, vextq_u64(s2, s3, 1));
initial_sum = vaddq_u64(s6, vld1q_u64(&K64[t + 12]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds t + 14 and t + 15
s7 = vsha512su1q_u64(vsha512su0q_u64(s7, s0), s6, vextq_u64(s3, s4, 1));
initial_sum = vaddq_u64(s7, vld1q_u64(&K64[t + 14]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
}
// Add the block-specific state to the original state.
ab = vaddq_u64(ab, ab_orig);
cd = vaddq_u64(cd, cd_orig);
ef = vaddq_u64(ef, ef_orig);
gh = vaddq_u64(gh, gh_orig);
}
// Store vectors into state.
vst1q_u64(state[0..2].as_mut_ptr(), ab);
vst1q_u64(state[2..4].as_mut_ptr(), cd);
vst1q_u64(state[4..6].as_mut_ptr(), ef);
vst1q_u64(state[6..8].as_mut_ptr(), gh);
}
// TODO remove these polyfills once SHA3 intrinsics land
#[inline(always)]
unsafe fn vsha512hq_u64(
mut hash_ed: uint64x2_t,
hash_gf: uint64x2_t,
kwh_kwh2: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512H {:q}, {:q}, {:v}.2D",
inout(vreg) hash_ed, in(vreg) hash_gf, in(vreg) kwh_kwh2,
options(pure, nomem, nostack, preserves_flags)
);
hash_ed
}
#[inline(always)]
unsafe fn vsha512h2q_u64(
mut sum_ab: uint64x2_t,
hash_c_: uint64x2_t,
hash_ab: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512H2 {:q}, {:q}, {:v}.2D",
inout(vreg) sum_ab, in(vreg) hash_c_, in(vreg) hash_ab,
options(pure, nomem, nostack, preserves_flags)
);
sum_ab
}
#[inline(always)]
unsafe fn vsha512su0q_u64(mut w0_1: uint64x2_t, w2_: uint64x2_t) -> uint64x2_t {
asm!(
"SHA512SU0 {:v}.2D, {:v}.2D",
inout(vreg) w0_1, in(vreg) w2_,
options(pure, nomem, nostack, preserves_flags)
);
w0_1
}
#[inline(always)]
unsafe fn vsha512su1q_u64(
mut s01_s02: uint64x2_t,
w14_15: uint64x2_t,
w9_10: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512SU1 {:v}.2D, {:v}.2D, {:v}.2D",
inout(vreg) s01_s02, in(vreg) w14_15, in(vreg) w9_10,
options(pure, nomem, nostack, preserves_flags)
);
s01_s02
}

View File

@ -0,0 +1,242 @@
//! LoongArch64 assembly backend
macro_rules! c {
($($l:expr)*) => {
concat!($($l ,)*)
};
}
macro_rules! rounda {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
"ld.d $a5, $a1, (" $i " * 8);"
"revb.d $a5, $a5;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
macro_rules! roundb {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
"ld.d $a4, $sp, (((" $i " - 15) & 0xF) * 8);"
"ld.d $a5, $sp, (((" $i " - 16) & 0xF) * 8);"
"ld.d $a6, $sp, (((" $i " - 7) & 0xF) * 8);"
"add.d $a5, $a5, $a6;"
"rotri.d $a6, $a4, 8;"
"srli.d $a7, $a4, 7;"
"rotri.d $a4, $a4, 1;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d $a5, $a5, $a4;"
"ld.d $a4, $sp, (((" $i " - 2) & 0xF) * 8);"
"rotri.d $a6, $a4, 61;"
"srli.d $a7, $a4, 6;"
"rotri.d $a4, $a4, 19;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d $a5, $a5, $a4;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
macro_rules! roundtail {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
// Part 0
"rotri.d $a6, " $e ", 18;"
"rotri.d $a7, " $e ", 41;"
"rotri.d $a4, " $e ", 14;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"xor $a6, " $g ", " $f ";"
"ld.d $a7, $a3, " $i " * 8;"
"and $a6, $a6, " $e ";"
"xor $a6, $a6, " $g ";"
"add.d $a4, $a4, $a6;"
"add.d $a4, $a4, $a7;"
"add.d " $h ", " $h ", $a5;"
"add.d " $h ", " $h ", $a4;"
// Part 1
"add.d " $d ", " $d ", " $h ";"
// Part 2
"rotri.d $a6, " $a ", 39;"
"rotri.d $a7, " $a ", 34;"
"rotri.d $a4, " $a ", 28;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d " $h ", " $h ", $a4;"
"or $a4, " $c ", " $b ";"
"and $a6, " $c ", " $b ";"
"and $a4, $a4, " $a ";"
"or $a4, $a4, $a6;"
"add.d " $h ", " $h ", $a4;"
"st.d $a5, $sp, ((" $i " & 0xF) * 8);"
)
};
}
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
if blocks.is_empty() {
return;
}
unsafe {
core::arch::asm!(
// Allocate scratch stack space
"addi.d $sp, $sp, -128;",
// Load state
"ld.d $t0, $a0, 0",
"ld.d $t1, $a0, 8",
"ld.d $t2, $a0, 16",
"ld.d $t3, $a0, 24",
"ld.d $t4, $a0, 32",
"ld.d $t5, $a0, 40",
"ld.d $t6, $a0, 48",
"ld.d $t7, $a0, 56",
"42:",
// Do 64 rounds of hashing
rounda!( 0, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 1, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!( 2, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!( 3, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!( 7, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
rounda!( 8, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 9, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!(10, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!(11, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!(15, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(16, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(17, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(18, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(19, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(23, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(24, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(25, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(26, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(27, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(64, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(65, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(66, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(67, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(68, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(69, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(70, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(71, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(72, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(73, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(74, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(75, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(76, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(77, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(78, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(79, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
// Update state registers
"ld.d $a4, $a0, 0", // a
"ld.d $a5, $a0, 8", // b
"ld.d $a6, $a0, 16", // c
"ld.d $a7, $a0, 24", // d
"add.d $t0, $t0, $a4",
"add.d $t1, $t1, $a5",
"add.d $t2, $t2, $a6",
"add.d $t3, $t3, $a7",
"ld.d $a4, $a0, 32", // e
"ld.d $a5, $a0, 40", // f
"ld.d $a6, $a0, 48", // g
"ld.d $a7, $a0, 56", // h
"add.d $t4, $t4, $a4",
"add.d $t5, $t5, $a5",
"add.d $t6, $t6, $a6",
"add.d $t7, $t7, $a7",
// Save updated state
"st.d $t0, $a0, 0",
"st.d $t1, $a0, 8",
"st.d $t2, $a0, 16",
"st.d $t3, $a0, 24",
"st.d $t4, $a0, 32",
"st.d $t5, $a0, 40",
"st.d $t6, $a0, 48",
"st.d $t7, $a0, 56",
// Looping over blocks
"addi.d $a1, $a1, 128",
"addi.d $a2, $a2, -1",
"bnez $a2, 42b",
// Restore stack register
"addi.d $sp, $sp, 128",
in("$a0") state,
inout("$a1") blocks.as_ptr() => _,
inout("$a2") blocks.len() => _,
in("$a3") crate::consts::K64.as_ptr(),
// Clobbers
out("$a4") _,
out("$a5") _,
out("$a6") _,
out("$a7") _,
out("$t0") _,
out("$t1") _,
out("$t2") _,
out("$t3") _,
out("$t4") _,
out("$t5") _,
out("$t6") _,
out("$t7") _,
options(preserves_flags),
);
}
}