From fc64f09f0b2016c88bf93308c5e81c9509c2027d Mon Sep 17 00:00:00 2001
From: Delyan Angelov <delian66@gmail.com>
Date: Mon, 30 May 2022 21:56:39 +0300
Subject: [PATCH] crypto.md5: improve performance of md5.block_generic

---
 vlib/crypto/md5/md5block_generic.v | 42 ++++++++++++++++--------------
 vlib/encoding/binary/binary.v      | 24 ++++++++---------
 vlib/math/bits/bits.v              | 27 ++++++++++---------
 3 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/vlib/crypto/md5/md5block_generic.v b/vlib/crypto/md5/md5block_generic.v
index f4020a177..da7ebad16 100644
--- a/vlib/crypto/md5/md5block_generic.v
+++ b/vlib/crypto/md5/md5block_generic.v
@@ -9,8 +9,14 @@
 module md5
 
 import math.bits
-import encoding.binary
 
+[direct_array_access; inline]
+fn get_le_u32(b []u8, start int) u32 {
+	return u32(b[start]) | (u32(b[1 + start]) << u32(8)) | (u32(b[2 + start]) << u32(16)) | (u32(b[
+		3 + start]) << u32(24))
+}
+
+[direct_array_access]
 fn block_generic(mut dig Digest, p []u8) {
 	// load state
 	mut a := dig.s[0]
@@ -19,8 +25,6 @@ fn block_generic(mut dig Digest, p []u8) {
 	mut d := dig.s[3]
 
 	for i := 0; i <= p.len - block_size; i += block_size {
-		mut q := p[i..]
-		q = q[..block_size]
 		// save current state
 		aa := a
 		bb := b
@@ -28,22 +32,22 @@ fn block_generic(mut dig Digest, p []u8) {
 		dd := d
 
 		// load input block
-		x0 := binary.little_endian_u32(q[4 * 0x0..])
-		x1 := binary.little_endian_u32(q[4 * 0x1..])
-		x2 := binary.little_endian_u32(q[4 * 0x2..])
-		x3 := binary.little_endian_u32(q[4 * 0x3..])
-		x4 := binary.little_endian_u32(q[4 * 0x4..])
-		x5 := binary.little_endian_u32(q[4 * 0x5..])
-		x6 := binary.little_endian_u32(q[4 * 0x6..])
-		x7 := binary.little_endian_u32(q[4 * 0x7..])
-		x8 := binary.little_endian_u32(q[4 * 0x8..])
-		x9 := binary.little_endian_u32(q[4 * 0x9..])
-		xa := binary.little_endian_u32(q[4 * 0xa..])
-		xb := binary.little_endian_u32(q[4 * 0xb..])
-		xc := binary.little_endian_u32(q[4 * 0xc..])
-		xd := binary.little_endian_u32(q[4 * 0xd..])
-		xe := binary.little_endian_u32(q[4 * 0xe..])
-		xf := binary.little_endian_u32(q[4 * 0xf..])
+		x0 := get_le_u32(p, 4 * 0x0 + i)
+		x1 := get_le_u32(p, 4 * 0x1 + i)
+		x2 := get_le_u32(p, 4 * 0x2 + i)
+		x3 := get_le_u32(p, 4 * 0x3 + i)
+		x4 := get_le_u32(p, 4 * 0x4 + i)
+		x5 := get_le_u32(p, 4 * 0x5 + i)
+		x6 := get_le_u32(p, 4 * 0x6 + i)
+		x7 := get_le_u32(p, 4 * 0x7 + i)
+		x8 := get_le_u32(p, 4 * 0x8 + i)
+		x9 := get_le_u32(p, 4 * 0x9 + i)
+		xa := get_le_u32(p, 4 * 0xa + i)
+		xb := get_le_u32(p, 4 * 0xb + i)
+		xc := get_le_u32(p, 4 * 0xc + i)
+		xd := get_le_u32(p, 4 * 0xd + i)
+		xe := get_le_u32(p, 4 * 0xe + i)
+		xf := get_le_u32(p, 4 * 0xf + i)
 
 		// round 1
 		a = b + bits.rotate_left_32((((c ^ d) & b) ^ d) + a + x0 + u32(0xd76aa478), 7)
diff --git a/vlib/encoding/binary/binary.v b/vlib/encoding/binary/binary.v
index 1aa8c2eef..5f5478ee1 100644
--- a/vlib/encoding/binary/binary.v
+++ b/vlib/encoding/binary/binary.v
@@ -4,26 +4,26 @@
 module binary
 
 // Little Endian
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_u16(b []u8) u16 {
 	_ = b[1] // bounds check
 	return u16(b[0]) | (u16(b[1]) << u16(8))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_put_u16(mut b []u8, v u16) {
 	_ = b[1] // bounds check
 	b[0] = u8(v)
 	b[1] = u8(v >> u16(8))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_u32(b []u8) u32 {
 	_ = b[3] // bounds check
 	return u32(b[0]) | (u32(b[1]) << u32(8)) | (u32(b[2]) << u32(16)) | (u32(b[3]) << u32(24))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_put_u32(mut b []u8, v u32) {
 	_ = b[3] // bounds check
 	b[0] = u8(v)
@@ -32,13 +32,13 @@ pub fn little_endian_put_u32(mut b []u8, v u32) {
 	b[3] = u8(v >> u32(24))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_u64(b []u8) u64 {
 	_ = b[7] // bounds check
 	return u64(b[0]) | (u64(b[1]) << u64(8)) | (u64(b[2]) << u64(16)) | (u64(b[3]) << u64(24)) | (u64(b[4]) << u64(32)) | (u64(b[5]) << u64(40)) | (u64(b[6]) << u64(48)) | (u64(b[7]) << u64(56))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn little_endian_put_u64(mut b []u8, v u64) {
 	_ = b[7] // bounds check
 	b[0] = u8(v)
@@ -52,26 +52,26 @@ pub fn little_endian_put_u64(mut b []u8, v u64) {
 }
 
 // Big Endian
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_u16(b []u8) u16 {
 	_ = b[1] // bounds check
 	return u16(b[1]) | (u16(b[0]) << u16(8))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_put_u16(mut b []u8, v u16) {
 	_ = b[1] // bounds check
 	b[0] = u8(v >> u16(8))
 	b[1] = u8(v)
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_u32(b []u8) u32 {
 	_ = b[3] // bounds check
 	return u32(b[3]) | (u32(b[2]) << u32(8)) | (u32(b[1]) << u32(16)) | (u32(b[0]) << u32(24))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_put_u32(mut b []u8, v u32) {
 	_ = b[3] // bounds check
 	b[0] = u8(v >> u32(24))
@@ -80,13 +80,13 @@ pub fn big_endian_put_u32(mut b []u8, v u32) {
 	b[3] = u8(v)
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_u64(b []u8) u64 {
 	_ = b[7] // bounds check
 	return u64(b[7]) | (u64(b[6]) << u64(8)) | (u64(b[5]) << u64(16)) | (u64(b[4]) << u64(24)) | (u64(b[3]) << u64(32)) | (u64(b[2]) << u64(40)) | (u64(b[1]) << u64(48)) | (u64(b[0]) << u64(56))
 }
 
-[inline]
+[direct_array_access; inline]
 pub fn big_endian_put_u64(mut b []u8, v u64) {
 	_ = b[7] // bounds check
 	b[0] = u8(v >> u64(56))
diff --git a/vlib/math/bits/bits.v b/vlib/math/bits/bits.v
index b75e2e549..65753b0e1 100644
--- a/vlib/math/bits/bits.v
+++ b/vlib/math/bits/bits.v
@@ -139,6 +139,13 @@ pub fn ones_count_64(x u64) int {
 	return int(y) & ((1 << 7) - 1)
 }
 
+const (
+	n8  = u8(8)
+	n16 = u16(16)
+	n32 = u32(32)
+	n64 = u64(64)
+)
+
 // --- RotateLeft ---
 // rotate_left_8 returns the value of x rotated left by (k mod 8) bits.
 // To rotate x right by k bits, call rotate_left_8(x, -k).
@@ -146,9 +153,8 @@ pub fn ones_count_64(x u64) int {
 // This function's execution time does not depend on the inputs.
 [inline]
 pub fn rotate_left_8(x u8, k int) u8 {
-	n := u8(8)
-	s := u8(k) & (n - u8(1))
-	return (x << s) | (x >> (n - s))
+	s := u8(k) & (bits.n8 - u8(1))
+	return (x << s) | (x >> (bits.n8 - s))
 }
 
 // rotate_left_16 returns the value of x rotated left by (k mod 16) bits.
@@ -157,9 +163,8 @@ pub fn rotate_left_8(x u8, k int) u8 {
 // This function's execution time does not depend on the inputs.
 [inline]
 pub fn rotate_left_16(x u16, k int) u16 {
-	n := u16(16)
-	s := u16(k) & (n - u16(1))
-	return (x << s) | (x >> (n - s))
+	s := u16(k) & (bits.n16 - u16(1))
+	return (x << s) | (x >> (bits.n16 - s))
 }
 
 // rotate_left_32 returns the value of x rotated left by (k mod 32) bits.
@@ -168,9 +173,8 @@ pub fn rotate_left_16(x u16, k int) u16 {
 // This function's execution time does not depend on the inputs.
 [inline]
 pub fn rotate_left_32(x u32, k int) u32 {
-	n := u32(32)
-	s := u32(k) & (n - u32(1))
-	return (x << s) | (x >> (n - s))
+	s := u32(k) & (bits.n32 - u32(1))
+	return (x << s) | (x >> (bits.n32 - s))
 }
 
 // rotate_left_64 returns the value of x rotated left by (k mod 64) bits.
@@ -179,9 +183,8 @@ pub fn rotate_left_32(x u32, k int) u32 {
 // This function's execution time does not depend on the inputs.
 [inline]
 pub fn rotate_left_64(x u64, k int) u64 {
-	n := u64(64)
-	s := u64(k) & (n - u64(1))
-	return (x << s) | (x >> (n - s))
+	s := u64(k) & (bits.n64 - u64(1))
+	return (x << s) | (x >> (bits.n64 - s))
 }
 
 // --- Reverse ---
-- 
2.30.2