Even if SSE doesn't gain us anything, going 32-bit is a clear win.

2025-02-21 03:50:28 +00:00 · 2014-02-22 18:33:47 +01:00 · 2014-02-22 18:33:47 +01:00 · 30477c2518
commit 30477c2518
parent 9d8bf3cffa
1 changed files with 23 additions and 11 deletions
--- a/rewind.c
+++ b/rewind.c
@@ -340,23 +340,35 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 {
 	const uint16_t * a_org=a;
-	
-	//Comparing two or three words makes no real difference.
-	//With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB;
-	// with three, we get larger blocks which should be a minuscle bit faster to decompress,
-	// but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that.
-	while (a[0]!=b[0] || a[1]!=b[1])
+#ifdef NO_UNALIGNED_MEM
+	if ((uintptr_t)a & (sizeof(uint32_t)-1) && *a!=*b)
 	{
 		a++;
 		b++;
-		//Optimize this by only checking one at the time for as long as possible.
-		while (*a!=*b)
+	}
+	if (*a!=*b)
+#endif
+	{
+		//With this, it's random whether two consecutive identical words are caught.
+		//Luckily, compression rate is the same for both cases, and three is always caught.
+		//(We prefer to miss two-word blocks, anyways; fewer iterations of the outer loop, as well as in the decompressor.)
+		const uint32_t* a_big=(const uint32_t*)a;
+		const uint32_t* b_big=(const uint32_t*)b;
+		
+		while (*a_big!=*b_big)
 		{
-			a++;
-			b++;
+			a_big++;
+			b_big++;
+		}
+		a=(const uint16_t*)a_big;
+		b=(const uint16_t*)b_big;
+		
+		if (a!=a_org && a[-1]==b[-1])
+		{
+			a--;
+			b--;
 		}
 	}
-	
 	return a-a_org;
 }
 #endif