Browse Source

lz4block: No unaligned loads/stores in ARM decoder

On ARMv7, unaligned loads and stores work fine, but I think Go still supports
ARMv5, and I believe unaligned accesses would crash on that architecture.

Performance doesn't suffer greatly (RPI4B):

name                old speed      new speed      delta
UncompressPg1661-4   136MB/s ± 2%   133MB/s ± 2%  -1.65%  (p=0.040 n=10+5)
UncompressDigits-4   262MB/s ± 2%   261MB/s ± 1%    ~     (p=0.679 n=10+5)
UncompressTwain-4    139MB/s ± 4%   136MB/s ± 3%    ~     (p=0.075 n=10+5)
UncompressRand-4     444MB/s ± 5%   451MB/s ± 0%    ~     (p=0.545 n=10+1)
greatroar 5 years ago
parent
commit
a9741b855b
1 changed file with 11 additions and 5 deletions
  1. 11 5
      internal/lz4block/decode_arm.s

+ 11 - 5
internal/lz4block/decode_arm.s

@@ -76,7 +76,9 @@ readLitlenDone:
 
 	TST        $2, src
 	MOVHU.NE.P 2(src), tmp2
-	MOVH.NE.P  tmp2, 2(dst)
+	MOVB.NE.P  tmp2, 1(dst)
+	MOVW.NE    tmp2 >> 8, tmp1
+	MOVB.NE.P  tmp1, 1(dst)
 	SUB.NE     $2, len
 
 	CMP $4, len
@@ -84,7 +86,7 @@ readLitlenDone:
 
 copyLiteralLoop:
 	// Aligned load, unaligned write.
-	SUB   $4, len
+	SUB    $4, len
 	MOVW.P 4(src), tmp1
 	MOVW   tmp1 >>  8, tmp2
 	MOVB   tmp2, 1(dst)
@@ -100,10 +102,12 @@ copyLiteralFinish:
 	// Copy remaining 0-3 bytes.
 	TST        $2, len
 	MOVHU.NE.P 2(src), tmp2
-	MOVHU.NE.P tmp2, 2(dst)
+	MOVB.NE.P  tmp2, 1(dst)
+	MOVW.NE    tmp2 >> 8, tmp1
+	MOVB.NE.P  tmp1, 1(dst)
 	TST        $1, len
 	MOVBU.NE.P 1(src), tmp1
-	MOVBU.NE.P tmp1, 1(dst)
+	MOVB.NE.P  tmp1, 1(dst)
 
 copyLiteralDone:
 	CMP src, srcend
@@ -113,7 +117,9 @@ copyLiteralDone:
 	ADD   $2, src
 	CMP   srcend, src
 	BHI   shortSrc
-	MOVHU -2(src), offset
+	MOVBU -2(src), offset
+	MOVBU -1(src), tmp1
+	ORR   tmp1 << 8, offset
 	CMP   $0, offset
 	BEQ   corrupt