You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							249 lines
						
					
					
						
							7.9 KiB
						
					
					
				
			
		
		
	
	
							249 lines
						
					
					
						
							7.9 KiB
						
					
					
				| // Copyright 2016 The Go Authors. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style
 | |
| // license that can be found in the LICENSE file.
 | |
| 
 | |
| // +build s390x
 | |
| 
 | |
| #include "textflag.h"
 | |
| 
 | |
| // Vector register range containing CRC-32 constants
 | |
| 
 | |
| #define CONST_PERM_LE2BE        V9 // byte-reversal mask applied with VPERM to every loaded 16-byte block
 | |
| #define CONST_R2R1              V10 // R2 (leftmost doubleword) and R1: used by the 64-byte folding loop
 | |
| #define CONST_R4R3              V11 // R4 (leftmost doubleword) and R3: used to merge V1..V4 and to fold 16-byte chunks
 | |
| #define CONST_R5                V12 // 0 (leftmost doubleword) and R5: used in the final fold
 | |
| #define CONST_RU_POLY           V13 // 0 (leftmost doubleword) and u': used for T1(x) in the final reduction
 | |
| #define CONST_CRC_POLY          V14 // 0 (leftmost doubleword) and P'(x) << 1: used for T2(x) in the final reduction
 | |
| 
 | |
| // The CRC-32 constant block contains reduction constants to fold and
 | |
| // process particular chunks of the input data stream in parallel.
 | |
| //
 | |
| // Note that the constant definitions below are extended in order to compute
 | |
| // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
 | |
| // The rightmost doubleword can be 0 to prevent contribution to the result or
 | |
| // can be multiplied by 1 to perform an XOR without the need for a separate
 | |
| // VECTOR EXCLUSIVE OR instruction.
 | |
| //
 | |
| // The polynomials used are bit-reflected:
 | |
| //
 | |
| //            IEEE: P'(x) = 0x0edb88320
 | |
| //      Castagnoli: P'(x) = 0x082f63b78
 | |
| 
 | |
| // IEEE polynomial constants
 | |
| DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
 | |
| DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask, second doubleword
 | |
| DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
 | |
| DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
 | |
| DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
 | |
| DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
 | |
| DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_R5 is zero
 | |
| DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
 | |
| DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_RU_POLY is zero
 | |
| DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
 | |
| DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_CRC_POLY is zero
 | |
| DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
 | |
| 
 | |
| GLOBL ·crcleconskp(SB), RODATA, $144 // NOTE(review): only bytes 0..95 are initialized above and loaded by VLM; confirm the 144-byte size is intentional
 | |
| 
 | |
| // Castagnoli polynomial constants
 | |
| DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
 | |
| DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask, second doubleword
 | |
| DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
 | |
| DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
 | |
| DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
 | |
| DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
 | |
| DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_R5 is zero
 | |
| DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
 | |
| DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_RU_POLY is zero
 | |
| DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
 | |
| DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 // leftmost doubleword of CONST_CRC_POLY is zero
 | |
| DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
 | |
| 
 | |
| GLOBL ·crccleconskp(SB), RODATA, $144 // NOTE(review): only bytes 0..95 are initialized above and loaded by VLM; confirm the 144-byte size is intentional
 | |
| 
 | |
| // func hasVectorFacility() bool
 | |
| TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 // 24-byte frame receives the facility list; 1-byte bool result
 | |
| 	MOVD  $x-24(SP), R1 // R1 = address of the 24-byte scratch area
 | |
| 	XC    $24, 0(R1), 0(R1) // clear the storage
 | |
| 	MOVD  $2, R0            // R0 is the number of double words stored -1
 | |
| 	WORD  $0xB2B01000       // STFLE 0(R1)
 | |
| 	XOR   R0, R0            // reset the value of R0
 | |
| 	MOVBZ z-8(SP), R1 // load byte 16 of the stored facility list (facility bits 128..135)
 | |
| 	AND   $0x40, R1 // isolate facility bit 129, the vector facility
 | |
| 	BEQ   novector // bit clear: vector facility not installed
 | |
| 
 | |
| vectorinstalled:
 | |
| 	// check if the vector instruction has been enabled
 | |
| 	VLEIB  $0, $0xF, V16 // write 0xF into byte element 0 of V16
 | |
| 	VLGVB  $0, V16, R1 // read byte element 0 back into R1
 | |
| 	CMPBNE R1, $0xF, novector // value did not round-trip: report no vector support
 | |
| 	MOVB   $1, ret+0(FP)      // have vx
 | |
| 	RET
 | |
| 
 | |
| novector:
 | |
| 	MOVB $0, ret+0(FP) // no vx
 | |
| 	RET
 | |
| 
 | |
| // The CRC-32 function(s) use these calling conventions:
 | |
| //
 | |
| // Parameters:
 | |
| //
 | |
| //      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
 | |
| //      R3:    Input buffer pointer, performance might be improved if the
 | |
| //             buffer is on a doubleword boundary.
 | |
| //      R4:    Length of the buffer, must be 64 bytes or greater.
 | |
| //
 | |
| // Register usage:
 | |
| //
 | |
| //      R5:     CRC-32 constant pool base pointer.
 | |
| //      V0:     Initial CRC value and intermediate constants and results.
 | |
| //      V1..V4: Data for CRC computation.
 | |
| //      V5..V8: Next data chunks that are fetched from the input buffer.
 | |
| //
 | |
| //      V9..V14: CRC-32 constants.
 | |
| 
 | |
| // func vectorizedIEEE(crc uint32, p []byte) uint32
 | |
| TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 // no local frame; arguments are read from the caller's frame
 | |
| 	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
 | |
| 	MOVD  p+8(FP), R3      // data pointer
 | |
| 	MOVD  p_len+16(FP), R4 // len(p)
 | |
| 
 | |
| 	MOVD $·crcleconskp(SB), R5 // R5: IEEE constant pool base pointer
 | |
| 	BR   vectorizedBody<>(SB) // branch (not a call) to the shared body, which stores ret and returns
 | |
| 
 | |
| // func vectorizedCastagnoli(crc uint32, p []byte) uint32
 | |
| TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 // no local frame; arguments are read from the caller's frame
 | |
| 	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
 | |
| 	MOVD  p+8(FP), R3      // data pointer
 | |
| 	MOVD  p_len+16(FP), R4 // len(p)
 | |
| 
 | |
| 	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
 | |
| 	MOVD $·crccleconskp(SB), R5 // Castagnoli constant pool
 | |
| 	BR   vectorizedBody<>(SB) // branch (not a call) to the shared body, which stores ret and returns
 | |
| 
 | |
| TEXT vectorizedBody<>(SB), NOSPLIT, $0 // in: R2 = crc, R3 = buffer pointer, R4 = length, R5 = constant pool base
 | |
| 	XOR $0xffffffff, R2                         // NOTW R2
 | |
| 	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load the 96 bytes of constants into V9..V14
 | |
| 
 | |
| 	// Load the initial CRC value into the rightmost word of V0
 | |
| 	VZERO V0
 | |
| 	VLVGF $3, R2, V0
 | |
| 
 | |
| 	// Crash if the input size is less than 64-bytes.
 | |
| 	CMP R4, $64
 | |
| 	BLT crash
 | |
| 
 | |
| 	// Load a 64-byte data chunk and XOR with CRC
 | |
| 	VLM 0(R3), V1, V4 // 64-bytes into V1..V4
 | |
| 
 | |
| 	// Reflect the data if the CRC operation is in the bit-reflected domain
 | |
| 	VPERM V1, V1, CONST_PERM_LE2BE, V1
 | |
| 	VPERM V2, V2, CONST_PERM_LE2BE, V2
 | |
| 	VPERM V3, V3, CONST_PERM_LE2BE, V3
 | |
| 	VPERM V4, V4, CONST_PERM_LE2BE, V4
 | |
| 
 | |
| 	VX  V0, V1, V1 // V1 ^= CRC
 | |
| 	ADD $64, R3    // BUF = BUF + 64
 | |
| 	ADD $(-64), R4 // LEN = LEN - 64
 | |
| 
 | |
| 	// Check remaining buffer size and jump to proper folding method
 | |
| 	CMP R4, $64
 | |
| 	BLT less_than_64bytes
 | |
| 
 | |
| fold_64bytes_loop:
 | |
| 	// Load the next 64-byte data chunk into V5 to V8
 | |
| 	VLM   0(R3), V5, V8
 | |
| 	VPERM V5, V5, CONST_PERM_LE2BE, V5
 | |
| 	VPERM V6, V6, CONST_PERM_LE2BE, V6
 | |
| 	VPERM V7, V7, CONST_PERM_LE2BE, V7
 | |
| 	VPERM V8, V8, CONST_PERM_LE2BE, V8
 | |
| 
 | |
| 	// Perform a GF(2) multiplication of the doublewords in V1 with
 | |
| 	// the reduction constants in CONST_R2R1.  The intermediate result is
 | |
| 	// then folded (accumulated) with the next data chunk in V5 and
 | |
| 	// stored in V1.  Repeat this step for the register contents
 | |
| 	// in V2, V3, and V4 respectively.
 | |
| 
 | |
| 	VGFMAG CONST_R2R1, V1, V5, V1
 | |
| 	VGFMAG CONST_R2R1, V2, V6, V2
 | |
| 	VGFMAG CONST_R2R1, V3, V7, V3
 | |
| 	VGFMAG CONST_R2R1, V4, V8, V4
 | |
| 
 | |
| 	// Adjust buffer pointer and length for next loop
 | |
| 	ADD $64, R3    // BUF = BUF + 64
 | |
| 	ADD $(-64), R4 // LEN = LEN - 64
 | |
| 
 | |
| 	CMP R4, $64
 | |
| 	BGE fold_64bytes_loop
 | |
| 
 | |
| less_than_64bytes:
 | |
| 	// Fold V1 to V4 into a single 128-bit value in V1
 | |
| 	VGFMAG CONST_R4R3, V1, V2, V1
 | |
| 	VGFMAG CONST_R4R3, V1, V3, V1
 | |
| 	VGFMAG CONST_R4R3, V1, V4, V1
 | |
| 
 | |
| 	// Check whether at least one more 16-byte chunk remains to be folded
 | |
| 	CMP R4, $16
 | |
| 	BLT final_fold
 | |
| 
 | |
| fold_16bytes_loop:
 | |
| 	VL    0(R3), V2                    // Load next data chunk
 | |
| 	VPERM V2, V2, CONST_PERM_LE2BE, V2
 | |
| 
 | |
| 	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
 | |
| 
 | |
| 	// Adjust buffer pointer and size for folding next data chunk
 | |
| 	ADD $16, R3
 | |
| 	ADD $-16, R4
 | |
| 
 | |
| 	// Process remaining data chunks
 | |
| 	CMP R4, $16
 | |
| 	BGE fold_16bytes_loop
 | |
| 
 | |
| final_fold:
 | |
| 	VLEIB $7, $0x40, V9 // V9 becomes the byte-shift count (8 bytes); the permute mask it held is not used past this point
 | |
| 	VSRLB V9, CONST_R4R3, V0 // V0 = CONST_R4R3 shifted right by a doubleword: R4 moves to the rightmost doubleword
 | |
| 	VLEIG $0, $1, V0 // leftmost doubleword of V0 = 1
 | |
| 
 | |
| 	VGFMG V0, V1, V1 // V1 = (leftmost doubleword of V1 * 1) XOR (rightmost doubleword of V1 * R4)
 | |
| 
 | |
| 	VLEIB  $7, $0x20, V9        // Shift by words
 | |
| 	VSRLB  V9, V1, V2           // Store remaining bits in V2
 | |
| 	VUPLLF V1, V1               // Split rightmost doubleword
 | |
| 	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
 | |
| 
 | |
| 	// The input values to the Barrett reduction are the degree-63 polynomial
 | |
| 	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
 | |
| 	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
 | |
| 	// P(x).
 | |
| 	//
 | |
| 	// The Barrett reduction algorithm is defined as:
 | |
| 	//
 | |
| 	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
 | |
| 	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
 | |
| 	//    3. C(x)  = R(x) XOR T2(x) mod x^32
 | |
| 	//
 | |
| 	// Note: To compensate the division by x^32, use the vector unpack
 | |
| 	// instruction to move the leftmost word into the leftmost doubleword
 | |
| 	// of the vector register.  The rightmost doubleword is multiplied
 | |
| 	// with zero to not contribute to the intermediate results.
 | |
| 
 | |
| 	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
 | |
| 	VUPLLF V1, V2
 | |
| 	VGFMG  CONST_RU_POLY, V2, V2
 | |
| 
 | |
| 	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with T1(x) in
 | |
| 	// V2 and XOR the intermediate result, T2(x),  with the value in V1.
 | |
| 	// The final result is in word element 2 of V2.
 | |
| 
 | |
| 	VUPLLF V2, V2
 | |
| 	VGFMAG CONST_CRC_POLY, V2, V1, V2
 | |
| 
 | |
| done:
 | |
| 	VLGVF $2, V2, R2 // extract word element 2 of V2 as the raw CRC
 | |
| 	XOR   $0xffffffff, R2  // NOTW R2
 | |
| 	MOVWZ R2, ret + 32(FP)
 | |
| 	RET
 | |
| 
 | |
| crash:
 | |
| 	MOVD $0, (R0) // input size is less than 64-bytes
 | |
| 
 |