You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
274 lines
6.9 KiB
274 lines
6.9 KiB
// Copyright 2015, Joe Tsai. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE.md file.
|
|
|
|
package bzip2
|
|
|
|
import (
|
|
"io"
|
|
|
|
"github.com/dsnet/compress/internal"
|
|
"github.com/dsnet/compress/internal/errors"
|
|
"github.com/dsnet/compress/internal/prefix"
|
|
)
|
|
|
|
type Reader struct {
|
|
InputOffset int64 // Total number of bytes read from underlying io.Reader
|
|
OutputOffset int64 // Total number of bytes emitted from Read
|
|
|
|
rd prefixReader
|
|
err error
|
|
level int // The current compression level
|
|
rdHdrFtr int // Number of times we read the stream header and footer
|
|
blkCRC uint32 // CRC-32 IEEE of each block (as stored)
|
|
endCRC uint32 // Checksum of all blocks using bzip2's custom method
|
|
|
|
crc crc
|
|
mtf moveToFront
|
|
bwt burrowsWheelerTransform
|
|
rle runLengthEncoding
|
|
|
|
// These fields are allocated with Reader and re-used later.
|
|
treeSels []uint8
|
|
codes2D [maxNumTrees][maxNumSyms]prefix.PrefixCode
|
|
codes1D [maxNumTrees]prefix.PrefixCodes
|
|
trees1D [maxNumTrees]prefix.Decoder
|
|
syms []uint16
|
|
|
|
fuzzReader // Exported functionality when fuzz testing
|
|
}
|
|
|
|
// ReaderConfig is the options type accepted by NewReader. It currently
// declares no settable fields.
type ReaderConfig struct {
	// The blank field forces callers to use keyed struct literals.
	_ struct{}
}
|
|
|
|
func NewReader(r io.Reader, conf *ReaderConfig) (*Reader, error) {
|
|
zr := new(Reader)
|
|
zr.Reset(r)
|
|
return zr, nil
|
|
}
|
|
|
|
func (zr *Reader) Reset(r io.Reader) error {
|
|
*zr = Reader{
|
|
rd: zr.rd,
|
|
|
|
mtf: zr.mtf,
|
|
bwt: zr.bwt,
|
|
rle: zr.rle,
|
|
|
|
treeSels: zr.treeSels,
|
|
trees1D: zr.trees1D,
|
|
syms: zr.syms,
|
|
}
|
|
zr.rd.Init(r)
|
|
return nil
|
|
}
|
|
|
|
func (zr *Reader) Read(buf []byte) (int, error) {
|
|
for {
|
|
cnt, err := zr.rle.Read(buf)
|
|
if err != rleDone && zr.err == nil {
|
|
zr.err = err
|
|
}
|
|
if cnt > 0 {
|
|
zr.crc.update(buf[:cnt])
|
|
zr.OutputOffset += int64(cnt)
|
|
return cnt, nil
|
|
}
|
|
if zr.err != nil || len(buf) == 0 {
|
|
return 0, zr.err
|
|
}
|
|
|
|
// Read the next chunk.
|
|
zr.rd.Offset = zr.InputOffset
|
|
func() {
|
|
defer errors.Recover(&zr.err)
|
|
if zr.rdHdrFtr%2 == 0 {
|
|
// Check if we are already at EOF.
|
|
if err := zr.rd.PullBits(1); err != nil {
|
|
if err == io.ErrUnexpectedEOF && zr.rdHdrFtr > 0 {
|
|
err = io.EOF // EOF is okay if we read at least one stream
|
|
}
|
|
errors.Panic(err)
|
|
}
|
|
|
|
// Read stream header.
|
|
if zr.rd.ReadBitsBE64(16) != hdrMagic {
|
|
panicf(errors.Corrupted, "invalid stream magic")
|
|
}
|
|
if ver := zr.rd.ReadBitsBE64(8); ver != 'h' {
|
|
if ver == '0' {
|
|
panicf(errors.Deprecated, "bzip1 format is not supported")
|
|
}
|
|
panicf(errors.Corrupted, "invalid version: %q", ver)
|
|
}
|
|
lvl := int(zr.rd.ReadBitsBE64(8)) - '0'
|
|
if lvl < BestSpeed || lvl > BestCompression {
|
|
panicf(errors.Corrupted, "invalid block size: %d", lvl*blockSize)
|
|
}
|
|
zr.level = lvl
|
|
zr.rdHdrFtr++
|
|
} else {
|
|
// Check and update the CRC.
|
|
if internal.GoFuzz {
|
|
zr.updateChecksum(-1, zr.crc.val) // Update with value
|
|
zr.blkCRC = zr.crc.val // Suppress CRC failures
|
|
}
|
|
if zr.blkCRC != zr.crc.val {
|
|
panicf(errors.Corrupted, "mismatching block checksum")
|
|
}
|
|
zr.endCRC = (zr.endCRC<<1 | zr.endCRC>>31) ^ zr.blkCRC
|
|
}
|
|
buf := zr.decodeBlock()
|
|
zr.rle.Init(buf)
|
|
}()
|
|
if zr.InputOffset, err = zr.rd.Flush(); zr.err == nil {
|
|
zr.err = err
|
|
}
|
|
if zr.err != nil {
|
|
zr.err = errWrap(zr.err, errors.Corrupted)
|
|
return 0, zr.err
|
|
}
|
|
}
|
|
}
|
|
|
|
func (zr *Reader) Close() error {
|
|
if zr.err == io.EOF || zr.err == errClosed {
|
|
zr.rle.Init(nil) // Make sure future reads fail
|
|
zr.err = errClosed
|
|
return nil
|
|
}
|
|
return zr.err // Return the persistent error
|
|
}
|
|
|
|
func (zr *Reader) decodeBlock() []byte {
|
|
if magic := zr.rd.ReadBitsBE64(48); magic != blkMagic {
|
|
if magic == endMagic {
|
|
endCRC := uint32(zr.rd.ReadBitsBE64(32))
|
|
if internal.GoFuzz {
|
|
zr.updateChecksum(zr.rd.BitsRead()-32, zr.endCRC)
|
|
endCRC = zr.endCRC // Suppress CRC failures
|
|
}
|
|
if zr.endCRC != endCRC {
|
|
panicf(errors.Corrupted, "mismatching stream checksum")
|
|
}
|
|
zr.endCRC = 0
|
|
zr.rd.ReadPads()
|
|
zr.rdHdrFtr++
|
|
return nil
|
|
}
|
|
panicf(errors.Corrupted, "invalid block or footer magic")
|
|
}
|
|
|
|
zr.crc.val = 0
|
|
zr.blkCRC = uint32(zr.rd.ReadBitsBE64(32))
|
|
if internal.GoFuzz {
|
|
zr.updateChecksum(zr.rd.BitsRead()-32, 0) // Record offset only
|
|
}
|
|
if zr.rd.ReadBitsBE64(1) != 0 {
|
|
panicf(errors.Deprecated, "block randomization is not supported")
|
|
}
|
|
|
|
// Read BWT related fields.
|
|
ptr := int(zr.rd.ReadBitsBE64(24)) // BWT origin pointer
|
|
|
|
// Read MTF related fields.
|
|
var dictArr [256]uint8
|
|
dict := dictArr[:0]
|
|
bmapHi := uint16(zr.rd.ReadBits(16))
|
|
for i := 0; i < 256; i, bmapHi = i+16, bmapHi>>1 {
|
|
if bmapHi&1 > 0 {
|
|
bmapLo := uint16(zr.rd.ReadBits(16))
|
|
for j := 0; j < 16; j, bmapLo = j+1, bmapLo>>1 {
|
|
if bmapLo&1 > 0 {
|
|
dict = append(dict, uint8(i+j))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Step 1: Prefix encoding.
|
|
syms := zr.decodePrefix(len(dict))
|
|
|
|
// Step 2: Move-to-front transform and run-length encoding.
|
|
zr.mtf.Init(dict, zr.level*blockSize)
|
|
buf := zr.mtf.Decode(syms)
|
|
|
|
// Step 3: Burrows-Wheeler transformation.
|
|
if ptr >= len(buf) {
|
|
panicf(errors.Corrupted, "origin pointer (0x%06x) exceeds block size: %d", ptr, len(buf))
|
|
}
|
|
zr.bwt.Decode(buf, ptr)
|
|
|
|
return buf
|
|
}
|
|
|
|
func (zr *Reader) decodePrefix(numSyms int) (syms []uint16) {
|
|
numSyms += 2 // Remove 0 symbol, add RUNA, RUNB, and EOF symbols
|
|
if numSyms < 3 {
|
|
panicf(errors.Corrupted, "not enough prefix symbols: %d", numSyms)
|
|
}
|
|
|
|
// Read information about the trees and tree selectors.
|
|
var mtf internal.MoveToFront
|
|
numTrees := int(zr.rd.ReadBitsBE64(3))
|
|
if numTrees < minNumTrees || numTrees > maxNumTrees {
|
|
panicf(errors.Corrupted, "invalid number of prefix trees: %d", numTrees)
|
|
}
|
|
numSels := int(zr.rd.ReadBitsBE64(15))
|
|
if cap(zr.treeSels) < numSels {
|
|
zr.treeSels = make([]uint8, numSels)
|
|
}
|
|
treeSels := zr.treeSels[:numSels]
|
|
for i := range treeSels {
|
|
sym, ok := zr.rd.TryReadSymbol(&decSel)
|
|
if !ok {
|
|
sym = zr.rd.ReadSymbol(&decSel)
|
|
}
|
|
if int(sym) >= numTrees {
|
|
panicf(errors.Corrupted, "invalid prefix tree selector: %d", sym)
|
|
}
|
|
treeSels[i] = uint8(sym)
|
|
}
|
|
mtf.Decode(treeSels)
|
|
zr.treeSels = treeSels
|
|
|
|
// Initialize prefix codes.
|
|
for i := range zr.codes2D[:numTrees] {
|
|
zr.codes1D[i] = zr.codes2D[i][:numSyms]
|
|
}
|
|
zr.rd.ReadPrefixCodes(zr.codes1D[:numTrees], zr.trees1D[:numTrees])
|
|
|
|
// Read prefix encoded symbols of compressed data.
|
|
var tree *prefix.Decoder
|
|
var blkLen, selIdx int
|
|
syms = zr.syms[:0]
|
|
for {
|
|
if blkLen == 0 {
|
|
blkLen = numBlockSyms
|
|
if selIdx >= len(treeSels) {
|
|
panicf(errors.Corrupted, "not enough prefix tree selectors")
|
|
}
|
|
tree = &zr.trees1D[treeSels[selIdx]]
|
|
selIdx++
|
|
}
|
|
blkLen--
|
|
sym, ok := zr.rd.TryReadSymbol(tree)
|
|
if !ok {
|
|
sym = zr.rd.ReadSymbol(tree)
|
|
}
|
|
|
|
if int(sym) == numSyms-1 {
|
|
break // EOF marker
|
|
}
|
|
if int(sym) >= numSyms {
|
|
panicf(errors.Corrupted, "invalid prefix symbol: %d", sym)
|
|
}
|
|
if len(syms) >= zr.level*blockSize {
|
|
panicf(errors.Corrupted, "number of prefix symbols exceeds block size")
|
|
}
|
|
syms = append(syms, uint16(sym))
|
|
}
|
|
zr.syms = syms
|
|
return syms
|
|
}
|
|
|