|
|
|
@ -3,6 +3,7 @@ |
|
|
|
|
package roaring |
|
|
|
|
|
|
|
|
|
import ( |
|
|
|
|
"encoding/binary" |
|
|
|
|
"errors" |
|
|
|
|
"io" |
|
|
|
|
"reflect" |
|
|
|
@ -132,3 +133,284 @@ func byteSliceAsInterval16Slice(slice []byte) (result []interval16) { |
|
|
|
|
// return result
|
|
|
|
|
return |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// FromBuffer creates a bitmap from its serialized version stored in buffer.
|
|
|
|
|
// It uses CRoaring's frozen bitmap format.
|
|
|
|
|
//
|
|
|
|
|
// The format specification is available here:
|
|
|
|
|
// https://github.com/RoaringBitmap/CRoaring/blob/2c867e9f9c9e2a3a7032791f94c4c7ae3013f6e0/src/roaring.c#L2756-L2783
|
|
|
|
|
//
|
|
|
|
|
// The provided byte array (buf) is expected to be a constant.
|
|
|
|
|
// The function makes the best effort attempt not to copy data.
|
|
|
|
|
// Only little endian is supported. The function will err if it detects a big
|
|
|
|
|
// endian serialized file.
|
|
|
|
|
// You should take care not to modify buff as it will likely result in
|
|
|
|
|
// unexpected program behavior.
|
|
|
|
|
// If said buffer comes from a memory map, it's advisable to give it read
|
|
|
|
|
// only permissions, either at creation or by calling Mprotect from the
|
|
|
|
|
// golang.org/x/sys/unix package.
|
|
|
|
|
//
|
|
|
|
|
// Resulting bitmaps are effectively immutable in the following sense:
|
|
|
|
|
// a copy-on-write marker is used so that when you modify the resulting
|
|
|
|
|
// bitmap, copies of selected data (containers) are made.
|
|
|
|
|
// You should *not* change the copy-on-write status of the resulting
|
|
|
|
|
// bitmaps (SetCopyOnWrite).
|
|
|
|
|
//
|
|
|
|
|
// If buf becomes unavailable, then a bitmap created with
|
|
|
|
|
// FromBuffer would be effectively broken. Furthermore, any
|
|
|
|
|
// bitmap derived from this bitmap (e.g., via Or, And) might
|
|
|
|
|
// also be broken. Thus, before making buf unavailable, you should
|
|
|
|
|
// call CloneCopyOnWriteContainers on all such bitmaps.
|
|
|
|
|
//
|
|
|
|
|
func (rb *Bitmap) FrozenView(buf []byte) error { |
|
|
|
|
return rb.highlowcontainer.frozenView(buf) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/* Verbatim specification from CRoaring. |
|
|
|
|
* |
|
|
|
|
* FROZEN SERIALIZATION FORMAT DESCRIPTION |
|
|
|
|
* |
|
|
|
|
* -- (beginning must be aligned by 32 bytes) -- |
|
|
|
|
* <bitset_data> uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS * num_bitset_containers] |
|
|
|
|
* <run_data> rle16_t[total number of rle elements in all run containers] |
|
|
|
|
* <array_data> uint16_t[total number of array elements in all array containers] |
|
|
|
|
* <keys> uint16_t[num_containers] |
|
|
|
|
* <counts> uint16_t[num_containers] |
|
|
|
|
* <typecodes> uint8_t[num_containers] |
|
|
|
|
* <header> uint32_t |
|
|
|
|
* |
|
|
|
|
* <header> is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits) |
|
|
|
|
* and the number of containers (17 bits). |
|
|
|
|
* |
|
|
|
|
* <counts> stores number of elements for every container. |
|
|
|
|
* Its meaning depends on container type. |
|
|
|
|
* For array and bitset containers, this value is the container cardinality minus one. |
|
|
|
|
* For run container, it is the number of rle_t elements (n_runs). |
|
|
|
|
* |
|
|
|
|
* <bitset_data>,<array_data>,<run_data> are flat arrays of elements of |
|
|
|
|
* all containers of respective type. |
|
|
|
|
* |
|
|
|
|
* <*_data> and <keys> are kept close together because they are not accessed |
|
|
|
|
* during deserilization. This may reduce IO in case of large mmaped bitmaps. |
|
|
|
|
* All members have their native alignments during deserilization except <header>, |
|
|
|
|
* which is not guaranteed to be aligned by 4 bytes. |
|
|
|
|
*/ |
|
|
|
|
const FROZEN_COOKIE = 13766 |
|
|
|
|
|
|
|
|
|
var ( |
|
|
|
|
FrozenBitmapInvalidCookie = errors.New("header does not contain the FROZEN_COOKIE") |
|
|
|
|
FrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported") |
|
|
|
|
FrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap") |
|
|
|
|
FrozenBitmapOverpopulated = errors.New("too many containers") |
|
|
|
|
FrozenBitmapUnexpectedData = errors.New("spurious data in input") |
|
|
|
|
FrozenBitmapInvalidTypecode = errors.New("unrecognized typecode") |
|
|
|
|
FrozenBitmapBufferTooSmall = errors.New("buffer too small") |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
func (ra *roaringArray) frozenView(buf []byte) error { |
|
|
|
|
if len(buf) < 4 { |
|
|
|
|
return FrozenBitmapIncomplete |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
headerBE := binary.BigEndian.Uint32(buf[len(buf)-4:]) |
|
|
|
|
if headerBE & 0x7fff == FROZEN_COOKIE { |
|
|
|
|
return FrozenBitmapBigEndian |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
header := binary.LittleEndian.Uint32(buf[len(buf)-4:]) |
|
|
|
|
buf = buf[:len(buf)-4] |
|
|
|
|
|
|
|
|
|
if header & 0x7fff != FROZEN_COOKIE { |
|
|
|
|
return FrozenBitmapInvalidCookie |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
nCont := int(header >> 15) |
|
|
|
|
if nCont > (1 << 16) { |
|
|
|
|
return FrozenBitmapOverpopulated |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 1 byte per type, 2 bytes per key, 2 bytes per count.
|
|
|
|
|
if len(buf) < 5*nCont { |
|
|
|
|
return FrozenBitmapIncomplete |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
types := buf[len(buf)-nCont:] |
|
|
|
|
buf = buf[:len(buf)-nCont] |
|
|
|
|
|
|
|
|
|
counts := byteSliceAsUint16Slice(buf[len(buf)-2*nCont:]) |
|
|
|
|
buf = buf[:len(buf)-2*nCont] |
|
|
|
|
|
|
|
|
|
keys := byteSliceAsUint16Slice(buf[len(buf)-2*nCont:]) |
|
|
|
|
buf = buf[:len(buf)-2*nCont] |
|
|
|
|
|
|
|
|
|
nBitmap, nArray, nRun := uint64(0), uint64(0), uint64(0) |
|
|
|
|
nArrayEl, nRunEl := uint64(0), uint64(0) |
|
|
|
|
for i, t := range types { |
|
|
|
|
switch (t) { |
|
|
|
|
case 1: |
|
|
|
|
nBitmap++ |
|
|
|
|
case 2: |
|
|
|
|
nArray++ |
|
|
|
|
nArrayEl += uint64(counts[i])+1 |
|
|
|
|
case 3: |
|
|
|
|
nRun++ |
|
|
|
|
nRunEl += uint64(counts[i]) |
|
|
|
|
default: |
|
|
|
|
return FrozenBitmapInvalidTypecode |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if uint64(len(buf)) < (1 << 13)*nBitmap + 4*nRunEl + 2*nArrayEl { |
|
|
|
|
return FrozenBitmapIncomplete |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
bitsetsArena := byteSliceAsUint64Slice(buf[:(1 << 13)*nBitmap]) |
|
|
|
|
buf = buf[(1 << 13)*nBitmap:] |
|
|
|
|
|
|
|
|
|
runsArena := byteSliceAsInterval16Slice(buf[:4*nRunEl]) |
|
|
|
|
buf = buf[4*nRunEl:] |
|
|
|
|
|
|
|
|
|
arraysArena := byteSliceAsUint16Slice(buf[:2*nArrayEl]) |
|
|
|
|
buf = buf[2*nArrayEl:] |
|
|
|
|
|
|
|
|
|
if len(buf) != 0 { |
|
|
|
|
return FrozenBitmapUnexpectedData |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// TODO: maybe arena_alloc all this.
|
|
|
|
|
containers := make([]container, nCont) |
|
|
|
|
bitsets := make([]bitmapContainer, nBitmap) |
|
|
|
|
arrays := make([]arrayContainer, nArray) |
|
|
|
|
runs := make([]runContainer16, nRun) |
|
|
|
|
needCOW := make([]bool, nCont) |
|
|
|
|
|
|
|
|
|
iBitset, iArray, iRun := uint64(0), uint64(0), uint64(0) |
|
|
|
|
for i, t := range types { |
|
|
|
|
needCOW[i] = true |
|
|
|
|
|
|
|
|
|
switch (t) { |
|
|
|
|
case 1: |
|
|
|
|
containers[i] = &bitsets[iBitset] |
|
|
|
|
bitsets[iBitset].cardinality = int(counts[i])+1 |
|
|
|
|
bitsets[iBitset].bitmap = bitsetsArena[:1024] |
|
|
|
|
bitsetsArena = bitsetsArena[1024:] |
|
|
|
|
iBitset++ |
|
|
|
|
case 2: |
|
|
|
|
containers[i] = &arrays[iArray] |
|
|
|
|
arrays[iArray].content = arraysArena[:counts[i]+1] |
|
|
|
|
arraysArena = arraysArena[counts[i]+1:] |
|
|
|
|
iArray++ |
|
|
|
|
case 3: |
|
|
|
|
containers[i] = &runs[iRun] |
|
|
|
|
runs[iRun].iv = runsArena[:counts[i]] |
|
|
|
|
runsArena = runsArena[counts[i]:] |
|
|
|
|
iRun++ |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Not consuming the full input is a bug.
|
|
|
|
|
if iBitset != nBitmap || len(bitsetsArena) != 0 || |
|
|
|
|
iArray != nArray || len(arraysArena) != 0 || |
|
|
|
|
iRun != nRun || len(runsArena) != 0 { |
|
|
|
|
panic("we missed something") |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ra.keys = keys |
|
|
|
|
ra.containers = containers |
|
|
|
|
ra.needCopyOnWrite = needCOW |
|
|
|
|
ra.copyOnWrite = true |
|
|
|
|
|
|
|
|
|
return nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (bm *Bitmap) GetFrozenSizeInBytes() uint64 { |
|
|
|
|
nBits, nArrayEl, nRunEl := uint64(0), uint64(0), uint64(0) |
|
|
|
|
for _, c := range bm.highlowcontainer.containers { |
|
|
|
|
switch v := c.(type) { |
|
|
|
|
case *bitmapContainer: |
|
|
|
|
nBits++ |
|
|
|
|
case *arrayContainer: |
|
|
|
|
nArrayEl += uint64(len(v.content)) |
|
|
|
|
case *runContainer16: |
|
|
|
|
nRunEl += uint64(len(v.iv)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
return 4 + 5*uint64(len(bm.highlowcontainer.containers)) + |
|
|
|
|
(nBits << 13) + 2*nArrayEl + 4*nRunEl |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (bm *Bitmap) Freeze() ([]byte, error) { |
|
|
|
|
sz := bm.GetFrozenSizeInBytes() |
|
|
|
|
buf := make([]byte, sz) |
|
|
|
|
_, err := bm.FreezeTo(buf) |
|
|
|
|
return buf, err |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (bm *Bitmap) FreezeTo(buf []byte) (int, error) { |
|
|
|
|
containers := bm.highlowcontainer.containers |
|
|
|
|
nCont := len(containers) |
|
|
|
|
|
|
|
|
|
nBits, nArrayEl, nRunEl := 0, 0, 0 |
|
|
|
|
for _, c := range containers { |
|
|
|
|
switch v := c.(type) { |
|
|
|
|
case *bitmapContainer: |
|
|
|
|
nBits++ |
|
|
|
|
case *arrayContainer: |
|
|
|
|
nArrayEl += len(v.content) |
|
|
|
|
case *runContainer16: |
|
|
|
|
nRunEl += len(v.iv) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
serialSize := 4 + 5*nCont + (1 << 13)*nBits + 4*nRunEl + 2*nArrayEl |
|
|
|
|
if len(buf) < serialSize { |
|
|
|
|
return 0, FrozenBitmapBufferTooSmall |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
bitsArena := byteSliceAsUint64Slice(buf[:(1 << 13)*nBits]) |
|
|
|
|
buf = buf[(1 << 13)*nBits:] |
|
|
|
|
|
|
|
|
|
runsArena := byteSliceAsInterval16Slice(buf[:4*nRunEl]) |
|
|
|
|
buf = buf[4*nRunEl:] |
|
|
|
|
|
|
|
|
|
arraysArena := byteSliceAsUint16Slice(buf[:2*nArrayEl]) |
|
|
|
|
buf = buf[2*nArrayEl:] |
|
|
|
|
|
|
|
|
|
keys := byteSliceAsUint16Slice(buf[:2*nCont]) |
|
|
|
|
buf = buf[2*nCont:] |
|
|
|
|
|
|
|
|
|
counts := byteSliceAsUint16Slice(buf[:2*nCont]) |
|
|
|
|
buf = buf[2*nCont:] |
|
|
|
|
|
|
|
|
|
types := buf[:nCont] |
|
|
|
|
buf = buf[nCont:] |
|
|
|
|
|
|
|
|
|
header := uint32(FROZEN_COOKIE|(nCont << 15)) |
|
|
|
|
binary.LittleEndian.PutUint32(buf[:4], header) |
|
|
|
|
|
|
|
|
|
copy(keys, bm.highlowcontainer.keys[:]) |
|
|
|
|
|
|
|
|
|
for i, c := range containers { |
|
|
|
|
switch v := c.(type) { |
|
|
|
|
case *bitmapContainer: |
|
|
|
|
copy(bitsArena, v.bitmap) |
|
|
|
|
bitsArena = bitsArena[1024:] |
|
|
|
|
counts[i] = uint16(v.cardinality-1) |
|
|
|
|
types[i] = 1 |
|
|
|
|
case *arrayContainer: |
|
|
|
|
copy(arraysArena, v.content) |
|
|
|
|
arraysArena = arraysArena[len(v.content):] |
|
|
|
|
elems := len(v.content) |
|
|
|
|
counts[i] = uint16(elems)-1 |
|
|
|
|
types[i] = 2 |
|
|
|
|
case *runContainer16: |
|
|
|
|
copy(runsArena, v.iv) |
|
|
|
|
runs := len(v.iv) |
|
|
|
|
runsArena = runsArena[runs:] |
|
|
|
|
counts[i] = uint16(runs) |
|
|
|
|
types[i] = 3 |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return serialSize, nil |
|
|
|
|
} |
|
|
|
|