+ * This is useful to pre-allocate the output buffer before calling + * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper)}. + *
+ * + * @param compressedPositions + * since not all schemes compress every input integer, this parameter + * returns how many input integers will actually be compressed. + * This is useful when composing multiple schemes. + * @param inlength + * number of integers to be compressed + * @return the maximum number of integers needed in the output array + */ + int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength); } diff --git a/src/main/java/me/lemire/integercompression/UncompressibleInputException.java b/src/main/java/me/lemire/integercompression/UncompressibleInputException.java deleted file mode 100644 index c490946..0000000 --- a/src/main/java/me/lemire/integercompression/UncompressibleInputException.java +++ /dev/null @@ -1,19 +0,0 @@ -package me.lemire.integercompression; - -/** - * This exception might be thrown if the input is poorly compressible. - * - */ -public class UncompressibleInputException extends RuntimeException { - - /** - * Create new exception - * @param string explanation for the exception - */ - public UncompressibleInputException(String string) { - super(string); - } - - private static final long serialVersionUID = -798583799846489873L; - -} diff --git a/src/main/java/me/lemire/integercompression/Util.java b/src/main/java/me/lemire/integercompression/Util.java index 346e3b2..63fc918 100644 --- a/src/main/java/me/lemire/integercompression/Util.java +++ b/src/main/java/me/lemire/integercompression/Util.java @@ -15,13 +15,13 @@ public final class Util { - - // check whether x is small than y as unsigned ints (supported by Java 8 natively); - protected static final boolean smallerorequalthan(int x, int y) { - return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE); - } - - /** + + // check whether x is small than y as unsigned ints (supported by Java 8 natively); + protected static final boolean smallerorequalthan(int x, int y) { + return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE); + } + + /** * Compute the 
maximum of the integer logarithms (ceil(log(x+1)) of a range * of value * diff --git a/src/main/java/me/lemire/integercompression/VariableByte.java b/src/main/java/me/lemire/integercompression/VariableByte.java index 5b25c43..c9b04d0 100644 --- a/src/main/java/me/lemire/integercompression/VariableByte.java +++ b/src/main/java/me/lemire/integercompression/VariableByte.java @@ -21,6 +21,8 @@ */ public class VariableByte implements IntegerCODEC, ByteIntegerCODEC, SkippableIntegerCODEC { + private static final int MAX_BYTES_PER_INT = 5; + private static byte extract7bits(int i, long val) { return (byte) ((val >> (7 * i)) & ((1 << 7) - 1)); } @@ -122,8 +124,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, for (int v = 0, shift = 0; p < finalp;) { val = in[p]; int c = (byte) (val >>> s); + // Shift to next byte s += 8; + // Shift to next integer if s==32 p += s>>5; + // cycle from 31 to 0 s = s & 31; v += ((c & 127) << shift); if ((c & 128) == 128) { @@ -187,8 +192,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) { val = in[p]; int c = val >>> s; + // Shift to next byte s += 8; + // Shift to next integer if s==32 p += s>>5; + // cycle from 31 to 0 s = s & 31; v += ((c & 127) << shift); if ((c & 128) == 128) { @@ -202,12 +210,23 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o inpos.set(p + (s!=0 ? 1 : 0)); } + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int maxLengthInBytes = inlength * MAX_BYTES_PER_INT; + int maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES; + compressedPositions.add(inlength); + return maxLengthInInts; + } + /** * Creates a new buffer of the requested size. * * In case you need a different way to allocate buffers, you can override this method * with a custom behavior. 
The default implementation allocates a new Java direct * {@link ByteBuffer} on each invocation. + * + * @param sizeInBytes + * @return */ protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java index c5fee69..ef4a386 100644 --- a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java +++ b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java @@ -308,10 +308,10 @@ private static void testByteCodec(PrintWriter csvLog, int sparsity, public static void main(String args[]) throws FileNotFoundException { System.out .println("# benchmark based on the ClusterData model from:"); - System.out.println("# Vo Ngoc Anh and Alistair Moffat. "); - System.out.println("# Index compression using 64-bit words."); + System.out.println("# Vo Ngoc Anh and Alistair Moffat. "); + System.out.println("# Index compression using 64-bit words."); System.out - .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. "); + .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
"); System.out.println(); PrintWriter writer = null; diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java index 58bbc4a..b930568 100644 --- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java +++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java @@ -83,7 +83,6 @@ private static int decompressFromSkipTable(Object c, int[] compressed, if (num > length - uncomppos.get()) num = length - uncomppos.get(); int location = metadata[metapos++]; - // System.out.println("location = "+location); int initvalue = metadata[metapos++]; int outputlocation = uncomppos.get(); if (location != compressedpos.get()) @@ -242,10 +241,10 @@ private static void testCodec(PrintWriter csvLog, int sparsity, Object c, */ public static void main(String args[]) throws FileNotFoundException { System.out.println("# benchmark based on the ClusterData model from:"); - System.out.println("# Vo Ngoc Anh and Alistair Moffat. "); - System.out.println("# Index compression using 64-bit words."); + System.out.println("# Vo Ngoc Anh and Alistair Moffat. "); + System.out.println("# Index compression using 64-bit words."); System.out - .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. "); + .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
"); System.out.println(); PrintWriter writer = null; diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java index 7e1c161..f50a367 100644 --- a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java +++ b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java @@ -49,7 +49,8 @@ public class IntegratedBinaryPacking implements IntegratedIntegerCODEC, SkippableIntegratedIntegerCODEC { - static final int BLOCK_SIZE = 32; + public static final int BLOCK_SIZE = 32; + private static final int MAX_BIT_WIDTH = Integer.SIZE; @Override public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, @@ -170,4 +171,13 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, initvalue.set(initoffset); inpos.set(tmpinpos); } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int blockCount = inlength / BLOCK_SIZE; + int headersSizeInInts = blockCount / Integer.BYTES + (blockCount % Integer.BYTES); + int blocksSizeInInts = blockCount * MAX_BIT_WIDTH; + compressedPositions.add(blockCount * BLOCK_SIZE); + return headersSizeInInts + blocksSizeInInts; + } } diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java index 5808bdd..1d935c4 100644 --- a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java +++ b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java @@ -3,7 +3,6 @@ import java.util.Arrays; import me.lemire.integercompression.IntWrapper; -import me.lemire.integercompression.UncompressibleInputException; /** * This is a convenience class that wraps a codec to provide @@ -36,19 +35,14 @@ public IntegratedIntCompressor() { * 
* @param input array to be compressed * @return compressed array - * @throws UncompressibleInputException if the data is too poorly compressible */ public int[] compress(int[] input) { - int [] compressed = new int[input.length + input.length / 100 + 1024]; + int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length); + int [] compressed = new int[maxCompressedLength + 1]; // +1 to store the length of the input compressed[0] = input.length; IntWrapper outpos = new IntWrapper(1); IntWrapper initvalue = new IntWrapper(0); - try { - codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos, initvalue); - } catch (IndexOutOfBoundsException ioebe) { - throw new UncompressibleInputException( - "Your input is too poorly compressible with the current codec : " + codec); - } + codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos, initvalue); compressed = Arrays.copyOf(compressed,outpos.intValue()); return compressed; } diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java index 918a900..a577031 100644 --- a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java +++ b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java @@ -24,6 +24,8 @@ public class IntegratedVariableByte implements IntegratedIntegerCODEC, IntegratedByteIntegerCODEC, SkippableIntegratedIntegerCODEC { + private static final int MAX_BYTES_PER_INT = 5; + private static byte extract7bits(int i, long val) { return (byte)((val >> (7 * i)) & ((1 << 7) - 1)); } @@ -257,6 +259,14 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, inpos.set(p + (s!=0 ? 
1 : 0)); } + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int maxLengthInBytes = inlength * MAX_BYTES_PER_INT; + int maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES; + compressedPositions.add(inlength); + return maxLengthInInts; + } + /** * Creates a new buffer of the requested size. * diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java index 09c4dd8..4786ec5 100644 --- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java +++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java @@ -66,14 +66,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, if (inlength == 0) return; int init = inpos.get(); + int outposInit = outpos.get(); + F1.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue); if (inpos.get() == init) { - inpos.increment(); + inpos.increment(); } inlength -= inpos.get() - init; - num -= outpos.get(); + num -= outpos.get() - outposInit; F2.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue); } + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int init = compressedPositions.get(); + int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength); + maxLength += 1; // Add +1 for the potential F2 header. Question: is this header actually needed in the headless version? 
+ inlength -= compressedPositions.get() - init; + maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength); + return maxLength; + } } diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java index 8b7fd4b..e2df754 100644 --- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java @@ -71,4 +71,21 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num, IntWrapper initvalue); + /** + * Compute the maximum number of integers that might be required to store + * the compressed form of a given input array segment, without headers. + *+ * This is useful to pre-allocate the output buffer before calling + * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper, IntWrapper)}. + *
+ * + * @param compressedPositions + * since not all schemes compress every input integer, this parameter + * returns how many input integers will actually be compressed. + * This is useful when composing multiple schemes. + * @param inlength + * number of integers to be compressed + * @return the maximum number of integers needed in the output array + */ + int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength); } diff --git a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java index bbd386a..a50497c 100644 --- a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java +++ b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java @@ -42,7 +42,7 @@ int[] generateUniformHash(int N, int Max) { int[] ans = new int[N]; HashSet+ * For details, please see: + *
+ * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second + * through vectorization, Software: Practice &amp; Experience + * http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract + * http://arxiv.org/abs/1209.2137 + *
+ * <p>For sufficiently compressible and long arrays, it is faster and better + * than other PFOR schemes.</p>
+ * + * Note that this does not use differential coding: if you are working on sorted + * lists, you should first compute deltas, @see + * me.lemire.integercompression.differential.Delta#delta. + * + * For multi-threaded applications, each thread should use its own FastPFOR + * object. + * + * @author Daniel Lemire + */ +public class VectorFastPFOR implements IntegerCODEC, SkippableIntegerCODEC { + private final static int OVERHEAD_OF_EACH_EXCEPT = 8; + public final static int DEFAULT_PAGE_SIZE = 64 << 10; + + public final static int BLOCK_SIZE = 256; + private final static int INTS_PER_BLOCK = BLOCK_SIZE >>> 5; + + private final int pageSize; + private final int[][] dataTobePacked = new int[33][]; + private int[] exceptData = null; + + // Working area for compress and uncompress. + private final int[] dataPointers = new int[33]; + private final int[] freqs = new int[33]; + private final byte[] bem; + /** + * Construct the FastPFOR CODEC. + * + * @param pagesize + * the desired page size (recommended value is + * FastPFOR.DEFAULT_PAGE_SIZE) + */ + private VectorFastPFOR(int pagesize) { + pageSize = pagesize; + // Initiate arrrays. + bem = new byte[3 * pageSize / BLOCK_SIZE + pagesize]; + for (int k = 1; k < dataTobePacked.length; ++k) + dataTobePacked[k] = new int[pageSize / 32 * 4]; // heuristic + exceptData = new int[pageSize * 4]; + } + + /** + * Construct the fastPFOR CODEC with default parameters. + */ + public VectorFastPFOR() { this(DEFAULT_PAGE_SIZE); } + + /** + * Compress data in blocks of BLOCK_SIZE integers (if fewer than BLOCK_SIZE + * integers are provided, nothing is done). + * + * @see IntegerCODEC#compress(int[], IntWrapper, int, int[], IntWrapper) + */ + @Override + public void headlessCompress(int[] in, IntWrapper inpos, int inlength, + int[] out, IntWrapper outpos) { + inlength = inlength - inlength % BLOCK_SIZE; + // Allocate memory for working area. 
+ + final int finalinpos = inpos.get() + inlength; + while (inpos.get() != finalinpos) { + int thissize = Math.min(pageSize, finalinpos - inpos.get()); + encodePage(in, inpos, thissize, out, outpos); + } + } + + private void getBestBitSize(int[] in, int pos, int index) { + Arrays.fill(freqs, 0); + for (int i = pos, limit = pos + BLOCK_SIZE; i < limit; i++) { + freqs[32 - Integer.numberOfLeadingZeros(in[i])]++; + } + bem[index] = 32; + while (freqs[bem[index]] == 0) + bem[index]--; + bem[index + 2] = bem[index]; + int maxb = bem[index + 2]; + int bestcost = bem[index] * BLOCK_SIZE; + int cexcept = 0; + bem[index + 1] = 0; + for (int b = bem[index] - 1; b >= 0; --b) { + cexcept += freqs[b + 1]; + if (cexcept == BLOCK_SIZE) + break; + // the extra 8 is the cost of storing maxbits + int thiscost = cexcept * OVERHEAD_OF_EACH_EXCEPT + cexcept * (maxb - b) + + b * BLOCK_SIZE + 8; + if (maxb - b == 1) + thiscost -= cexcept; + if (thiscost < bestcost) { + bestcost = thiscost; + bem[index] = (byte)b; + bem[index + 1] = (byte)cexcept; + } + } + } + + private void encodePage(int[] in, IntWrapper inpos, int thissize, int[] out, + IntWrapper outpos) { + final int headerpos = outpos.get(); + outpos.increment(); + int tmpoutpos = outpos.get(); + + // Clear working area. 
+ Arrays.fill(dataPointers, 0); + Arrays.fill(bem, (byte)0); + + int tmpinpos = inpos.get(); + final int finalinpos = tmpinpos + thissize - BLOCK_SIZE; + int bindex = 0; + for (; tmpinpos <= finalinpos; tmpinpos += BLOCK_SIZE) { + getBestBitSize(in, tmpinpos, bindex); + final int tmpexcept = bem[bindex + 1] & 0xFF; + final int tmpbestb = bem[bindex]; + if (tmpexcept > 0) { + final int index = bem[bindex + 2] - tmpbestb; + if (dataPointers[index] + tmpexcept >= dataTobePacked[index].length) { + int newsize = 2 * (dataPointers[index] + tmpexcept); + int val = newsize + BLOCK_SIZE - 1; + newsize = val - val % BLOCK_SIZE; + dataTobePacked[index] = Arrays.copyOf(dataTobePacked[index], newsize); + } + bindex += 3; + for (int k = 0; k < BLOCK_SIZE; ++k) { + if ((in[k + tmpinpos] >>> tmpbestb) != 0) { + // we have an exception + bem[bindex++] = (byte)k; + dataTobePacked[index][dataPointers[index]++] = + in[k + tmpinpos] >>> tmpbestb; + } + } + } else { + bindex += 2; + } + VectorBitPacker.fastpack(in, tmpinpos, out, tmpoutpos, tmpbestb); + tmpoutpos += INTS_PER_BLOCK * tmpbestb; + } + inpos.set(tmpinpos); + out[headerpos] = tmpoutpos - headerpos; + + int bytesize = bindex; + out[tmpoutpos++] = bytesize; + + bytesize = bytesize % 4 == 0 ? 
bytesize : (bytesize / 4) * 4 + 4; + for (int i = 0; i <= bytesize - 4; i += 4) { + out[tmpoutpos] = bem[i] & 0xFF; + out[tmpoutpos] |= (bem[i + 1] & 0xFF) << 8; + out[tmpoutpos] |= (bem[i + 2] & 0xFF) << 16; + out[tmpoutpos] |= (bem[i + 3] & 0xFF) << 24; + tmpoutpos++; + } + + int bitmap = 0; + for (int k = 2; k <= 32; ++k) { + if (dataPointers[k] != 0) + bitmap |= (1 << (k - 1)); + } + out[tmpoutpos++] = bitmap; + + for (int k = 2; k <= 32; ++k) { + if (dataPointers[k] != 0) { + out[tmpoutpos++] = dataPointers[k]; // size + int j = 0; + int n = (dataPointers[k] / BLOCK_SIZE) * BLOCK_SIZE; + for (; j < n; j += BLOCK_SIZE) { + VectorBitPacker.fastpackNoMask(dataTobePacked[k], j, out, tmpoutpos, + k); + tmpoutpos += INTS_PER_BLOCK * k; + } + int r = dataPointers[k] % BLOCK_SIZE; + if (r != 0) { + tmpoutpos = VectorBitPacker.slowpack(dataTobePacked[k], j, r, out, + tmpoutpos, k); + tmpoutpos++; + } + } + } + outpos.set(tmpoutpos); + } + + /** + * Uncompress data in blocks of integers. In this particular case, + * the inlength parameter is ignored: it is deduced from the compressed + * data. 
+ * + * @see IntegerCODEC#compress(int[], IntWrapper, int, int[], IntWrapper) + */ + @Override + public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, + int[] out, IntWrapper outpos, int mynvalue) { + mynvalue = mynvalue - mynvalue % BLOCK_SIZE; + int finalout = outpos.get() + mynvalue; + while (outpos.get() != finalout) { + int thissize = Math.min(pageSize, finalout - outpos.get()); + decodePage(in, inpos, out, outpos, thissize); + } + } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + throw new UnsupportedOperationException("Calculating the max compressed length is not supported yet."); + } + + private void loadMetaData(int[] in, int inexcept, int bytesize) { + // Arrays.fill(bem, (byte)0); + int len = (bytesize + 3) / 4; + int lc = 0; + for (int i = 0; i < len; i++) { + bem[lc++] = (byte)(in[inexcept + i]); + bem[lc++] = (byte)(in[inexcept + i] >>> 8); + bem[lc++] = (byte)(in[inexcept + i] >>> 16); + bem[lc++] = (byte)(in[inexcept + i] >>> 24); + } + } + + private void decodePage(int[] in, IntWrapper inpos, int[] out, + IntWrapper outpos, int thissize) { + final int initpos = inpos.get(); + final int wheremeta = in[inpos.get()]; + inpos.increment(); + int inexcept = initpos + wheremeta; + + final int bytesize = in[inexcept++]; + loadMetaData(in, inexcept, bytesize); + inexcept += (bytesize + 3) / 4; + final int bitmap = in[inexcept++]; + for (int k = 2; k <= 32; ++k) { + if ((bitmap & (1 << (k - 1))) != 0) { + int size = in[inexcept++]; + int val = size + BLOCK_SIZE - 1; + int roundedup = val - val % BLOCK_SIZE; + if (dataTobePacked[k].length < roundedup) + dataTobePacked[k] = new int[roundedup]; + if (inexcept + roundedup / 32 * k <= in.length) { + int j = 0; + int len = (size / BLOCK_SIZE) * BLOCK_SIZE; + for (; j < len; j += BLOCK_SIZE) { + VectorBitPacker.fastunpack(in, inexcept, dataTobePacked[k], j, k); + inexcept += INTS_PER_BLOCK * k; + } + int r = size % BLOCK_SIZE; + inexcept = 
VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k], + j, r, k); + } else { + int j = 0; + val = roundedup / 32 * k + BLOCK_SIZE - 1; + int[] buf = new int[val - val % BLOCK_SIZE]; + int initinexcept = inexcept; + System.arraycopy(in, inexcept, buf, 0, in.length - inexcept); + int l = (size / BLOCK_SIZE) * BLOCK_SIZE; + for (; j < l; j += BLOCK_SIZE) { + VectorBitPacker.fastunpack(buf, inexcept - initinexcept, + dataTobePacked[k], j, k); + inexcept += INTS_PER_BLOCK * k; + } + int r = size % BLOCK_SIZE; + inexcept = VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k], + j, r, k); + } + } + } + Arrays.fill(dataPointers, 0); + int tmpoutpos = outpos.get(); + int tmpinpos = inpos.get(); + int idx = 0; + for (int run = 0, run_end = thissize / BLOCK_SIZE; run < run_end; + ++run, tmpoutpos += BLOCK_SIZE) { + final int b = bem[idx]; // byteContainer.get(); + final int cexcept = bem[idx + 1] & 0xFF; // byteContainer.get() & 0xFF; + VectorBitPacker.fastunpack(in, tmpinpos, out, tmpoutpos, b); + tmpinpos += INTS_PER_BLOCK * b; + if (cexcept > 0) { + final int maxbits = bem[idx + 2]; // byteContainer.get(); + idx += 3; + final int index = maxbits - b; + if (index == 1) { + for (int k = 0; k < cexcept; ++k) { + final int pos = bem[idx++] & 0xFF; // byteContainer.get() & 0xFF; + out[pos + tmpoutpos] |= 1 << b; + } + } else { + for (int k = 0; k < cexcept; ++k) { + final int pos = bem[idx++] & 0xFF; // byteContainer.get() & 0xFF; + final int exceptvalue = + dataTobePacked[index][dataPointers[index]++]; + out[pos + tmpoutpos] |= exceptvalue << b; + } + } + } else { + idx += 2; + } + } + outpos.set(tmpoutpos); + inpos.set(inexcept); + } + + @Override + public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, + IntWrapper outpos) { + inlength = inlength - inlength % BLOCK_SIZE; + if (inlength == 0) + return; + out[outpos.get()] = inlength; + outpos.increment(); + headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void 
uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, + IntWrapper outpos) { + if (inlength == 0) + return; + final int outlength = in[inpos.get()]; + inpos.increment(); + headlessUncompress(in, inpos, inlength, out, outpos, outlength); + } + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + + /** + * Creates a new buffer of the requested size. + * + * In case you need a different way to allocate buffers, you can override this + * method with a custom behavior. The default implementation allocates a new + * Java direct + * {@link ByteBuffer} on each invocation. + */ + protected ByteBuffer makeBuffer(int sizeInBytes) { + return ByteBuffer.allocateDirect(sizeInBytes); + } +} diff --git a/src/main/java/me/lemire/longcompression/ByteLongCODEC.java b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java new file mode 100644 index 0000000..dbc6864 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java @@ -0,0 +1,62 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * Interface describing a CODEC to compress longs to bytes. + * + * @author Benoit Lacelle + * + */ +public interface ByteLongCODEC { + /** + * Compress data from an array to another array. + * + * Both inpos and outpos are modified to represent how much data was + * read and written to. If 12 longs (inlength = 12) are compressed to 3 + * bytes, then inpos will be incremented by 12 while outpos will be + * incremented by 3. We use IntWrapper to pass the values by reference. 
+ * + * @param in + * input array + * @param inpos + * location in the input array + * @param inlength + * how many longs to compress + * @param out + * output array + * @param outpos + * where to write in the output array + */ + public void compress(long[] in, IntWrapper inpos, int inlength, + byte[] out, IntWrapper outpos); + + /** + * Uncompress data from an array to another array. + * + * Both inpos and outpos parameters are modified to indicate new + * positions after read/write. + * + * @param in + * array containing data in compressed form + * @param inpos + * where to start reading in the array + * @param inlength + * length of the compressed data (ignored by some + * schemes) + * @param out + * array where to write the compressed output + * @param outpos + * where to write the compressed output in out + */ + public void uncompress(byte[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos); + +} diff --git a/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java b/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java new file mode 100644 index 0000000..b21ef68 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java @@ -0,0 +1,11 @@ +package me.lemire.longcompression; + +/** + * This is just like LongCODEC, except that it indicates that delta coding is + * "integrated", so that you don't need a separate step for delta coding. 
+ * + * @author Benoit Lacelle + */ +public interface IntegratedLongCODEC extends LongCODEC { + +} diff --git a/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java new file mode 100644 index 0000000..35c1166 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java @@ -0,0 +1,189 @@ +package me.lemire.longcompression; + +import java.util.Arrays; + +import me.lemire.integercompression.BinaryPacking; +import me.lemire.integercompression.Composition; +import me.lemire.integercompression.IntCompressor; +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.IntegerCODEC; +import me.lemire.integercompression.VariableByte; + +/** + * A {@link LongCODEC} which split each long in a highpart (32 first bits) and a low part (32 last bits). + * + * @author Benoit Lacelle + * + */ +public class LongAs2IntsCodec implements LongCODEC { + final IntegerCODEC highPartsCodec; + final IntegerCODEC lowPartsCodec; + + public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) { + this.highPartsCodec = highPartsCodec; + this.lowPartsCodec = lowPartsCodec; + } + + /** + * By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts to be small and positive + * integers. 
For lowParts, we rely on {@link IntCompressor} default IntegerCODEC + */ + public LongAs2IntsCodec() { + this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte())); + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + + int[] highParts = new int[inlength]; + int[] lowParts = new int[inlength]; + + for (int i = 0; i < inlength; i++) { + int inPosition = inpos.get() + i; + + highParts[i] = RoaringIntPacking.high(in[inPosition]); + lowParts[i] = RoaringIntPacking.low(in[inPosition]); + } + + // TODO What would be a relevant buffer size? + int[] buffer = new int[inlength * 16]; + + int outPosition = outpos.get(); + + boolean hasLeftover; + { + // The first integer is reserved to hold the number of compressed ints + IntWrapper highPartsOutPosition = new IntWrapper(1); + + highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition); + + // Record the compressedHighparts length + buffer[0] = highPartsOutPosition.get() - 1; + + for (int i = 0; i < highPartsOutPosition.get() / 2; i++) { + long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); + out[outPosition++] = pack; + } + + if (1 == highPartsOutPosition.get() % 2) { + // Shift the trailing integer as first in the buffer + hasLeftover = true; + buffer[0] = buffer[highPartsOutPosition.get() - 1]; + } else { + hasLeftover = false; + } + } + + { + // The first integer is reserved to hold the number of compressed ints + IntWrapper lowPartsOutPosition = new IntWrapper(1); + if (hasLeftover) { + // Keep the trailing int from highParts before the reserved int from lowParts compressed length + lowPartsOutPosition.set(2); + } + + lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition); + + // Record the compressedHighparts length + buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 
2 : 1); + + for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) { + long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); + out[outPosition++] = pack; + } + + if (1 == lowPartsOutPosition.get() % 2) { + // The trailing integer is packed with a 0 + long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0); + out[outPosition++] = pack; + } + } + + inpos.add(inlength); + outpos.set(outPosition); + } + + /** + * inlength is ignored by this codec. We may rely on it instead of storing the compressedLowPart length + */ + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + + int longIndex = inpos.get(); + + int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]); + int[] compressedHighParts = new int[nbCompressedHighParts]; + + // !highPart as we just read the highPart for nbCompressedHighParts + boolean highPart = false; + for (int i = 0; i < nbCompressedHighParts; i++) { + int nextInt; + if (highPart) { + nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]); + } else { + nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]); + } + compressedHighParts[i] = nextInt; + + highPart = !highPart; + } + + // TODO What would be a relevant buffer size? 
+ int[] buffer = new int[inlength * 16]; + + IntWrapper highPartsOutPosition = new IntWrapper(); + highPartsCodec.uncompress(compressedHighParts, + new IntWrapper(), + compressedHighParts.length, + buffer, + highPartsOutPosition); + int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get()); + + // +1 as we initially read nbCompressedHighParts + int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts; + int nbCompressedLowParts; + if (highPart) { + nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]); + } else { + nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]); + } + highPart = !highPart; + + int[] compressedLowParts = new int[nbCompressedLowParts]; + for (int i = 0; i < nbCompressedLowParts; i++) { + int nextInt; + if (highPart) { + nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); + } else { + nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); + } + compressedLowParts[i] = nextInt; + + highPart = !highPart; + } + + IntWrapper lowPartsOutPosition = new IntWrapper(); + lowPartsCodec.uncompress(compressedLowParts, + new IntWrapper(), + compressedLowParts.length, + buffer, + lowPartsOutPosition); + int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get()); + assert highParts.length == lowParts.length; + + int outposition = outpos.get(); + for (int i = 0; i < highParts.length; i++) { + out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]); + } + + inpos.add(inlength); + outpos.set(outposition); + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongBinaryPacking.java b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java new file mode 100644 index 0000000..b6ea58f --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java @@ -0,0 +1,153 @@ +package me.lemire.longcompression; + +import me.lemire.integercompression.BinaryPacking; +import 
me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.Util; + +/** + * Scheme based on a commonly used idea: can be extremely fast. + * It encodes longs in blocks of 64. For arrays containing + * an arbitrary number of longs, you should use it in conjunction + * with another CODEC: + * + *LongCODEC ic = + * new LongComposition(new LongBinaryPacking(), new LongVariableByte()).+ * + * Note that this does not use differential coding: if you are working on sorted + * lists, you must compute the deltas separately. + * + *
+ * For details, please see {@link BinaryPacking} + *
+ * + * @author Benoit Lacelle + */ +public final class LongBinaryPacking implements LongCODEC, SkippableLongCODEC { + public final static int BLOCK_SIZE = 64; + private static final int MAX_BIT_WIDTH = Long.SIZE; + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); + if (inlength == 0) + return; + out[outpos.get()] = inlength; + outpos.increment(); + headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); + int tmpoutpos = outpos.get(); + int s = inpos.get(); + // Compress by block of 8 * 64 longs as much as possible + for (; s + BLOCK_SIZE * 8 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 8) { + // maxbits can be anything between 0 and 64 included: expressed within a byte (1 << 6) + final int mbits1 = LongUtil.maxbits(in, s + 0 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits2 = LongUtil.maxbits(in, s + 1 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits3 = LongUtil.maxbits(in, s + 2 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits4 = LongUtil.maxbits(in, s + 3 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits5 = LongUtil.maxbits(in, s + 4 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits6 = LongUtil.maxbits(in, s + 5 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits7 = LongUtil.maxbits(in, s + 6 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits8 = LongUtil.maxbits(in, s + 7 * BLOCK_SIZE, BLOCK_SIZE); + // The first long expressed the maxbits for the 8 buckets + out[tmpoutpos++] = ((long) mbits1 << 56) | ((long) mbits2 << 48) | ((long) mbits3 << 40) | ((long) mbits4 << 32) | (mbits5 << 24) | (mbits6 << 16) | (mbits7 << 8) | (mbits8); + LongBitPacking.fastpackwithoutmask(in, s + 0 * BLOCK_SIZE, out, tmpoutpos, (int) mbits1); + tmpoutpos += mbits1; + LongBitPacking.fastpackwithoutmask(in, s + 1 * 
BLOCK_SIZE, out, tmpoutpos, (int) mbits2); + tmpoutpos += mbits2; + LongBitPacking.fastpackwithoutmask(in, s + 2 * BLOCK_SIZE, out, tmpoutpos, (int) mbits3); + tmpoutpos += mbits3; + LongBitPacking.fastpackwithoutmask(in, s + 3 * BLOCK_SIZE, out, tmpoutpos, (int) mbits4); + tmpoutpos += mbits4; + LongBitPacking.fastpackwithoutmask(in, s + 4 * BLOCK_SIZE, out, tmpoutpos, (int) mbits5); + tmpoutpos += mbits5; + LongBitPacking.fastpackwithoutmask(in, s + 5 * BLOCK_SIZE, out, tmpoutpos, (int) mbits6); + tmpoutpos += mbits6; + LongBitPacking.fastpackwithoutmask(in, s + 6 * BLOCK_SIZE, out, tmpoutpos, (int) mbits7); + tmpoutpos += mbits7; + LongBitPacking.fastpackwithoutmask(in, s + 7 * BLOCK_SIZE, out, tmpoutpos, (int) mbits8); + tmpoutpos += mbits8; + } + // Then we compress up to 7 blocks of 64 longs + for (; s < inpos.get() + inlength; s += BLOCK_SIZE ) { + final int mbits = LongUtil.maxbits(in, s, BLOCK_SIZE); + out[tmpoutpos++] = mbits; + LongBitPacking.fastpackwithoutmask(in, s, out, tmpoutpos, mbits); + tmpoutpos += mbits; + } + inpos.add(inlength); + outpos.set(tmpoutpos); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) + return; + final int outlength = (int) in[inpos.get()]; + inpos.increment(); + headlessUncompress(in,inpos, inlength,out,outpos,outlength); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos, int num) { + final int outlength = Util.greatestMultiple(num, BLOCK_SIZE); + int tmpinpos = inpos.get(); + int s = outpos.get(); + for (; s + BLOCK_SIZE * 8 - 1 < outpos.get() + outlength; s += BLOCK_SIZE * 8) { + final int mbits1 = (int) ((in[tmpinpos] >>> 56)); + final int mbits2 = (int) ((in[tmpinpos] >>> 48) & 0xFF); + final int mbits3 = (int) ((in[tmpinpos] >>> 40) & 0xFF); + final int mbits4 = (int) ((in[tmpinpos] >>> 32) & 0xFF); + final int mbits5 = (int) ((in[tmpinpos] >>> 24) & 
0xFF); + final int mbits6 = (int) ((in[tmpinpos] >>> 16) & 0xFF); + final int mbits7 = (int) ((in[tmpinpos] >>> 8) & 0xFF); + final int mbits8 = (int) ((in[tmpinpos]) & 0xFF); + ++tmpinpos; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 0 * BLOCK_SIZE, mbits1); + tmpinpos += mbits1; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 1 * BLOCK_SIZE, mbits2); + tmpinpos += mbits2; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 2 * BLOCK_SIZE, mbits3); + tmpinpos += mbits3; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 3 * BLOCK_SIZE, mbits4); + tmpinpos += mbits4; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 4 * BLOCK_SIZE, mbits5); + tmpinpos += mbits5; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 5 * BLOCK_SIZE, mbits6); + tmpinpos += mbits6; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 6 * BLOCK_SIZE, mbits7); + tmpinpos += mbits7; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 7 * BLOCK_SIZE, mbits8); + tmpinpos += mbits8; + } + for (; s < outpos.get() + outlength; s += BLOCK_SIZE ) { + final int mbits = (int) in[tmpinpos]; + ++tmpinpos; + LongBitPacking.fastunpack(in, tmpinpos, out, s, mbits); + tmpinpos += mbits; + } + outpos.add(outlength); + inpos.set(tmpinpos); + } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int blockCount = inlength / BLOCK_SIZE; + int headersSizeInLongs = blockCount / Long.BYTES + (blockCount % Long.BYTES); + int blocksSizeInLongs = blockCount * MAX_BIT_WIDTH; + compressedPositions.add(blockCount * BLOCK_SIZE); + return headersSizeInLongs + blocksSizeInLongs; + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } +} diff --git a/src/main/java/me/lemire/longcompression/LongBitPacking.java b/src/main/java/me/lemire/longcompression/LongBitPacking.java new file mode 100644 index 0000000..2d282ec --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongBitPacking.java @@ -0,0 +1,146 @@ +/** + * This code 
is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import java.util.Arrays; + +/** + * Bitpacking routines + * + *For details, please see
+ *+ * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second + * through vectorization Software: Practice & Experience + * http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract + * http://arxiv.org/abs/1209.2137 + *
/**
 * Bit-packing routines working on blocks of 64 longs.
 *
 * A block packed with a width of {@code bit} bits occupies exactly {@code bit} longs.
 *
 * @author Benoit Lacelle
 */
public final class LongBitPacking {

    /**
     * Pack 64 longs.
     *
     * Values are not masked: bits set above position {@code bit} leak into neighbouring
     * values.
     *
     * @param in
     *                source array
     * @param inpos
     *                position in source array
     * @param out
     *                output array
     * @param outpos
     *                position in output array
     * @param bit
     *                number of bits to use per long, from 0 to 64
     * @throws IllegalArgumentException
     *                 if {@code bit} is outside [0, 64]
     */
    public static void fastpackwithoutmask(final long[] in, final int inpos,
            final long[] out, final int outpos, final int bit) {
        if (bit < 0 || bit > 64) {
            throw new IllegalArgumentException("Unsupported bit width: " + bit);
        }
        switch (bit) {
        case 0:
            fastpackwithoutmask0(in, inpos, out, outpos);
            break;
        case 64:
            fastpackwithoutmask64(in, inpos, out, outpos);
            break;
        default:
            slowpackwithoutmask(in, inpos, out, outpos, bit);
            break;
        }
    }

    /** Width 0: there is nothing to store. */
    protected static void fastpackwithoutmask0(final long[] in, int inpos,
            final long[] out, int outpos) {
        // nothing
    }

    /** Width 64: the block is copied verbatim. */
    protected static void fastpackwithoutmask64(final long[] in, int inpos,
            final long[] out, int outpos) {
        System.arraycopy(in, inpos, out, outpos, 64);
    }

    /**
     * Generic packing for widths strictly between 0 and 64. Values are laid out one after
     * the other starting at the least significant bit of each output long; a value that
     * does not fit in the current long continues in the low bits of the next one.
     */
    protected static void slowpackwithoutmask(final long[] in, int inpos,
            final long[] out, int outpos, final int bit) {
        int word = outpos; // output long currently being filled
        int offset = 0;    // bit position at which the next value starts

        out[word] = 0L;
        for (int k = 0; k < 64; k++) {
            if (offset >= 64) {
                word++;
                offset -= 64;
                out[word] = 0L;

                if (offset > 0) {
                    // The previous value overflowed: store its upper bits first.
                    out[word] |= in[inpos + k - 1] >> (bit - offset);
                }
            }
            out[word] |= in[inpos + k] << offset;
            offset += bit;
        }
    }

    /**
     * Unpack the 64 longs.
     *
     * @param in
     *                source array
     * @param inpos
     *                starting point in the source array
     * @param out
     *                output array
     * @param outpos
     *                starting point in the output array
     * @param bit
     *                how many bits were used per long, from 0 to 64
     * @throws IllegalArgumentException
     *                 if {@code bit} is outside [0, 64]
     */
    public static void fastunpack(final long[] in, final int inpos,
            final long[] out, final int outpos, final int bit) {
        if (bit < 0 || bit > 64) {
            throw new IllegalArgumentException("Unsupported bit width: " + bit);
        }
        switch (bit) {
        case 0:
            fastunpack0(in, inpos, out, outpos);
            break;
        case 64:
            fastunpack64(in, inpos, out, outpos);
            break;
        default:
            slowunpack(in, inpos, out, outpos, bit);
            break;
        }
    }

    /** Width 0: all 64 values are zero. */
    protected static void fastunpack0(final long[] in, int inpos,
            final long[] out, int outpos) {
        Arrays.fill(out, outpos, outpos + 64, 0);
    }

    /** Width 64: the block is copied verbatim. */
    protected static void fastunpack64(final long[] in, int inpos,
            final long[] out, int outpos) {
        System.arraycopy(in, inpos, out, outpos, 64);
    }

    /**
     * Generic unpacking for widths strictly between 0 and 64; mirror image of
     * {@link #slowpackwithoutmask}.
     */
    protected static void slowunpack(final long[] in, int inpos,
            final long[] out, int outpos, final int bit) {
        final long mask = (1L << bit) - 1;
        int word = inpos; // input long currently being read
        int offset = 0;   // bit position at which the next value starts

        for (int k = 0; k < 64; k++) {
            if (offset >= 64) {
                word++;
                offset -= 64;

                if (offset > 0) {
                    // The previous value continues here: add its missing upper bits.
                    out[outpos + k - 1] |= (in[word] << (bit - offset)) & mask;
                }
            }
            out[outpos + k] = (in[word] >>> offset) & mask;
            offset += bit;
        }
    }
}
If 12 longs (inlength = 12) are compressed to 3 + * longs, then inpos will be incremented by 12 while outpos will be + * incremented by 3. We use IntWrapper to pass the values by reference. + * + * @param in + * input array + * @param inpos + * where to start reading in the array + * @param inlength + * how many longs to compress + * @param out + * output array + * @param outpos + * where to write in the output array + */ + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos); + + /** + * Uncompress data from an array to another array. + * + * Both inpos and outpos parameters are modified to indicate new + * positions after read/write. + * + * @param in + * array containing data in compressed form + * @param inpos + * where to start reading in the array + * @param inlength + * length of the compressed data (ignored by some + * schemes) + * @param out + * array where to write the uncompressed output + * @param outpos + * where to start writing the uncompressed output in out + */ + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos); + +} diff --git a/src/main/java/me/lemire/longcompression/LongComposition.java b/src/main/java/me/lemire/longcompression/LongComposition.java new file mode 100644 index 0000000..5111a51 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongComposition.java @@ -0,0 +1,71 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * Helper class to compose schemes. + * + * @author Benoit Lacelle + */ +public class LongComposition implements LongCODEC { + LongCODEC F1, F2; + + /** + * Compose a scheme from a first one (f1) and a second one (f2). 
The + * first one is called first and then the second one tries to compress + * whatever remains from the first run. + * + * By convention, the first scheme should be such that if, during + * decoding, a 32-bit zero is first encountered, then there is no + * output. + * + * @param f1 + * first codec + * @param f2 + * second codec + */ + public LongComposition(LongCODEC f1, LongCODEC f2) { + F1 = f1; + F2 = f2; + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + int inposInit = inpos.get(); + int outposInit = outpos.get(); + F1.compress(in, inpos, inlength, out, outpos); + if (outpos.get() == outposInit) { + out[outposInit] = 0; + outpos.increment(); + } + inlength -= inpos.get() - inposInit; + F2.compress(in, inpos, inlength, out, outpos); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) + return; + final int init = inpos.get(); + F1.uncompress(in, inpos, inlength, out, outpos); + inlength -= inpos.get() - init; + F2.uncompress(in, inpos, inlength, out, outpos); + } + + @Override + public String toString() { + return F1.toString() + " + " + F2.toString(); + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongCompressor.java b/src/main/java/me/lemire/longcompression/LongCompressor.java new file mode 100644 index 0000000..246647f --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongCompressor.java @@ -0,0 +1,68 @@ +package me.lemire.longcompression; + +import java.util.Arrays; + +import me.lemire.integercompression.IntWrapper; + +/** + * This is a convenience class that wraps a codec to provide + * a "friendly" API. + * + * @author Benoit Lacelle + */ +public class LongCompressor { + + SkippableLongCODEC codec; + + /** + * Constructor wrapping a codec. 
+ * + * @param c the underlying codec + */ + public LongCompressor(SkippableLongCODEC c) { + codec = c; + } + + /** + * Constructor with default codec. + */ + public LongCompressor() { + codec = new SkippableLongComposition(new LongBinaryPacking(), + new LongVariableByte()); + } + + /** + * Compress an array and returns the compressed result as a new array. + * + * @param input array to be compressed + * @return compressed array + */ + public long[] compress(long[] input) { + int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length); + long[] compressed = new long[maxCompressedLength + 1]; // +1 to store the length of the input + // Store at index=0 the length of the input, hence enabling .headlessCompress + compressed[0] = input.length; + IntWrapper outpos = new IntWrapper(1); + codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos); + compressed = Arrays.copyOf(compressed,outpos.intValue()); + return compressed; + } + + /** + * Uncompress an array and returns the uncompressed result as a new array. + * + * @param compressed compressed array + * @return uncompressed array + */ + public long[] uncompress(long[] compressed) { + // Read at index=0 the length of the input, hence enabling .headlessUncompress + long[] decompressed = new long[(int) compressed[0]]; + IntWrapper inpos = new IntWrapper(1); + codec.headlessUncompress(compressed, inpos, + compressed.length - inpos.intValue(), + decompressed, new IntWrapper(0), + decompressed.length); + return decompressed; + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongJustCopy.java b/src/main/java/me/lemire/longcompression/LongJustCopy.java new file mode 100644 index 0000000..95abc1e --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongJustCopy.java @@ -0,0 +1,58 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * @author Benoit lacelle + * + */ +public final class LongJustCopy implements LongCODEC, SkippableLongCODEC { + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + System.arraycopy(in, inpos.get(), out, outpos.get(), inlength); + inpos.add(inlength); + outpos.add(inlength); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + headlessUncompress(in,inpos,inlength,out,outpos,inlength); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos, int num) { + System.arraycopy(in, inpos.get(), out, outpos.get(), num); + inpos.add(num); + outpos.add(num); + + } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + compressedPositions.add(inlength); + return inlength; + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + headlessCompress(in,inpos,inlength,out,outpos); + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongUtil.java b/src/main/java/me/lemire/longcompression/LongUtil.java new file mode 100644 index 0000000..7bdce83 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongUtil.java @@ -0,0 +1,52 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
/**
 * These are unofficial helpers related to long compression
 *
 * @author Benoit Lacelle
 *
 */
@Deprecated
public class LongUtil {

    /**
     * Number of bits needed to represent every value of a range, i.e. the largest
     * {@link #bits(long)} over the range. A negative value makes the result 64; an empty
     * range gives 0.
     *
     * @param i
     *            source array
     * @param pos
     *            starting position
     * @param length
     *            number of longs to consider
     * @return the bit width of the range
     */
    public static int maxbits(long[] i, int pos, int length) {
        final int end = pos + length;
        // OR-ing everything together keeps the highest set bit of the whole range.
        long combined = 0L;
        for (int index = pos; index < end; index++) {
            combined |= i[index];
        }
        return bits(combined);
    }

    /**
     * Number of bits needed to represent a value: the position of its highest set bit,
     * counted from 1, or 0 for the value 0.
     *
     * @param i
     *            source value
     * @return the bit width of the value
     */
    public static int bits(long i) {
        return Long.SIZE - Long.numberOfLeadingZeros(i);
    }

    /** Binary representation of {@code l}, left-padded with zeros to 64 digits. */
    protected static String longToBinaryWithLeading(long l) {
        final String digits = Long.toBinaryString(l);
        final StringBuilder padded = new StringBuilder(Long.SIZE);
        for (int k = digits.length(); k < Long.SIZE; k++) {
            padded.append('0');
        }
        return padded.append(digits).toString();
    }
}
+ * + * @author Benoit Lacelle + */ +public class LongVariableByte implements LongCODEC, ByteLongCODEC, SkippableLongCODEC { + private static final int MAX_BYTES_PER_INT = 10; + + private static byte extract7bits(int i, long val) { + return (byte) ((val >>> (7 * i)) & ((1 << 7) - 1)); + } + + private static byte extract7bitsmaskless(int i, long val) { + return (byte) ((val >>> (7 * i))); + } + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + if (inlength == 0) + return; + // Worst case: we write 10 bytes per long, hence 2 longs for a long, hence 16 bytes per long + ByteBuffer buf = makeBuffer(inlength * 16); + buf.order(ByteOrder.LITTLE_ENDIAN); + for (int k = inpos.get(); k < inpos.get() + inlength; ++k) { + final long val = in[k]; + if (val >= 0 && val < (1 << 7)) { + buf.put((byte) (val | (1 << 7))); + } else if (val >= 0 && val < (1 << 14)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) (extract7bitsmaskless(1, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1 << 21)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) (extract7bitsmaskless(2, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1 << 28)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) (extract7bitsmaskless(3, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 35)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) (extract7bitsmaskless(4, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 42)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) 
extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) (extract7bitsmaskless(5, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 49)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) (extract7bitsmaskless(6, (val)) | (1 << 7))); + } else if (val >= 0 && val < (1L << 56)) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) (extract7bitsmaskless(7, (val)) | (1 << 7))); + } else if (val >= 0) { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) extract7bits(7, val)); + buf.put((byte) (extract7bitsmaskless(8, (val)) | (1 << 7))); + } else { + buf.put((byte) extract7bits(0, val)); + buf.put((byte) extract7bits(1, val)); + buf.put((byte) extract7bits(2, val)); + buf.put((byte) extract7bits(3, val)); + buf.put((byte) extract7bits(4, val)); + buf.put((byte) extract7bits(5, val)); + buf.put((byte) extract7bits(6, val)); + buf.put((byte) extract7bits(7, val)); + buf.put((byte) extract7bits(8, val)); + buf.put((byte) (extract7bitsmaskless(9, (val)) | (1 << 7))); + } + } + while (buf.position() % 8 != 0) + buf.put((byte) 0); + final int length = buf.position(); + buf.flip(); + LongBuffer ibuf = buf.asLongBuffer(); + ibuf.get(out, outpos.get(), length / 8); + 
outpos.add(length / 8); + inpos.add(inlength); + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, byte[] out, + IntWrapper outpos) { + if (inlength == 0) + return; + int outpostmp = outpos.get(); + for (int k = inpos.get(); k < inpos.get() + inlength; ++k) { + final long val = in[k]; + if (val >= 0 && val < (1 << 7)) { + out[outpostmp++] = (byte) (val | (1 << 7)); + } else if (val >= 0 && val < (1 << 14)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(1, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1 << 21)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(2, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1 << 28)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(3, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 35)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(4, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 42)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(5, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 49)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + 
out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(6, (val)) | (1 << 7)); + } else if (val >= 0 && val < (1L << 56)) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(7, (val)) | (1 << 7)); + } else if (val >= 0) { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) extract7bits(7, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(8, (val)) | (1 << 7)); + } else { + out[outpostmp++] = (byte) extract7bits(0, val); + out[outpostmp++] = (byte) extract7bits(1, val); + out[outpostmp++] = (byte) extract7bits(2, val); + out[outpostmp++] = (byte) extract7bits(3, val); + out[outpostmp++] = (byte) extract7bits(4, val); + out[outpostmp++] = (byte) extract7bits(5, val); + out[outpostmp++] = (byte) extract7bits(6, val); + out[outpostmp++] = (byte) extract7bits(7, val); + out[outpostmp++] = (byte) extract7bits(8, val); + out[outpostmp++] = (byte) (extract7bitsmaskless(9, (val)) | (1 << 7)); + } + } + outpos.set(outpostmp); + inpos.add(inlength); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + int s = 0; + long val = 0; + int p = inpos.get(); + int finalp = inpos.get() + inlength; + int tmpoutpos = outpos.get(); + for (long v = 0, shift = 
0; p < finalp;) { + val = in[p]; + long c = (byte) (val >>> s); + // Shift to next byte + s += 8; + // Shift to next long if s==64 + p += s>>6; + // Cycle from 63 to 0 + s = s & 63; + v += ((c & 127) << shift); + if ((c & 128) == 128) { + out[tmpoutpos++] = v; + v = 0; + shift = 0; + } else + shift += 7; + assert shift < 64; + } + outpos.set(tmpoutpos); + inpos.add(inlength); + } + + @Override + public void uncompress(byte[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + int p = inpos.get(); + int finalp = inpos.get() + inlength; + int tmpoutpos = outpos.get(); + for (long v = 0; p < finalp; out[tmpoutpos++] = v) { + v = in[p] & 0x7F; + if (in[p] < 0) { + p += 1; + continue; + } + v = ((in[p + 1] & 0x7F) << 7) | v; + if (in[p + 1] < 0) { + p += 2; + continue; + } + v = ((in[p + 2] & 0x7F) << 14) | v; + if (in[p + 2] < 0 ) { + p += 3; + continue; + } + v = ((in[p + 3] & 0x7F) << 21) | v; + if (in[p + 3] < 0) { + p += 4; + continue; + } + v = (((long) in[p + 4] & 0x7F) << 28) | v; + if (in[p + 4] < 0) { + p += 5; + continue; + } + v = (((long) in[p + 5] & 0x7F) << 35) | v; + if (in[p + 5] < 0) { + p += 6; + continue; + } + v = (((long) in[p + 6] & 0x7F) << 42) | v; + if (in[p + 6] < 0) { + p += 7; + continue; + } + v = (((long) in[p + 7] & 0x7F) << 49) | v; + if (in[p + 7] < 0) { + p += 8; + continue; + } + v = (((long) in[p + 8] & 0x7F) << 56) | v; + if (in[p + 8] < 0) { + p += 9; + continue; + } + v = (((long) in[p + 9] & 0x7F) << 63) | v; + p += 10; + } + outpos.set(tmpoutpos); + inpos.add(p); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num) { + int s = 0; + long val = 0; + int p = inpos.get(); + int tmpoutpos = outpos.get(); + int finaloutpos = num + tmpoutpos; + for (long v = 0, shift = 0; tmpoutpos < finaloutpos;) { + val = in[p]; + long c = val >>> s; + // Shift 
to next byte + s += 8; + // Shift to next long if s == 64 + p += s>>6; + // Cycle from 63 to 0 + s = s & 63; + v += ((c & 127) << shift); + if ((c & 128) == 128) { + out[tmpoutpos++] = v; + v = 0; + shift = 0; + } else + shift += 7; + assert shift < 64; + } + outpos.set(tmpoutpos); + inpos.set(p + (s!=0 ? 1 : 0)); + } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int maxLengthInBytes = inlength * MAX_BYTES_PER_INT; + int maxLengthInLongs = (maxLengthInBytes + Long.BYTES - 1) / Long.BYTES; + compressedPositions.add(inlength); + return maxLengthInLongs; + } + + /** + * Creates a new buffer of the requested size. + * + * In case you need a different way to allocate buffers, you can override this method + * with a custom behavior. The default implementation allocates a new Java direct + * {@link ByteBuffer} on each invocation. + */ + protected ByteBuffer makeBuffer(int sizeInBytes) { + return ByteBuffer.allocateDirect(sizeInBytes); + } +} diff --git a/src/main/java/me/lemire/longcompression/RoaringIntPacking.java b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java new file mode 100644 index 0000000..d6b6baa --- /dev/null +++ b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java @@ -0,0 +1,46 @@ +/* + * (c) the authors Licensed under the Apache License, Version 2.0. + */ +package me.lemire.longcompression; + +/** + * Used to hold the logic packing 2 integers in a long, and separating a long in two integers. 
/**
 * Logic for packing two ints into one long and for splitting a long back into its two
 * 32-bit halves.
 *
 * @author Benoit Lacelle
 *
 */
// Duplicated from RoaringBitmap
class RoaringIntPacking {

    /**
     *
     * @param id any long, positive or negative
     * @return an int holding the 32 highest order bits of information of the input long
     */
    public static int high(long id) {
        final long upperHalf = id >> Integer.SIZE;
        return (int) upperHalf;
    }

    /**
     *
     * @param id any long, positive or negative
     * @return an int holding the 32 lowest order bits of information of the input long
     */
    public static int low(long id) {
        // The narrowing cast keeps exactly the low 32 bits.
        return (int) id;
    }

    /**
     *
     * @param high an integer representing the highest order bits of the output long
     * @param low an integer representing the lowest order bits of the output long
     * @return a long packing together the integers as computed by
     *         {@link RoaringIntPacking#high(long)} and {@link RoaringIntPacking#low(long)}
     */
    // https://stackoverflow.com/questions/12772939/java-storing-two-ints-in-a-long
    public static long pack(int high, int low) {
        final long upperHalf = ((long) high) << Integer.SIZE;
        // Mask so that a negative low int does not sign-extend into the upper half.
        final long lowerHalf = low & 0xffffffffL;
        return upperHalf | lowerHalf;
    }
}
This is a + * variation on the LongCODEC interface meant to be used for random access + * (i.e., given a large array, you can segment it and decode just the subarray you need). + * + * The main difference is that we must specify the number of longs we wish to + * decode. This information should be stored elsewhere. + * + * This interface was designed by the Terrier team for their search engine. + * + * @author Benoit Lacelle + * + */ +public interface SkippableLongCODEC { + /** + * Compress data from an array to another array. + * + * Both inpos and outpos are modified to represent how much data was read + * and written to. If 12 longs (inlength = 12) are compressed to 3 longs, then + * inpos will be incremented by 12 while outpos will be incremented by 3. We + * use IntWrapper to pass the values by reference. + * + * @param in + * input array + * @param inpos + * where to start reading in the array + * @param inlength + * how many longs to compress + * @param out + * output array + * @param outpos + * where to write in the output array + */ + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos); + + /** + * Uncompress data from an array to another array. + * + * Both inpos and outpos parameters are modified to indicate new positions + * after read/write.
+ * + * @param in + * array containing data in compressed form + * @param inpos + * where to start reading in the array + * @param inlength + * length of the compressed data (ignored by some schemes) + * @param out + * array where to write the uncompressed output + * @param outpos + * where to start writing the uncompressed output in out + * @param num + * number of longs we want to decode, the actual number of longs decoded can be less + */ + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num); + + /** + * Compute the maximum number of longs that might be required to store + * the compressed form of a given input array segment, without headers. + * <p> + * This is useful to pre-allocate the output buffer before calling + * {@link #headlessCompress(long[], IntWrapper, int, long[], IntWrapper)}. + * </p>
+ * + * @param compressedPositions + * since not all schemes compress every input long, this parameter + * returns how many input longs will actually be compressed. + * This is useful when composing multiple schemes. + * @param inlength + * number of longs to be compressed + * @return the maximum number of longs needed in the output array + */ + int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength); +} diff --git a/src/main/java/me/lemire/longcompression/SkippableLongComposition.java b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java new file mode 100644 index 0000000..eb03b72 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java @@ -0,0 +1,82 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.longcompression; + +import me.lemire.integercompression.IntWrapper; + +/** + * Helper class to compose schemes. + * + * @author Benoit Lacelle + */ +public class SkippableLongComposition implements SkippableLongCODEC { + SkippableLongCODEC F1, F2; + + /** + * Compose a scheme from a first one (f1) and a second one (f2). The first + * one is called first and then the second one tries to compress whatever + * remains from the first run. + * + * By convention, the first scheme should be such that if, during decoding, + * a zero word (one long) is first encountered, then there is no output.
+ * + * @param f1 + * first codec, applied before f2 + * @param f2 + * second codec, applied to the input that f1 left unprocessed + */ + public SkippableLongComposition(SkippableLongCODEC f1, + SkippableLongCODEC f2) { + F1 = f1; + F2 = f2; + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos) { + int init = inpos.get(); + int outposInit = outpos.get(); + F1.headlessCompress(in, inpos, inlength, out, outpos); + if (outpos.get() == outposInit) { + out[outposInit] = 0; + outpos.increment(); + } + inlength -= inpos.get() - init; + F2.headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out, + IntWrapper outpos, int num) { + int init = inpos.get(); + int outposInit = outpos.get(); + + F1.headlessUncompress(in, inpos, inlength, out, outpos, num); + if (inpos.get() == init) { + inpos.increment(); + } + inlength -= inpos.get() - init; + num -= outpos.get() - outposInit; + F2.headlessUncompress(in, inpos, inlength, out, outpos, num); + } + + @Override + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { + int init = compressedPositions.get(); + int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength); + maxLength += 1; // Add +1 for the potential F2 header. Question: is this header actually needed in the headless version?
+ inlength -= compressedPositions.get() - init; + maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength); + return maxLength; + } + + @Override + public String toString() { + return F1.toString() + "+" + F2.toString(); + } + +} diff --git a/src/main/java/me/lemire/longcompression/differential/LongDelta.java b/src/main/java/me/lemire/longcompression/differential/LongDelta.java new file mode 100644 index 0000000..8399f94 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/differential/LongDelta.java @@ -0,0 +1,150 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression.differential; + +/** + * Generic class to compute differential coding. + * + * @author Benoit Lacelle + * + */ +public final class LongDelta { + + /** + * Apply differential coding (in-place). + * + * @param data + * data to be modified + */ + public static void delta(long[] data) { + for (int i = data.length - 1; i > 0; --i) { + data[i] -= data[i - 1]; + } + } + + /** + * Apply differential coding (in-place) given an initial value. + * + * @param data + * data to be modified + * @param start + * starting index + * @param length + * number of longs to process + * @param init + * initial value (a long, so the value returned by a previous call can be passed back) + * @return next initial value + */ + public static long delta(long[] data, int start, int length, long init) { + final long nextinit = data[start + length - 1]; + for (int i = length - 1; i > 0; --i) { + data[start + i] -= data[start + i - 1]; + } + data[start] -= init; + return nextinit; + } + + /** + * Compute differential coding given an initial value. Output is written + * to a provided array: must have length "length" or better.
+ * + * @param data + * input data (only read; the differences are written to out) + * @param start + * starting index + * @param length + * number of longs to process + * @param init + * initial value (a long, so the value returned by a previous call can be passed back) + * @param out + * output array + * @return next initial value + */ + public static long delta(long[] data, int start, int length, long init, + long[] out) { + for (int i = length - 1; i > 0; --i) { + out[i] = data[start + i] - data[start + i - 1]; + } + out[0] = data[start] - init; + return data[start + length - 1]; + } + + /** + * Undo differential coding (in-place). Effectively computes a prefix + * sum. + * + * @param data + * to be modified. + */ + public static void inverseDelta(long[] data) { + for (int i = 1; i < data.length; ++i) { + data[i] += data[i - 1]; + } + } + + /** + * Undo differential coding (in-place). Effectively computes a prefix + * sum. Like inverseDelta, only faster. + * + * @param data + * to be modified + */ + public static void fastinverseDelta(long[] data) { + int sz0 = data.length / 4 * 4; + int i = 1; + if (sz0 >= 4) { + long a = data[0]; + for (; i < sz0 - 4; i += 4) { + a = data[i] += a; + a = data[i + 1] += a; + a = data[i + 2] += a; + a = data[i + 3] += a; + } + } + + for (; i < data.length; ++i) { + data[i] += data[i - 1]; + } + } + + /** + * Undo differential coding (in-place). Effectively computes a prefix + * sum. Like inverseDelta, only faster. Uses an initial value.
+ * + * @param data + * to be modified + * @param start + * starting index + * @param length + * number of longs to process + * @param init + * initial value (a long, so the value returned by a previous call can be passed back) + * @return next initial value + */ + public static long fastinverseDelta(long[] data, int start, int length, + long init) { + data[start] += init; + int sz0 = length / 4 * 4; + int i = 1; + if (sz0 >= 4) { + long a = data[start]; + for (; i < sz0 - 4; i += 4) { + a = data[start + i] += a; + a = data[start + i + 1] += a; + a = data[start + i + 2] += a; + a = data[start + i + 3] += a; + } + } + + for (; i != length; ++i) { + data[start + i] += data[start + i - 1]; + } + return data[start + length - 1]; + } + +} diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java new file mode 100644 index 0000000..f134601 --- /dev/null +++ b/src/main/java/module-info.java @@ -0,0 +1,12 @@ +// Copyright (C) 2022 Intel Corporation + +// SPDX-License-Identifier: Apache-2.0 +module me.lemire.integercompression { + // This is currently only for advanced users: + // requires jdk.incubator.vector; + exports me.lemire.integercompression; + exports me.lemire.longcompression; + exports me.lemire.longcompression.differential; + exports me.lemire.integercompression.differential; + // exports me.lemire.integercompression.vector; +} diff --git a/src/test/java/me/lemire/integercompression/AdhocTest.java b/src/test/java/me/lemire/integercompression/AdhocTest.java index bced6c0..ee911b3 100644 --- a/src/test/java/me/lemire/integercompression/AdhocTest.java +++ b/src/test/java/me/lemire/integercompression/AdhocTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/.
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import org.junit.Assert; @@ -15,13 +22,48 @@ @SuppressWarnings({ "static-method" }) public class AdhocTest { - - /** - * - */ + @Test + public void testIssue59() { + FastPFOR128 fastpfor = new FastPFOR128(); + + int N = 9984; + int[] data = new int[N]; + for (var i = 0; i < N; i += 150) { + data[i] = i; + } + + int[] compressedoutput1 = new int[N + 1024]; + + IntWrapper inputoffset1 = new IntWrapper(0); + IntWrapper outputoffset1 = new IntWrapper(0); + + fastpfor.compress(data, inputoffset1, N, compressedoutput1, outputoffset1); + int compressedsize1 = outputoffset1.get(); + + int[] recovered1 = new int[N]; + inputoffset1 = new IntWrapper(0); + outputoffset1 = new IntWrapper(0); + fastpfor.uncompress(compressedoutput1, outputoffset1, compressedsize1, recovered1, inputoffset1); + Assert.assertArrayEquals(data, recovered1); + + int[] compressedoutput2 = new int[N + 1024]; + + IntWrapper inputoffset2 = new IntWrapper(0); + IntWrapper outputoffset2 = new IntWrapper(0); + + fastpfor.compress(data, inputoffset2, N, compressedoutput2, outputoffset2); + int compressedsize2 = outputoffset2.get(); + + int[] recovered2 = new int[N]; + inputoffset2 = new IntWrapper(0); + outputoffset2 = new IntWrapper(0); + fastpfor.uncompress(compressedoutput2, outputoffset2, compressedsize2, recovered2, inputoffset2); + Assert.assertArrayEquals(data, recovered2); + } + @Test public void testIssue29() { - for(int x = 0; x < 64; x++) { + for(int x = 0; x < 64; x++) { int[] a = {2, 3, 4, 5}; int[] b = new int[90]; int[] c = new int[a.length]; @@ -35,7 +77,7 @@ public void testIssue29() { IntWrapper cOffset = new IntWrapper(0); codec.uncompress(b, bOffset, len, c, cOffset); Assert.assertArrayEquals(a,c); - } + } } /** @@ -43,20 +85,20 @@ public void testIssue29() { */ @Test public void testIssue29b() { - for(int x = 0; x < 64; x++) { - int[] a = {2, 3, 4, 5}; - int[] b = new int[90]; - int[] c = new 
int[a.length]; - SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte()); - IntWrapper aOffset = new IntWrapper(0); - IntWrapper bOffset = new IntWrapper(x); - codec.headlessCompress(a, aOffset, a.length, b, bOffset); - int len = bOffset.get() - x; - bOffset.set(x); - IntWrapper cOffset = new IntWrapper(0); - codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length); - Assert.assertArrayEquals(a,c); - } + for(int x = 0; x < 64; x++) { + SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte()); + int[] a = {2, 3, 4, 5}; + int[] b = new int[x + codec.maxHeadlessCompressedLength(new IntWrapper(0), a.length)]; + int[] c = new int[a.length]; + IntWrapper aOffset = new IntWrapper(0); + IntWrapper bOffset = new IntWrapper(x); + codec.headlessCompress(a, aOffset, a.length, b, bOffset); + int len = bOffset.get() - x; + bOffset.set(x); + IntWrapper cOffset = new IntWrapper(0); + codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length); + Assert.assertArrayEquals(a,c); + } } @@ -64,30 +106,27 @@ public void testIssue29b() { * */ @Test - public void testIssue41() { - for (int x = 0; x < 64; x++) { - int[] a = { 2, 3, 4, 5 }; - int[] b = new int[90]; - int[] c = new int[a.length]; - SkippableIntegratedIntegerCODEC codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(), - new IntegratedVariableByte()); - IntWrapper aOffset = new IntWrapper(0); - IntWrapper bOffset = new IntWrapper(x); - IntWrapper initValue = new IntWrapper(0); - - codec.headlessCompress(a, aOffset, a.length, b, bOffset, initValue); - int len = bOffset.get() - x; - bOffset.set(x); - IntWrapper cOffset = new IntWrapper(0); - initValue = new IntWrapper(0); - codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length, initValue); - Assert.assertArrayEquals(a, c); - } - } + public void testIssue41() { + for (int x = 0; x < 64; x++) { + SkippableIntegratedIntegerCODEC codec = new 
SkippableIntegratedComposition(new IntegratedBinaryPacking(), + new IntegratedVariableByte()); + int[] a = { 2, 3, 4, 5 }; + int[] b = new int[x + codec.maxHeadlessCompressedLength(new IntWrapper(0), a.length)]; + int[] c = new int[a.length]; + IntWrapper aOffset = new IntWrapper(0); + IntWrapper bOffset = new IntWrapper(x); + IntWrapper initValue = new IntWrapper(0); + + codec.headlessCompress(a, aOffset, a.length, b, bOffset, initValue); + int len = bOffset.get() - x; + bOffset.set(x); + IntWrapper cOffset = new IntWrapper(0); + initValue = new IntWrapper(0); + codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length, initValue); + Assert.assertArrayEquals(a, c); + } + } - /** - * a test - */ @Test public void biggerCompressedArray0() { // No problem: for comparison. @@ -95,12 +134,8 @@ public void biggerCompressedArray0() { assertSymmetry(c, 0, 16384); c = new Composition(new FastPFOR(), new VariableByte()); assertSymmetry(c, 0, 16384); - } - /** - * a test - */ @Test public void biggerCompressedArray1() { // Compressed array is bigger than original, because of VariableByte. @@ -108,9 +143,6 @@ public void biggerCompressedArray1() { assertSymmetry(c, -1); } - /** - * a test - */ @Test public void biggerCompressedArray2() { // Compressed array is bigger than original, because of Composition. diff --git a/src/test/java/me/lemire/integercompression/BasicTest.java b/src/test/java/me/lemire/integercompression/BasicTest.java index e88293e..b29ae0d 100644 --- a/src/test/java/me/lemire/integercompression/BasicTest.java +++ b/src/test/java/me/lemire/integercompression/BasicTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; @@ -22,7 +29,7 @@ */ @SuppressWarnings({ "static-method" }) public class BasicTest { - IntegerCODEC[] codecs = { + final IntegerCODEC[] codecs = { new IntegratedComposition(new IntegratedBinaryPacking(), new IntegratedVariableByte()), new JustCopy(), @@ -41,35 +48,35 @@ public class BasicTest { new GroupSimple9(), new Composition(new XorBinaryPacking(), new VariableByte()), new Composition(new DeltaZigzagBinaryPacking(), - new DeltaZigzagVariableByte()) }; + new DeltaZigzagVariableByte()) }; - /** - * + /** + * This tests with a compressed array with various offset */ - @Test - public void saulTest() { - for (IntegerCODEC C : codecs) { - for (int x = 0; x < 50; ++x) { - int[] a = { 2, 3, 4, 5 }; - int[] b = new int[90]; - int[] c = new int[a.length]; - - IntWrapper aOffset = new IntWrapper(0); - IntWrapper bOffset = new IntWrapper(x); - C.compress(a, aOffset, a.length, b, bOffset); - int len = bOffset.get() - x; - - bOffset.set(x); - IntWrapper cOffset = new IntWrapper(0); - C.uncompress(b, bOffset, len, c, cOffset); - if(!Arrays.equals(a, c)) { - System.out.println("Problem with "+C); - } - assertArrayEquals(a, c); - - } - } - } + @Test + public void saulTest() { + for (IntegerCODEC C : codecs) { + for (int x = 0; x < 50; ++x) { + int[] a = { 2, 3, 4, 5 }; + int[] b = new int[90]; + int[] c = new int[a.length]; + + IntWrapper aOffset = new IntWrapper(0); + IntWrapper bOffset = new IntWrapper(x); + C.compress(a, aOffset, a.length, b, bOffset); + int len = bOffset.get() - x; + + bOffset.set(x); + IntWrapper cOffset = new IntWrapper(0); + C.uncompress(b, bOffset, len, c, cOffset); + if(!Arrays.equals(a, c)) { + System.out.println("Problem with "+C); + } + assertArrayEquals(a, c); + + } + } + } /** * */ diff --git a/src/test/java/me/lemire/integercompression/BoundaryTest.java b/src/test/java/me/lemire/integercompression/BoundaryTest.java index 
ede2e9f..128b431 100644 --- a/src/test/java/me/lemire/integercompression/BoundaryTest.java +++ b/src/test/java/me/lemire/integercompression/BoundaryTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/integercompression/ByteBasicTest.java b/src/test/java/me/lemire/integercompression/ByteBasicTest.java index c2f5b6f..2b2d4f1 100644 --- a/src/test/java/me/lemire/integercompression/ByteBasicTest.java +++ b/src/test/java/me/lemire/integercompression/ByteBasicTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; @@ -21,32 +28,32 @@ public class ByteBasicTest { new IntegratedVariableByte(), }; - /** + /** * */ - @Test - public void saulTest() { - for (ByteIntegerCODEC C : codecs) { - for (int x = 0; x < 50 * 4; ++x) { - int[] a = { 2, 3, 4, 5 }; - byte[] b = new byte[90*4]; - int[] c = new int[a.length]; + @Test + public void saulTest() { + for (ByteIntegerCODEC C : codecs) { + for (int x = 0; x < 50 * 4; ++x) { + int[] a = { 2, 3, 4, 5 }; + byte[] b = new byte[90*4]; + int[] c = new int[a.length]; - IntWrapper aOffset = new IntWrapper(0); - IntWrapper bOffset = new IntWrapper(x); - C.compress(a, aOffset, a.length, b, bOffset); - int len = bOffset.get() - x; + IntWrapper aOffset = new IntWrapper(0); + IntWrapper bOffset = new IntWrapper(x); + C.compress(a, aOffset, a.length, b, bOffset); + int len = bOffset.get() - x; - bOffset.set(x); - IntWrapper cOffset = new IntWrapper(0); - C.uncompress(b, bOffset, len, c, cOffset); - if(!Arrays.equals(a, c)) { - System.out.println("Problem with "+C); - } - assertArrayEquals(a, c); - } - } - } + bOffset.set(x); 
+ IntWrapper cOffset = new IntWrapper(0); + C.uncompress(b, bOffset, len, c, cOffset); + if(!Arrays.equals(a, c)) { + System.out.println("Problem with "+C); + } + assertArrayEquals(a, c); + } + } + } /** * */ diff --git a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java index 5e0923d..ae42c1d 100644 --- a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java +++ b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java @@ -1,7 +1,10 @@ -/* +/** * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ */ + package me.lemire.integercompression; import org.junit.Test; diff --git a/src/test/java/me/lemire/integercompression/ExampleTest.java b/src/test/java/me/lemire/integercompression/ExampleTest.java index 300983c..c63c69b 100644 --- a/src/test/java/me/lemire/integercompression/ExampleTest.java +++ b/src/test/java/me/lemire/integercompression/ExampleTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import me.lemire.integercompression.differential.*; @@ -10,305 +17,303 @@ * */ public class ExampleTest { - /** - * - */ - @Test - - public void superSimpleExample() { - IntegratedIntCompressor iic = new IntegratedIntCompressor(); - int[] data = new int[2342351]; - for (int k = 0; k < data.length; ++k) - data[k] = k; - System.out.println("Compressing " + data.length + " integers using friendly interface"); - int[] compressed = iic.compress(data); - int[] recov = iic.uncompress(compressed); - System.out - .println("compressed from " + data.length * 4 / 1024 + "KB to " + compressed.length * 4 / 1024 + "KB"); - if (!Arrays.equals(recov, data)) - throw new RuntimeException("bug"); - } - - /** - * - */ - @Test - - public void basicExample() { - int[] data = new int[2342351]; - System.out.println("Compressing " + data.length + " integers in one go"); - // data should be sorted for best - // results - for (int k = 0; k < data.length; ++k) - data[k] = k; - // Very important: the data is in sorted order!!! If not, you - // will get very poor compression with IntegratedBinaryPacking, - // you should use another CODEC. - - // next we compose a CODEC. Most of the processing - // will be done with binary packing, and leftovers will - // be processed using variable byte - IntegratedIntegerCODEC codec = new IntegratedComposition(new IntegratedBinaryPacking(), - new IntegratedVariableByte()); - // output vector should be large enough... - int[] compressed = new int[data.length + 1024]; - // compressed might not be large enough in some cases - // if you get java.lang.ArrayIndexOutOfBoundsException, try - // allocating more memory - - /** - * - * compressing - * - */ - IntWrapper inputoffset = new IntWrapper(0); - IntWrapper outputoffset = new IntWrapper(0); - codec.compress(data, inputoffset, data.length, compressed, outputoffset); - // got it! 
- // inputoffset should be at data.length but outputoffset tells - // us where we are... - System.out.println( - "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); - // we can repack the data: (optional) - compressed = Arrays.copyOf(compressed, outputoffset.intValue()); - - /** - * - * now uncompressing - * - * This assumes that we otherwise know how many integers have been - * compressed. See basicExampleHeadless for a more general case. - */ - int[] recovered = new int[data.length]; - IntWrapper recoffset = new IntWrapper(0); - codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset); - if (Arrays.equals(data, recovered)) - System.out.println("data is recovered without loss"); - else - throw new RuntimeException("bug"); // could use assert - System.out.println(); - } - - /** - * Like the basicExample, but we store the input array size manually. - */ - @Test - public void basicExampleHeadless() { - int[] data = new int[2342351]; - System.out.println("Compressing " + data.length + " integers in one go using the headless approach"); - // data should be sorted for best - // results - for (int k = 0; k < data.length; ++k) - data[k] = k; - // Very important: the data is in sorted order!!! If not, you - // will get very poor compression with IntegratedBinaryPacking, - // you should use another CODEC. - - // next we compose a CODEC. Most of the processing - // will be done with binary packing, and leftovers will - // be processed using variable byte - SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(), - new IntegratedVariableByte()); - // output vector should be large enough... 
- int[] compressed = new int[data.length + 1024]; - // compressed might not be large enough in some cases - // if you get java.lang.ArrayIndexOutOfBoundsException, try - // allocating more memory - - /** - * - * compressing - * - */ - IntWrapper inputoffset = new IntWrapper(0); - IntWrapper outputoffset = new IntWrapper(1); - compressed[0] = data.length; // we manually store how many integers we - codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0)); - // got it! - // inputoffset should be at data.length but outputoffset tells - // us where we are... - System.out.println( - "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); - // we can repack the data: (optional) - compressed = Arrays.copyOf(compressed, outputoffset.intValue()); - - /** - * - * now uncompressing - * - */ - int howmany = compressed[0];// we manually stored the number of - // compressed integers - int[] recovered = new int[howmany]; - IntWrapper recoffset = new IntWrapper(0); - codec.headlessUncompress(compressed, new IntWrapper(1), compressed.length, recovered, recoffset, howmany, new IntWrapper(0)); - if (Arrays.equals(data, recovered)) - System.out.println("data is recovered without loss"); - else - throw new RuntimeException("bug"); // could use assert - System.out.println(); - } - - /** - * This is an example to show you can compress unsorted integers as long as - * most are small. 
- */ - @Test - public void unsortedExample() { - final int N = 1333333; - int[] data = new int[N]; - // initialize the data (most will be small - for (int k = 0; k < N; k += 1) - data[k] = 3; - // throw some larger values - for (int k = 0; k < N; k += 5) - data[k] = 100; - for (int k = 0; k < N; k += 533) - data[k] = 10000; - int[] compressed = new int[N + 1024];// could need more - IntegerCODEC codec = new Composition(new FastPFOR(), new VariableByte()); - // compressing - IntWrapper inputoffset = new IntWrapper(0); - IntWrapper outputoffset = new IntWrapper(0); - codec.compress(data, inputoffset, data.length, compressed, outputoffset); - System.out.println("compressed unsorted integers from " + data.length * 4 / 1024 + "KB to " - + outputoffset.intValue() * 4 / 1024 + "KB"); - // we can repack the data: (optional) - compressed = Arrays.copyOf(compressed, outputoffset.intValue()); - - int[] recovered = new int[N]; - IntWrapper recoffset = new IntWrapper(0); - codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset); - if (Arrays.equals(data, recovered)) - System.out.println("data is recovered without loss"); - else - throw new RuntimeException("bug"); // could use assert - System.out.println(); - - } - - /** - * This is like the basic example, but we show how to process larger arrays - * in chunks. - * - * Some of this code was written by Pavel Klinov. - */ - @Test - public void advancedExample() { - int TotalSize = 2342351; // some arbitrary number - int ChunkSize = 16384; // size of each chunk, choose a multiple of 128 - System.out.println("Compressing " + TotalSize + " integers using chunks of " + ChunkSize + " integers (" - + ChunkSize * 4 / 1024 + "KB)"); - System.out.println("(It is often better for applications to work in chunks fitting in CPU cache.)"); - int[] data = new int[TotalSize]; - // data should be sorted for best - // results - for (int k = 0; k < data.length; ++k) - data[k] = k; - // next we compose a CODEC. 
Most of the processing - // will be done with binary packing, and leftovers will - // be processed using variable byte, using variable byte - // only for the last chunk! - IntegratedIntegerCODEC regularcodec = new IntegratedBinaryPacking(); - IntegratedVariableByte ivb = new IntegratedVariableByte(); - IntegratedIntegerCODEC lastcodec = new IntegratedComposition(regularcodec, ivb); - // output vector should be large enough... - int[] compressed = new int[TotalSize + 1024]; - - /** - * - * compressing - * - */ - IntWrapper inputoffset = new IntWrapper(0); - IntWrapper outputoffset = new IntWrapper(0); - for (int k = 0; k < TotalSize / ChunkSize; ++k) - regularcodec.compress(data, inputoffset, ChunkSize, compressed, outputoffset); - lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset); - // got it! - // inputoffset should be at data.length but outputoffset tells - // us where we are... - System.out.println( - "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); - // we can repack the data: - compressed = Arrays.copyOf(compressed, outputoffset.intValue()); - - /** - * - * now uncompressing - * - * We are *not* assuming that the original array length is known, - * however we assume that the chunk size (ChunkSize) is known. 
- * - */ - int[] recovered = new int[ChunkSize]; - IntWrapper compoff = new IntWrapper(0); - IntWrapper recoffset; - int currentpos = 0; - - while (compoff.get() < compressed.length) { - recoffset = new IntWrapper(0); - regularcodec.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset); - - if (recoffset.get() < ChunkSize) {// last chunk detected - ivb.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset); - } - for (int i = 0; i < recoffset.get(); ++i) { - if (data[currentpos + i] != recovered[i]) - throw new RuntimeException("bug"); // could use assert - } - currentpos += recoffset.get(); - } - System.out.println("data is recovered without loss"); - System.out.println(); - - } - - /** - * Demo of the headless approach where we must supply the array length - */ - @Test - public void headlessDemo() { - System.out.println("Compressing arrays with minimal header..."); - int[] uncompressed1 = { 1, 2, 1, 3, 1 }; - int[] uncompressed2 = { 3, 2, 4, 6, 1 }; - - int[] compressed = new int[uncompressed1.length + uncompressed2.length + 1024]; - - SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte()); - - // compressing - IntWrapper outPos = new IntWrapper(); - - IntWrapper previous = new IntWrapper(); - - codec.headlessCompress(uncompressed1, new IntWrapper(), uncompressed1.length, compressed, outPos); - int length1 = outPos.get() - previous.get(); - previous = new IntWrapper(outPos.get()); - codec.headlessCompress(uncompressed2, new IntWrapper(), uncompressed2.length, compressed, outPos); - int length2 = outPos.get() - previous.get(); - - compressed = Arrays.copyOf(compressed, length1 + length2); - System.out - .println("compressed unsorted integers from " + uncompressed1.length * 4 + "B to " + length1 * 4 + "B"); - System.out - .println("compressed unsorted integers from " + uncompressed2.length * 4 + "B to " + length2 * 4 + "B"); - System.out.println("Total 
compressed output " + compressed.length); - - int[] recovered1 = new int[uncompressed1.length]; - int[] recovered2 = new int[uncompressed1.length]; - IntWrapper inPos = new IntWrapper(); - System.out.println("Decoding first array starting at pos = " + inPos); - codec.headlessUncompress(compressed, inPos, compressed.length, recovered1, new IntWrapper(0), - uncompressed1.length); - System.out.println("Decoding second array starting at pos = " + inPos); - codec.headlessUncompress(compressed, inPos, compressed.length, recovered2, new IntWrapper(0), - uncompressed2.length); - if (!Arrays.equals(uncompressed1, recovered1)) - throw new RuntimeException("First array does not match."); - if (!Arrays.equals(uncompressed2, recovered2)) - throw new RuntimeException("Second array does not match."); - System.out.println("The arrays match, your code is probably ok."); - - } + /** + * + */ + @Test + + public void superSimpleExample() { + IntegratedIntCompressor iic = new IntegratedIntCompressor(); + int[] data = new int[2342351]; + for (int k = 0; k < data.length; ++k) + data[k] = k; + System.out.println("Compressing " + data.length + " integers using friendly interface"); + int[] compressed = iic.compress(data); + int[] recov = iic.uncompress(compressed); + System.out + .println("compressed from " + data.length * 4 / 1024 + "KB to " + compressed.length * 4 / 1024 + "KB"); + if (!Arrays.equals(recov, data)) + throw new RuntimeException("bug"); + } + + /** + * + */ + @Test + + public void basicExample() { + int[] data = new int[2342351]; + System.out.println("Compressing " + data.length + " integers in one go"); + // data should be sorted for best + // results + for (int k = 0; k < data.length; ++k) + data[k] = k; + // Very important: the data is in sorted order!!! If not, you + // will get very poor compression with IntegratedBinaryPacking, + // you should use another CODEC. + + // next we compose a CODEC. 
Most of the processing + // will be done with binary packing, and leftovers will + // be processed using variable byte + IntegratedIntegerCODEC codec = new IntegratedComposition(new IntegratedBinaryPacking(), + new IntegratedVariableByte()); + // output vector should be large enough... + int[] compressed = new int[data.length + 1024]; + // compressed might not be large enough in some cases + // if you get java.lang.ArrayIndexOutOfBoundsException, try + // allocating more memory + + /** + * + * compressing + * + */ + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + codec.compress(data, inputoffset, data.length, compressed, outputoffset); + // got it! + // inputoffset should be at data.length but outputoffset tells + // us where we are... + System.out.println( + "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + /** + * + * now uncompressing + * + * This assumes that we otherwise know how many integers have been + * compressed. See basicExampleHeadless for a more general case. + */ + int[] recovered = new int[data.length]; + IntWrapper recoffset = new IntWrapper(0); + codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset); + if (Arrays.equals(data, recovered)) + System.out.println("data is recovered without loss"); + else + throw new RuntimeException("bug"); // could use assert + System.out.println(); + } + + /** + * Like the basicExample, but we store the input array size manually. + */ + @Test + public void basicExampleHeadless() { + int[] data = new int[2342351]; + System.out.println("Compressing " + data.length + " integers in one go using the headless approach"); + // data should be sorted for best + // results + for (int k = 0; k < data.length; ++k) + data[k] = k; + // Very important: the data is in sorted order!!! 
If not, you + // will get very poor compression with IntegratedBinaryPacking, + // you should use another CODEC. + + // next we compose a CODEC. Most of the processing + // will be done with binary packing, and leftovers will + // be processed using variable byte + SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(), + new IntegratedVariableByte()); + int[] compressed = new int[codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length)]; + + /** + * + * compressing + * + */ + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(1); + compressed[0] = data.length; // we manually store how many integers we + codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0)); + // got it! + // inputoffset should be at data.length but outputoffset tells + // us where we are... + System.out.println( + "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + /** + * + * now uncompressing + * + */ + int howmany = compressed[0];// we manually stored the number of + // compressed integers + int[] recovered = new int[howmany]; + IntWrapper recoffset = new IntWrapper(0); + codec.headlessUncompress(compressed, new IntWrapper(1), compressed.length, recovered, recoffset, howmany, new IntWrapper(0)); + if (Arrays.equals(data, recovered)) + System.out.println("data is recovered without loss"); + else + throw new RuntimeException("bug"); // could use assert + System.out.println(); + } + + /** + * This is an example to show you can compress unsorted integers as long as + * most are small. 
+ */ + @Test + public void unsortedExample() { + final int N = 1333333; + int[] data = new int[N]; + // initialize the data (most will be small + for (int k = 0; k < N; k += 1) + data[k] = 3; + // throw some larger values + for (int k = 0; k < N; k += 5) + data[k] = 100; + for (int k = 0; k < N; k += 533) + data[k] = 10000; + int[] compressed = new int[N + 1024];// could need more + IntegerCODEC codec = new Composition(new FastPFOR(), new VariableByte()); + // compressing + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + codec.compress(data, inputoffset, data.length, compressed, outputoffset); + System.out.println("compressed unsorted integers from " + data.length * 4 / 1024 + "KB to " + + outputoffset.intValue() * 4 / 1024 + "KB"); + // we can repack the data: (optional) + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + int[] recovered = new int[N]; + IntWrapper recoffset = new IntWrapper(0); + codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset); + if (Arrays.equals(data, recovered)) + System.out.println("data is recovered without loss"); + else + throw new RuntimeException("bug"); // could use assert + System.out.println(); + + } + + /** + * This is like the basic example, but we show how to process larger arrays + * in chunks. + * + * Some of this code was written by Pavel Klinov. + */ + @Test + public void advancedExample() { + int TotalSize = 2342351; // some arbitrary number + int ChunkSize = 16384; // size of each chunk, choose a multiple of 128 + System.out.println("Compressing " + TotalSize + " integers using chunks of " + ChunkSize + " integers (" + + ChunkSize * 4 / 1024 + "KB)"); + System.out.println("(It is often better for applications to work in chunks fitting in CPU cache.)"); + int[] data = new int[TotalSize]; + // data should be sorted for best + // results + for (int k = 0; k < data.length; ++k) + data[k] = k; + // next we compose a CODEC. 
Most of the processing + // will be done with binary packing, and leftovers will + // be processed using variable byte, using variable byte + // only for the last chunk! + IntegratedIntegerCODEC regularcodec = new IntegratedBinaryPacking(); + IntegratedVariableByte ivb = new IntegratedVariableByte(); + IntegratedIntegerCODEC lastcodec = new IntegratedComposition(regularcodec, ivb); + // output vector should be large enough... + int[] compressed = new int[TotalSize + 1024]; + + /** + * + * compressing + * + */ + IntWrapper inputoffset = new IntWrapper(0); + IntWrapper outputoffset = new IntWrapper(0); + for (int k = 0; k < TotalSize / ChunkSize; ++k) + regularcodec.compress(data, inputoffset, ChunkSize, compressed, outputoffset); + lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset); + // got it! + // inputoffset should be at data.length but outputoffset tells + // us where we are... + System.out.println( + "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB"); + // we can repack the data: + compressed = Arrays.copyOf(compressed, outputoffset.intValue()); + + /** + * + * now uncompressing + * + * We are *not* assuming that the original array length is known, + * however we assume that the chunk size (ChunkSize) is known. 
+ * + */ + int[] recovered = new int[ChunkSize]; + IntWrapper compoff = new IntWrapper(0); + IntWrapper recoffset; + int currentpos = 0; + + while (compoff.get() < compressed.length) { + recoffset = new IntWrapper(0); + regularcodec.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset); + + if (recoffset.get() < ChunkSize) {// last chunk detected + ivb.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset); + } + for (int i = 0; i < recoffset.get(); ++i) { + if (data[currentpos + i] != recovered[i]) + throw new RuntimeException("bug"); // could use assert + } + currentpos += recoffset.get(); + } + System.out.println("data is recovered without loss"); + System.out.println(); + + } + + /** + * Demo of the headless approach where we must supply the array length + */ + @Test + public void headlessDemo() { + System.out.println("Compressing arrays with minimal header..."); + int[] uncompressed1 = { 1, 2, 1, 3, 1 }; + int[] uncompressed2 = { 3, 2, 4, 6, 1 }; + + SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte()); + + int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed1.length) + + codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed2.length); + int[] compressed = new int[maxCompressedLength]; + + // compressing + IntWrapper outPos = new IntWrapper(); + + IntWrapper previous = new IntWrapper(); + + codec.headlessCompress(uncompressed1, new IntWrapper(), uncompressed1.length, compressed, outPos); + int length1 = outPos.get() - previous.get(); + previous = new IntWrapper(outPos.get()); + codec.headlessCompress(uncompressed2, new IntWrapper(), uncompressed2.length, compressed, outPos); + int length2 = outPos.get() - previous.get(); + + compressed = Arrays.copyOf(compressed, length1 + length2); + System.out + .println("compressed unsorted integers from " + uncompressed1.length * 4 + "B to " + length1 * 4 + "B"); 
+ System.out + .println("compressed unsorted integers from " + uncompressed2.length * 4 + "B to " + length2 * 4 + "B"); + System.out.println("Total compressed output " + compressed.length); + + int[] recovered1 = new int[uncompressed1.length]; + int[] recovered2 = new int[uncompressed1.length]; + IntWrapper inPos = new IntWrapper(); + System.out.println("Decoding first array starting at pos = " + inPos); + codec.headlessUncompress(compressed, inPos, compressed.length, recovered1, new IntWrapper(0), + uncompressed1.length); + System.out.println("Decoding second array starting at pos = " + inPos); + codec.headlessUncompress(compressed, inPos, compressed.length, recovered2, new IntWrapper(0), + uncompressed2.length); + if (!Arrays.equals(uncompressed1, recovered1)) + throw new RuntimeException("First array does not match."); + if (!Arrays.equals(uncompressed2, recovered2)) + throw new RuntimeException("Second array does not match."); + System.out.println("The arrays match, your code is probably ok."); + + } } diff --git a/src/test/java/me/lemire/integercompression/IntCompressorTest.java b/src/test/java/me/lemire/integercompression/IntCompressorTest.java index 34b8946..79e51fc 100644 --- a/src/test/java/me/lemire/integercompression/IntCompressorTest.java +++ b/src/test/java/me/lemire/integercompression/IntCompressorTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.Arrays; diff --git a/src/test/java/me/lemire/integercompression/ResourcedTest.java b/src/test/java/me/lemire/integercompression/ResourcedTest.java index 61b8e58..8316129 100644 --- a/src/test/java/me/lemire/integercompression/ResourcedTest.java +++ b/src/test/java/me/lemire/integercompression/ResourcedTest.java @@ -1,3 +1,10 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + package me.lemire.integercompression; import java.util.ArrayList; @@ -17,65 +24,65 @@ * */ public class ResourcedTest { - SkippableIntegerCODEC[] codecs = { new JustCopy(), new VariableByte(), - new SkippableComposition(new BinaryPacking(), new VariableByte()), - new SkippableComposition(new NewPFD(), new VariableByte()), - new SkippableComposition(new NewPFDS9(), new VariableByte()), - new SkippableComposition(new NewPFDS16(), new VariableByte()), - new SkippableComposition(new OptPFD(), new VariableByte()), - new SkippableComposition(new OptPFDS9(), new VariableByte()), - new SkippableComposition(new OptPFDS16(), new VariableByte()), - new SkippableComposition(new FastPFOR128(), new VariableByte()), - new SkippableComposition(new FastPFOR(), new VariableByte()), new Simple9(), new Simple16() }; + SkippableIntegerCODEC[] codecs = { new JustCopy(), new VariableByte(), + new SkippableComposition(new BinaryPacking(), new VariableByte()), + new SkippableComposition(new NewPFD(), new VariableByte()), + new SkippableComposition(new NewPFDS9(), new VariableByte()), + new SkippableComposition(new NewPFDS16(), new VariableByte()), + new SkippableComposition(new OptPFD(), new VariableByte()), + new SkippableComposition(new OptPFDS9(), new VariableByte()), + new SkippableComposition(new OptPFDS16(), new VariableByte()), + new SkippableComposition(new FastPFOR128(), new VariableByte()), 
+ new SkippableComposition(new FastPFOR(), new VariableByte()), new Simple9(), new Simple16() }; - /** - * @throws IOException - * if the resource cannot be accessed (should be considered a - * bug) - * - */ - @Test - public void IntCompressorTest() throws IOException { - // next line requires Java8? - // int[] data = - // Files.lines(Paths.get("integers.txt")).mapToInt(Integer::parseInt).toArray(); - File f = new File("src/test/resources/integers.txt"); - System.out.println("loading test data from "+ f.getAbsolutePath()); - BufferedReader bfr = new BufferedReader(new FileReader(f)); - String line; - ArrayList