diff --git a/.github/release-settings.xml b/.github/release-settings.xml
new file mode 100644
index 0000000..be56a53
--- /dev/null
+++ b/.github/release-settings.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 https://maven.apache.org/xsd/settings-1.0.0.xsd">
+    <pluginGroups>
+        <pluginGroup>eu.maveniverse.maven.plugins</pluginGroup>
+    </pluginGroups>
+
+    <servers>
+        <server>
+            <id>sonatype-central-portal</id>
+            <username>${env.MAVEN_USER}</username> <!-- resolved from the MAVEN_USER environment variable -->
+            <password>${env.MAVEN_PASSWORD}</password> <!-- resolved from the MAVEN_PASSWORD environment variable -->
+            <configuration>
+                <njord.publisher>sonatype-cp</njord.publisher>
+                <njord.releaseUrl>njord:template:release-sca</njord.releaseUrl>
+            </configuration>
+        </server>
+    </servers>
+
+</settings>
diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml
new file mode 100644
index 0000000..7f12ed7
--- /dev/null
+++ b/.github/workflows/basic.yml
@@ -0,0 +1,27 @@
+# Continuous integration: builds and tests the library with Maven, then
+# compiles and runs example.java against the freshly built classes.
+name: Java CI
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        java: [ 21 ]
+    steps:
+      - uses: actions/checkout@v4.1.1
+      - name: Set up JDK
+        uses: actions/setup-java@v4.1.0
+        with:
+          java-version: ${{ matrix.java }}
+          # 'adopt' (AdoptOpenJDK) is no longer updated; Temurin is its successor.
+          distribution: 'temurin'
+      - name: Build and test with Maven
+        run: mvn package
+      - name: Build example
+        run: javac -cp target/classes/:. example.java
+      - name: Run example
+        run: java -cp target/classes/:. example
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..d6ad167
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,105 @@
+# Manually triggered release workflow: prepares the release with the Maven
+# Release Plugin, deploys the artifacts, and creates a GitHub release.
+name: Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      releaseVersion:
+        description: "Release version, e.g. 0.3.6 (optional — auto-detected from the current POM)"
+        required: false
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # to automatically create tags
+
+    steps:
+      # The user-supplied version reaches the script through an environment
+      # variable instead of being expanded into the script text, so a crafted
+      # input cannot inject shell commands.
+      - name: Validate release version
+        if: ${{ github.event.inputs.releaseVersion != '' }}
+        env:
+          RELEASE: ${{ github.event.inputs.releaseVersion }}
+        run: |
+          if [[ ! $RELEASE =~ ^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$ ]]; then
+            echo "Error: releaseVersion '$RELEASE' is not in the correct format x.y.z or x.y.z-SNAPSHOT"
+            exit 1
+          fi
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: master
+
+      - name: Set up Java
+        uses: actions/setup-java@v4
+        with:
+          java-version: '21'
+          # 'adopt' (AdoptOpenJDK) is no longer updated; Temurin is its successor.
+          distribution: 'temurin'
+          gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
+          # Name of the environment variable holding the passphrase, not the passphrase itself.
+          gpg-passphrase: MAVEN_GPG_PASSPHRASE
+
+      - name: Configure git
+        run: |
+          git config user.email "actions@github.com"
+          git config user.name "GitHub Actions"
+
+      - name: Prepare Release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          RELEASE_VERSION: ${{ github.event.inputs.releaseVersion }}
+        run: |
+          # Pass -DreleaseVersion only when a version was supplied as workflow input.
+          MVN_ARGS=()
+          if [ -n "$RELEASE_VERSION" ]; then
+            MVN_ARGS+=("-DreleaseVersion=$RELEASE_VERSION")
+          fi
+          mvn -B release:prepare "${MVN_ARGS[@]}"
+
+      - name: Check release.properties
+        run: |
+          if [ ! -f release.properties ]; then
+            echo "release.properties not found"
+            exit 1
+          fi
+          echo "Contents of release.properties:"
+          cat release.properties
+
+      - name: Determine release version
+        id: version
+        run: |
+          # The tag is read from release.properties; the version is the tag
+          # without its "JavaFastPFOR-" prefix.
+          TAG=$(grep 'scm.tag=' release.properties | cut -d'=' -f2)
+          VERSION=${TAG#JavaFastPFOR-}
+
+          echo "released_tag=${TAG}" >> "$GITHUB_OUTPUT"
+          echo "released_version=${VERSION}" >> "$GITHUB_OUTPUT"
+
+          echo "Releasing tag: ${TAG}"
+          echo "Releasing version: ${VERSION}"
+
+      - name: Release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          MAVEN_GPG_KEY: ${{ secrets.GPG_PRIVATE_KEY }}
+          # MAVEN_USER / MAVEN_PASSWORD are referenced by .github/release-settings.xml.
+          MAVEN_USER: ${{ secrets.MAVEN_USER }}
+          MAVEN_PASSWORD: ${{ secrets.MAVEN_PASSWORD }}
+        run: |
+          mvn -B release:perform -Darguments="-DskipTests -DaltDeploymentRepository=id::default::njord: -Dnjord.autoPublish=true -Dnjord.publishingType=automatic" -s .github/release-settings.xml
+
+      - name: Create GitHub Release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASED_TAG: ${{ steps.version.outputs.released_tag }}
+          RELEASED_VERSION: ${{ steps.version.outputs.released_version }}
+        run: gh release create "$RELEASED_TAG" --generate-notes --title "Version $RELEASED_VERSION"
diff --git a/.gitignore b/.gitignore
index 53960d2..5a78c84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .classpath
+.settings
 .project
 *.class
 *.csv
@@ -6,3 +7,5 @@ tags
 target/
 tmp/
 /bin
+.idea
+*.iml
diff --git a/README.md b/README.md
index 000695c..0246789 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,10 @@
 JavaFastPFOR: A simple integer compression library in Java 
 ==========================================================
- [![][maven img]][maven] [![][license img]][license] [![docs-badge][]][docs]
-[![Code Quality: Cpp](https://img.shields.io/lgtm/grade/java/g/lemire/JavaFastPFOR.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/lemire/JavaFastPFOR/context:java)
+[![](https://jitpack.io/v/fast-pack/JavaFastPFor.svg)](https://jitpack.io/#fast-pack/JavaFastPFor) [![][license img]][license] [![docs-badge][]][docs]
+[![Java CI](https://github.com/lemire/JavaFastPFOR/actions/workflows/basic.yml/badge.svg)](https://github.com/lemire/JavaFastPFOR/actions/workflows/basic.yml)
 
 
 
-License
--------
-
-This code is released under the
-Apache License Version 2.0 http://www.apache.org/licenses/.
-
 
 What does this do?
 ------------------
@@ -49,13 +43,50 @@ as well as in GMAP and GSNAP (http://research-pub.gene.com/gmap/).
 Usage
 ------
 
-Really simple usage:
 
 ```java
-        IntegratedIntCompressor iic = new IntegratedIntCompressor();
-        int[] data = ... ; // to be compressed
-        int[] compressed = iic.compress(data); // compressed array
-        int[] recov = iic.uncompress(compressed); // equals to data
+package org.example;
+
+import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.IntWrapper;
+
+import java.util.Arrays;
+
+public class Main {
+    public static void main(String[] args) {
+        FastPFOR128 fastpfor = new FastPFOR128();
+
+        int N = 9984;
+        int[] data = new int[N];
+        for (var i = 0; i < N; i += 150) { // only every 150th entry is set; the rest stay 0
+            data[i] = i;
+        }
+
+        int[] compressedoutput1 = new int[N + 1024]; // output buffer: N ints plus 1024 ints of headroom
+
+        IntWrapper inputoffset1 = new IntWrapper(0);
+        IntWrapper outputoffset1 = new IntWrapper(0);
+
+        fastpfor.compress(data, inputoffset1, N, compressedoutput1, outputoffset1);
+        int compressedsize1 = outputoffset1.get(); // output offset after compress() is used as the compressed length
+
+        int[] recovered1 = new int[N];
+        inputoffset1 = new IntWrapper(0);
+        outputoffset1 = new IntWrapper(0);
+        fastpfor.uncompress(compressedoutput1, outputoffset1, compressedsize1, recovered1, inputoffset1); // note: roles are swapped here - outputoffset1 indexes the compressed buffer, inputoffset1 indexes recovered1
+
+        // quick verification: count mismatches
+        int mismatches = 0;
+        for (int i = 0; i < N; i++) {
+            if (data[i] != recovered1[i]) mismatches++;
+        }
+
+        System.out.println("N=" + N + " compressedSizeWords=" + compressedsize1 + " mismatches=" + mismatches);
+        System.out.println("first 20 original: " + Arrays.toString(Arrays.copyOf(data, 20)));
+        System.out.println("first 20 recovered: " + Arrays.toString(Arrays.copyOf(recovered1, 20)));
+    }
+}
+
 ```
 
 For more examples, see example.java or the examples folder.
@@ -67,38 +98,83 @@ in sorted orders and use differential coding (they compress deltas).
 They can be found in the package me.lemire.integercompression.differential.
 Most others do not.
 
+The Java Team at Intel (R) introduced a vector implementation of FastPFOR
+based on the Java Vector API that shows significant gains over the
+non-vectorized implementation. For example usage, see
+examples/vector/Example.java. The feature requires JDK 19+ and is currently
+intended for advanced users.
 
-Maven central repository
+JavaFastPFOR as a dependency
 ------------------------
 
-Using this code in your own project is easy with maven, just add
-the following code in your pom.xml file:
+JavaFastPFOR is available both on Maven Central and JitPack, so you can easily 
+include it in your project using either source.
+
+We have a demo project using JavaFastPFOR as a dependency (both Maven and Gradle). See:
+
+https://github.com/fast-pack/JavaFastPFORDemo
+
+### Maven Central
+
+You can add JavaFastPFOR directly from Maven Central — no extra repository configuration needed:
+
+**Maven**
 
 ```xml
-    <dependencies>
-         <dependency>
-	     <groupId>me.lemire.integercompression</groupId>
-	     <artifactId>JavaFastPFOR</artifactId>
-	     <version>[0.1,)</version>
-         </dependency>
-     </dependencies>
+<dependency>
+    <groupId>me.lemire.integercompression</groupId>
+    <artifactId>JavaFastPFOR</artifactId>
+    <version>0.3.8</version>
+</dependency>
 ```
 
-Naturally, you should replace "version" by the version
-you desire.
+**Gradle (Groovy)**
 
+```groovy
+dependencies {
+    implementation 'me.lemire.integercompression:JavaFastPFOR:0.3.8'
+}
+```
 
+### JitPack
 
-You can also download JavaFastPFOR from the Maven central repository:
-http://repo1.maven.org/maven2/me/lemire/integercompression/JavaFastPFOR/
+If you prefer or need to use JitPack, you can include the dependency like this:
 
+**Maven**
 
-Why?
-----
+```xml
+<repositories>
+    <repository>
+        <id>jitpack.io</id>
+        <url>https://jitpack.io</url>
+    </repository>
+</repositories>
+
+<dependency>
+    <groupId>com.github.fast-pack</groupId>
+    <artifactId>JavaFastPFOR</artifactId>
+    <version>JavaFastPFOR-0.3.8</version>
+</dependency>
+```
+
+**Gradle (Groovy)**
+
+```groovy
+repositories {
+    mavenCentral()
+    maven {
+        url 'https://jitpack.io'
+    }
+}
+
+dependencies {
+    implementation 'com.github.fast-pack:JavaFastPFOR:JavaFastPFOR-0.3.8'
+}
+```
+
+Naturally, you should replace the version shown above (0.3.8) with the
+version you desire.
 
-We found no library that implemented state-of-the-art integer coding techniques
-such as Binary Packing, NewPFD, OptPFD, Variable Byte, Simple 9 and so on in Java.
-We wrote one. 
 
 Thread safety 
 ----
@@ -111,19 +187,6 @@ Nevertheless, if you want to reuse codec instances,
 note that by convention, unless the documentation of a codec specify
 that it is not thread-safe, then it can be assumed to be thread-safe.
 
-Authors
--------
-
-Main contributors
-* Daniel Lemire, http://lemire.me/en/
-* Muraoka Taro, https://github.com/koron
-
-with contributions by 
-* the Terrier team (Matteo Catena, Craig Macdonald, Saúl Vargas and Iadh Ounis)
-* Di Wu, http://www.facebook.com/diwu1989
-* Stefan Ackermann, https://github.com/Stivo
-* Samit Roy, https://github.com/roysamit
-
 How does it compare to the Kamikaze PForDelta library?
 ------------------------------------------------------
 
@@ -141,19 +204,16 @@ Reference:
 Requirements
 ------------
 
-A recent Java compiler. Java 7 or better is recommended.
+Releases up to 0.1.12 require Java 7 or better.
 
-Good instructions on installing Java 7 on Linux:
+The current development versions assume JDK 21 or better.
 
-http://forums.linuxmint.com/viewtopic.php?f=42&t=93052
 
 
 How fast is it?
 ---------------
 
-Compile the code and execute me.lemire.integercompression.benchmarktools.Benchmark.
-
-I recommend running all the benchmarks with the "-server" flag on a desktop machine.
+Compile the code and execute `me.lemire.integercompression.benchmarktools.Benchmark`.
 
 Speed is always reported in millions of integers per second.
 
@@ -161,11 +221,21 @@ Speed is always reported in millions of integers per second.
 For Maven users
 ---------------
 
-mvn compile
 
+```
+mvn compile
 mvn exec:java
+```
+
+You may run our examples as follows:
+
+```
+mvn package
+javac -cp target/classes/:. example.java
+java -cp target/classes/:. example
+```
 
-For ant users
+For ant users (legacy, currently untested)
 -------------
 
 If you use Apache ant, please try this:
@@ -182,6 +252,21 @@ API Documentation
 
 http://www.javadoc.io/doc/me.lemire.integercompression/JavaFastPFOR/
 
+
+Citing this work
+-----------------
+
+If you use JavaFastPFOR in your work, please consider citing the project. A recommended BibTeX entry is:
+
+```bibtex
+@misc{lemire2025_javafastpfor,
+  author = {Daniel Lemire},
+  title = {{JavaFastPFOR: A simple integer compression library in Java}},
+  year = {2025},
+  howpublished = {\url{https://github.com/fast-pack/JavaFastPFOR}},
+}
+```
+
 Want to read more?
 ------------------
 
@@ -202,13 +287,14 @@ We wrote several research papers documenting many of the CODECs implemented here
 Ikhtear Sharif wrote his M.Sc. thesis on this library:
 
 Ikhtear Sharif, Performance Evaluation of Fast Integer Compression Techniques Over Tables, M.Sc. thesis, UNB 2013.
-http://lemire.me/fr/documents/thesis/IkhtearThesis.pdf
+https://unbscholar.lib.unb.ca/islandora/object/unbscholar%3A9399/datastream/PDF/view
 
 He also posted his slides online: http://www.slideshare.net/ikhtearSharif/ikhtear-defense
 
 Other recommended libraries
 -----------------------------
 
+* Fast integer compression in Go: https://github.com/ronanh/intcomp
 * Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
 * CSharpFastPFOR: A C#  integer compression library  https://github.com/Genbox/CSharpFastPFOR
 * TurboPFor is a C library that offers lots of interesting optimizations and Java wrappers. Well worth checking! (Uses a GPL license.) https://github.com/powturbo/TurboPFor
@@ -219,8 +305,6 @@ Funding
 This work was supported by NSERC grant number 26143.
 
 
-[maven img]:https://maven-badges.herokuapp.com/maven-central/me.lemire.integercompression/JavaFastPFOR/badge.svg
-[maven]:http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22me.lemire.integercompression%22%20
 
 [license]:LICENSE
 [license img]:https://img.shields.io/badge/License-Apache%202-blue.svg
diff --git a/benchmarkresults/benchmarkresults_haswell_18sept2014.txt b/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
index 43fa98b..a501d5d 100644
--- a/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
+++ b/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
@@ -1,7 +1,7 @@
 # benchmark based on the ClusterData model from:
-# 	 Vo Ngoc Anh and Alistair Moffat. 
-#	 Index compression using 64-bit words.
-# 	 Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
+#      Vo Ngoc Anh and Alistair Moffat. 
+#     Index compression using 64-bit words.
+#      Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
 
 # Results will be written into a CSV file: benchmark-20140918T011257.csv
 
@@ -10,852 +10,852 @@
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.56	246	1061
+    2.56    246    1061
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.21	66	275
+    3.21    66    275
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.96	838	1679
+    2.96    838    1679
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1276	1805
+    32.00    1276    1805
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	490	509
+    8.00    490    509
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	582	774
+    8.00    582    774
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.96	765	1193
+    2.96    765    1193
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.88	139	896
+    2.88    139    896
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.90	166	905
+    2.90    166    905
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.88	139	898
+    2.88    139    898
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.87	25	938
+    2.87    25    938
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.90	29	960
+    2.90    29    960
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.87	25	882
+    2.87    25    882
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.63	274	1015
+    2.63    274    1015
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.83	280	771
+    2.83    280    771
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.84	444	837
+    2.84    444    837
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.27	498	652
+    3.27    498    652
 
 # sparsity 2
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.51	244	1048
+    3.51    244    1048
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.18	55	247
+    4.18    55    247
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.93	862	1611
+    3.93    862    1611
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1286	1816
+    32.00    1286    1816
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.01	486	508
+    8.01    486    508
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.01	575	763
+    8.01    575    763
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.93	774	1159
+    3.93    774    1159
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.83	118	865
+    3.83    118    865
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.86	141	875
+    3.86    141    875
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.83	118	867
+    3.83    118    867
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	18	881
+    3.82    18    881
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.85	22	887
+    3.85    22    887
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	18	838
+    3.82    18    838
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.58	273	990
+    3.58    273    990
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	201	656
+    3.82    201    656
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.90	442	819
+    3.90    442    819
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.50	494	640
+    4.50    494    640
 
 # sparsity 3
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.28	244	1030
+    4.28    244    1030
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.95	51	247
+    4.95    51    247
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.71	850	1577
+    4.71    850    1577
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1252	1769
+    32.00    1252    1769
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	478	504
+    8.02    478    504
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	573	762
+    8.02    573    762
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.71	770	1139
+    4.71    770    1139
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.60	107	850
+    4.60    107    850
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.63	127	863
+    4.63    127    863
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.60	107	853
+    4.60    107    853
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.59	14	865
+    4.59    14    865
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.62	18	882
+    4.62    18    882
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.59	14	844
+    4.59    14    844
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.34	268	969
+    4.34    268    969
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.72	170	610
+    4.72    170    610
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.68	434	783
+    4.68    434    783
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.33	472	624
+    5.33    472    624
 
 # sparsity 4
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.03	239	1004
+    5.03    239    1004
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.73	47	251
+    5.73    47    251
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.48	846	1556
+    5.48    846    1556
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1274	1799
+    32.00    1274    1799
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.12	439	486
+    8.12    439    486
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.12	537	715
+    8.12    537    715
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.48	769	1134
+    5.48    769    1134
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.36	95	817
+    5.36    95    817
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.39	115	838
+    5.39    115    838
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.36	96	827
+    5.36    96    827
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.34	12	842
+    5.34    12    842
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.37	16	871
+    5.37    16    871
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.34	12	803
+    5.34    12    803
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.09	268	963
+    5.09    268    963
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.57	150	587
+    5.57    150    587
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.47	432	800
+    5.47    432    800
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.16	491	635
+    6.16    491    635
 
 # sparsity 5
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.00	236	999
+    6.00    236    999
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.70	43	242
+    6.70    43    242
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.45	863	1584
+    6.45    863    1584
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1236	1792
+    32.00    1236    1792
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.40	369	452
+    8.40    369    452
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.40	486	617
+    8.40    486    617
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.45	777	1132
+    6.45    777    1132
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.33	86	808
+    6.33    86    808
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.36	103	828
+    6.36    103    828
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.33	86	813
+    6.33    86    813
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.31	9	825
+    6.31    9    825
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.34	13	858
+    6.34    13    858
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.31	9	819
+    6.31    9    819
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.06	265	945
+    6.06    265    945
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.65	139	546
+    6.65    139    546
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.45	442	804
+    6.45    442    804
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.18	493	635
+    7.18    493    635
 
 # sparsity 6
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.76	238	998
+    6.76    238    998
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.45	42	251
+    7.45    42    251
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.20	854	1525
+    7.20    854    1525
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1177	1663
+    32.00    1177    1663
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.10	259	362
+    9.10    259    362
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.10	380	450
+    9.10    380    450
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.20	718	1098
+    7.20    718    1098
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.08	79	786
+    7.08    79    786
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.11	95	821
+    7.11    95    821
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.08	81	814
+    7.08    81    814
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.06	8	836
+    7.06    8    836
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.09	11	860
+    7.09    11    860
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.06	8	822
+    7.06    8    822
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.81	268	962
+    6.81    268    962
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.56	129	509
+    7.56    129    509
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.19	433	789
+    7.19    433    789
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.93	491	632
+    7.93    491    632
 
 # sparsity 7
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.05	236	985
+    8.05    236    985
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.75	39	247
+    8.75    39    247
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.50	861	1526
+    8.50    861    1526
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1279	1788
+    32.00    1279    1788
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.11	190	305
+    10.11    190    305
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.11	311	355
+    10.11    311    355
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.50	753	1092
+    8.50    753    1092
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.37	71	792
+    8.37    71    792
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.40	83	804
+    8.40    83    804
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.37	72	805
+    8.37    72    805
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.35	7	808
+    8.35    7    808
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.38	10	835
+    8.38    10    835
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.35	7	796
+    8.35    7    796
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.10	259	920
+    8.10    259    920
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.16	111	447
+    9.16    111    447
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.52	435	784
+    8.52    435    784
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.32	485	622
+    9.32    485    622
 
 # sparsity 8
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.73	234	972
+    8.73    234    972
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.44	37	250
+    9.44    37    250
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.19	848	1493
+    9.19    848    1493
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1279	1858
+    32.00    1279    1858
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.04	167	307
+    11.04    167    307
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.04	309	353
+    11.04    309    353
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.19	751	1095
+    9.19    751    1095
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.06	67	770
+    9.06    67    770
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.09	78	781
+    9.09    78    781
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.06	68	792
+    9.06    68    792
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.03	6	795
+    9.03    6    795
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.07	9	824
+    9.07    9    824
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.03	6	787
+    9.03    6    787
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.78	266	936
+    8.78    266    936
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.34	101	427
+    10.34    101    427
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.21	437	794
+    9.21    437    794
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.01	488	626
+    10.01    488    626
 
 # sparsity 9
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.75	234	980
+    9.75    234    980
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.48	36	242
+    10.48    36    242
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.21	844	1474
+    10.21    844    1474
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1262	1795
+    32.00    1262    1795
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.18	145	300
+    12.18    145    300
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.18	302	340
+    12.18    302    340
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.21	761	1096
+    10.21    761    1096
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.08	63	786
+    10.08    63    786
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.12	72	752
+    10.12    72    752
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.08	63	783
+    10.08    63    783
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.05	6	787
+    10.05    6    787
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.09	8	798
+    10.09    8    798
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.05	6	779
+    10.05    6    779
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.80	264	930
+    9.80    264    930
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.77	92	410
+    11.77    92    410
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.23	438	789
+    10.23    438    789
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.05	486	624
+    11.05    486    624
 
 # sparsity 10
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.73	235	979
+    10.73    235    979
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.46	35	239
+    11.46    35    239
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.18	840	1456
+    11.18    840    1456
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1248	1746
+    32.00    1248    1746
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.14	135	312
+    13.14    135    312
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.14	309	354
+    13.14    309    354
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.18	761	1097
+    11.18    761    1097
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.09	59	802
+    11.09    59    802
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.13	69	814
+    11.13    69    814
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.09	59	771
+    11.09    59    771
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.04	5	783
+    11.04    5    783
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.08	8	816
+    11.08    8    816
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.04	5	776
+    11.04    5    776
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.78	265	934
+    10.78    265    934
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.98	89	415
+    12.98    89    415
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.20	436	787
+    11.20    436    787
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.02	483	620
+    12.02    483    620
 
 # sparsity 11
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.79	232	950
+    11.79    232    950
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.68	34	256
+    12.68    34    256
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.23	842	1450
+    12.23    842    1450
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1292	1826
+    32.00    1292    1826
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.00	127	324
+    14.00    127    324
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.00	308	369
+    14.00    308    369
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.23	760	1092
+    12.23    760    1092
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.35	56	795
+    12.35    56    795
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.38	65	829
+    12.38    65    829
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.35	57	822
+    12.35    57    822
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.13	5	706
+    12.13    5    706
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.17	7	750
+    12.17    7    750
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.13	5	712
+    12.13    5    712
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.83	261	919
+    11.83    261    919
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.17	85	401
+    14.17    85    401
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.25	436	781
+    12.25    436    781
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.08	489	623
+    13.08    489    623
 
 # sparsity 12
 # generating random data...
 # generating random data... ok.
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.70	226	932
+    12.70    226    932
 
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.76	34	261
+    13.76    34    261
 
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.16	849	1453
+    13.16    849    1453
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1208	1804
+    32.00    1208    1804
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.84	117	307
+    14.84    117    307
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.84	260	352
+    14.84    260    352
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.16	762	1095
+    13.16    762    1095
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.46	56	899
+    13.46    56    899
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.48	63	915
+    13.48    63    915
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.46	56	897
+    13.46    56    897
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.20	5	681
+    13.20    5    681
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.24	7	735
+    13.24    7    735
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.20	5	699
+    13.20    5    699
 
 # IntegratedFastPFOR + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.75	260	914
+    12.75    260    914
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	15.51	80	359
+    15.51    80    359
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.18	435	781
+    13.18    435    781
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.00	489	626
+    14.00    489    626
 
 
 Results were written into a CSV file: benchmark-20140918T011257.csv
diff --git a/benchmarkresults/benchmarkresults_icore7_10may2013.txt b/benchmarkresults/benchmarkresults_icore7_10may2013.txt
index 5b776fb..d10579e 100644
--- a/benchmarkresults/benchmarkresults_icore7_10may2013.txt
+++ b/benchmarkresults/benchmarkresults_icore7_10may2013.txt
@@ -3,610 +3,610 @@
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.34	51	262
+    3.34    51    262
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.09	639	1183
+    3.09    639    1183
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1151	1468
+    32.00    1151    1468
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	199	299
+    8.00    199    299
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	148	339
+    8.00    148    339
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.09	613	920
+    3.09    613    920
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.02	143	721
+    3.02    143    721
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.05	164	705
+    3.05    164    705
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.01	26	790
+    3.01    26    790
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.03	30	816
+    3.03    30    816
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.78	226	811
+    2.78    226    811
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.97	236	604
+    2.97    236    604
 
 # sparsity 2
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.17	47	266
+    4.17    47    266
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.92	672	1261
+    3.92    672    1261
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1218	1562
+    32.00    1218    1562
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	204	290
+    8.00    204    290
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	236	343
+    8.00    236    343
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.92	505	917
+    3.92    505    917
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	127	698
+    3.82    127    698
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.85	151	726
+    3.85    151    726
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.81	18	752
+    3.81    18    752
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.84	23	779
+    3.84    23    779
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.56	228	828
+    3.56    228    828
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	182	562
+    3.82    182    562
 
 # sparsity 3
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.96	43	276
+    4.96    43    276
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.72	662	1187
+    4.72    662    1187
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1215	1566
+    32.00    1215    1566
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	198	286
+    8.02    198    286
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	254	340
+    8.02    254    340
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.72	576	848
+    4.72    576    848
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.61	111	654
+    4.61    111    654
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.64	129	699
+    4.64    129    699
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.60	14	732
+    4.60    14    732
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.63	18	761
+    4.63    18    761
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.36	226	813
+    4.36    226    813
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.76	148	511
+    4.76    148    511
 
 # sparsity 4
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.97	39	270
+    5.97    39    270
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.72	656	1148
+    5.72    656    1148
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1212	1555
+    32.00    1212    1555
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.09	206	287
+    8.09    206    287
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.09	213	334
+    8.09    213    334
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.72	626	891
+    5.72    626    891
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.60	105	672
+    5.60    105    672
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.63	121	701
+    5.63    121    701
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.58	10	667
+    5.58    10    667
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.62	14	736
+    5.62    14    736
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.33	226	805
+    5.33    226    805
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.86	123	464
+    5.86    123    464
 
 # sparsity 5
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.49	39	262
+    6.49    39    262
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.25	659	1121
+    6.25    659    1121
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1154	1168
+    32.00    1154    1168
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.44	192	265
+    8.44    192    265
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.44	240	297
+    8.44    240    297
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.25	631	907
+    6.25    631    907
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.13	101	685
+    6.13    101    685
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.16	116	714
+    6.16    116    714
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.11	9	708
+    6.11    9    708
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.14	13	741
+    6.14    13    741
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.86	225	806
+    5.86    225    806
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.44	120	442
+    6.44    120    442
 
 # sparsity 6
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.64	35	269
+    7.64    35    269
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.39	654	1111
+    7.39    654    1111
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1207	1553
+    32.00    1207    1553
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.06	185	225
+    9.06    185    225
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.06	166	248
+    9.06    166    248
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.39	620	888
+    7.39    620    888
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.26	91	679
+    7.26    91    679
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.30	104	704
+    7.30    104    704
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.24	7	704
+    7.24    7    704
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.28	10	735
+    7.28    10    735
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.00	221	792
+    7.00    221    792
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.76	106	393
+    7.76    106    393
 
 # sparsity 7
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.66	33	266
+    8.66    33    266
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.41	675	1165
+    8.41    675    1165
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1210	1553
+    32.00    1210    1553
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.10	154	194
+    10.10    154    194
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.10	176	207
+    10.10    176    207
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.41	628	896
+    8.41    628    896
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.27	84	643
+    8.27    84    643
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.31	95	685
+    8.31    95    685
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.25	6	693
+    8.25    6    693
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.29	9	723
+    8.29    9    723
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	215	773
+    8.00    215    773
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.10	94	357
+    9.10    94    357
 
 # sparsity 8
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.52	32	241
+    9.52    32    241
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.26	692	1194
+    9.26    692    1194
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1208	1525
+    32.00    1208    1525
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.14	138	178
+    11.14    138    178
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.14	187	190
+    11.14    187    190
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.26	647	893
+    9.26    647    893
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.14	79	655
+    9.14    79    655
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.18	88	684
+    9.18    88    684
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.11	6	680
+    9.11    6    680
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.15	8	712
+    9.15    8    712
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.86	220	778
+    8.86    220    778
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.39	86	330
+    10.39    86    330
 
 # sparsity 9
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.46	31	253
+    10.46    31    253
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.19	661	1122
+    10.19    661    1122
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1210	1546
+    32.00    1210    1546
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.27	126	173
+    12.27    126    173
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.27	155	181
+    12.27    155    181
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.19	617	886
+    10.19    617    886
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.07	73	634
+    10.07    73    634
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.11	82	669
+    10.11    82    669
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.04	5	663
+    10.04    5    663
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.08	7	700
+    10.08    7    700
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.79	215	757
+    9.79    215    757
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.79	78	325
+    11.79    78    325
 
 # sparsity 10
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.13	30	243
+    11.13    30    243
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.83	628	1028
+    10.83    628    1028
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1167	1498
+    32.00    1167    1498
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.71	152	179
+    12.71    152    179
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.71	151	187
+    12.71    151    187
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.83	389	820
+    10.83    389    820
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.76	72	638
+    10.76    72    638
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.79	79	683
+    10.79    79    683
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.69	5	655
+    10.69    5    655
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.73	7	682
+    10.73    7    682
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.42	219	767
+    10.42    219    767
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.49	77	332
+    12.49    77    332
 
 # sparsity 11
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.38	29	254
+    12.38    29    254
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.01	660	1112
+    12.01    660    1112
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1142	1445
+    32.00    1142    1445
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.87	143	172
+    13.87    143    172
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.87	144	178
+    13.87    144    178
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.01	582	830
+    12.01    582    830
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.04	64	647
+    12.04    64    647
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.08	69	649
+    12.08    69    649
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.91	4	637
+    11.91    4    637
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.95	7	660
+    11.95    7    660
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.61	217	766
+    11.61    217    766
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.96	73	313
+    13.96    73    313
 
 # sparsity 12
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.57	29	265
+    13.57    29    265
 
 # me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.01	611	1012
+    13.01    611    1012
 
 # me.lemire.integercompression.JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1215	1565
+    32.00    1215    1565
 
 # me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.73	121	160
+    14.73    121    160
 
 # me.lemire.integercompression.IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.73	131	166
+    14.73    131    166
 
 # me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.01	603	832
+    13.01    603    832
 
 # me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.26	68	737
+    13.26    68    737
 
 # me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.29	72	761
+    13.29    72    761
 
 # me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.06	4	591
+    13.06    4    591
 
 # me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.10	6	625
+    13.10    6    625
 
 # me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.61	213	755
+    12.61    213    755
 
 # me.lemire.integercompression.Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	15.38	69	281
+    15.38    69    281
 
diff --git a/benchmarkresults/benchmarkresults_icore7_12november2013.txt b/benchmarkresults/benchmarkresults_icore7_12november2013.txt
index 07b11b3..795650e 100644
--- a/benchmarkresults/benchmarkresults_icore7_12november2013.txt
+++ b/benchmarkresults/benchmarkresults_icore7_12november2013.txt
@@ -10,9 +10,9 @@ Its dependencies (if any) will NOT be available to the current build.
 [INFO] [enforcer:enforce {execution: enforce-maven}]
 [INFO] [exec:java {execution: default-cli}]
 # benchmark based on the ClusterData model from:
-# 	 Vo Ngoc Anh and Alistair Moffat. 
-#	 Index compression using 64-bit words.
-# 	 Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
+#      Vo Ngoc Anh and Alistair Moffat. 
+#     Index compression using 64-bit words.
+#      Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
 
 # Results will be written into a CSV file: benchmark-20131112T105209.csv
 
@@ -21,852 +21,852 @@ Its dependencies (if any) will NOT be available to the current build.
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.28	48	218
+    3.28    48    218
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.03	623	1205
+    3.03    623    1205
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1154	1331
+    32.00    1154    1331
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	508	554
+    8.00    508    554
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	592	709
+    8.00    592    709
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.03	596	900
+    3.03    596    900
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.95	115	701
+    2.95    115    701
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.98	135	726
+    2.98    135    726
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.95	116	726
+    2.95    116    726
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.94	19	761
+    2.94    19    761
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.97	22	767
+    2.97    22    767
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.94	19	765
+    2.94    19    765
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.71	219	797
+    2.71    219    797
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.71	217	813
+    2.71    217    813
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.90	254	599
+    2.90    254    599
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.92	375	669
+    2.92    375    669
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.36	394	503
+    3.36    394    503
 
 # sparsity 2
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.09	47	254
+    4.09    47    254
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.84	636	1160
+    3.84    636    1160
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1152	1264
+    32.00    1152    1264
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.01	510	551
+    8.01    510    551
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.01	594	704
+    8.01    594    704
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.84	602	875
+    3.84    602    875
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.74	101	673
+    3.74    101    673
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.77	117	695
+    3.77    117    695
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.74	101	694
+    3.74    101    694
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.73	15	725
+    3.73    15    725
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.76	18	741
+    3.76    18    741
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.73	15	731
+    3.73    15    731
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.48	215	782
+    3.48    215    782
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.48	212	789
+    3.48    212    789
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.72	190	530
+    3.72    190    530
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.76	375	657
+    3.76    375    657
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.32	392	499
+    4.32    392    499
 
 # sparsity 3
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.03	42	250
+    5.03    42    250
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.77	643	1141
+    4.77    643    1141
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1149	1337
+    32.00    1149    1337
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	506	547
+    8.02    506    547
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	590	698
+    8.02    590    698
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.77	619	904
+    4.77    619    904
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.66	89	640
+    4.66    89    640
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.69	103	672
+    4.69    103    672
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.66	89	668
+    4.66    89    668
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.64	12	700
+    4.64    12    700
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.68	14	712
+    4.68    14    712
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.64	12	704
+    4.64    12    704
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.39	212	762
+    4.39    212    762
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.39	209	763
+    4.39    209    763
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.81	146	480
+    4.81    146    480
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.75	373	646
+    4.75    373    646
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.40	386	496
+    5.40    386    496
 
 # sparsity 4
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.77	39	245
+    5.77    39    245
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.53	629	1095
+    5.53    629    1095
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1148	1332
+    32.00    1148    1332
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.11	482	522
+    8.11    482    522
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.11	557	655
+    8.11    557    655
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.53	617	889
+    5.53    617    889
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.42	82	659
+    5.42    82    659
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.45	94	684
+    5.45    94    684
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.42	82	686
+    5.42    82    686
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.40	10	695
+    5.40    10    695
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.43	12	715
+    5.43    12    715
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.40	10	705
+    5.40    10    705
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.16	214	776
+    5.16    214    776
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.16	211	780
+    5.16    211    780
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.66	128	457
+    5.66    128    457
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.53	370	645
+    5.53    370    645
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.23	389	493
+    6.23    389    493
 
 # sparsity 5
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.57	37	248
+    6.57    37    248
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.32	640	1113
+    6.32    640    1113
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1150	1349
+    32.00    1150    1349
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.41	416	456
+    8.41    416    456
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.41	473	548
+    8.41    473    548
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.32	622	898
+    6.32    622    898
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.20	75	643
+    6.20    75    643
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.23	86	668
+    6.23    86    668
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.20	75	666
+    6.20    75    666
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.18	8	690
+    6.18    8    690
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.21	11	705
+    6.21    11    705
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.18	8	697
+    6.18    8    697
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.93	211	741
+    5.93    211    741
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.93	208	772
+    5.93    208    772
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.51	118	426
+    6.51    118    426
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.32	374	639
+    6.32    374    639
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.05	390	492
+    7.05    390    492
 
 # sparsity 6
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.73	35	242
+    7.73    35    242
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.48	630	1071
+    7.48    630    1071
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1143	1350
+    32.00    1143    1350
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.04	328	365
+    9.04    328    365
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.04	365	415
+    9.04    365    415
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.48	620	882
+    7.48    620    882
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.36	67	641
+    7.36    67    641
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.39	76	668
+    7.39    76    668
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.36	67	667
+    7.36    67    667
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.33	7	679
+    7.33    7    679
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.37	9	695
+    7.37    9    695
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.33	7	686
+    7.33    7    686
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.09	211	749
+    7.09    211    749
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.09	208	764
+    7.09    208    764
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.88	101	383
+    7.88    101    383
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.49	372	630
+    7.49    372    630
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.27	389	489
+    8.27    389    489
 
 # sparsity 7
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.46	33	244
+    8.46    33    244
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.21	628	1052
+    8.21    628    1052
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1148	1334
+    32.00    1148    1334
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.01	257	290
+    10.01    257    290
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.01	284	315
+    10.01    284    315
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.21	612	859
+    8.21    612    859
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.08	63	626
+    8.08    63    626
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.11	71	665
+    8.11    71    665
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.08	63	663
+    8.08    63    663
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.06	6	675
+    8.06    6    675
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.09	8	687
+    8.09    8    687
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.06	6	682
+    8.06    6    682
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.81	210	756
+    7.81    210    756
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.81	207	759
+    7.81    207    759
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.85	92	353
+    8.85    92    353
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.22	369	622
+    8.22    369    622
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.00	389	486
+    9.00    389    486
 
 # sparsity 8
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.41	32	234
+    9.41    32    234
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.16	636	1062
+    9.16    636    1062
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1151	1326
+    32.00    1151    1326
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.08	231	269
+    11.08    231    269
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.08	259	288
+    11.08    259    288
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.16	616	873
+    9.16    616    873
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.04	59	638
+    9.04    59    638
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.07	65	664
+    9.07    65    664
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.04	59	664
+    9.04    59    664
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.01	5	665
+    9.01    5    665
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.05	7	680
+    9.05    7    680
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.01	5	671
+    9.01    5    671
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.77	209	746
+    8.77    209    746
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.77	207	738
+    8.77    207    738
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.25	81	324
+    10.25    81    324
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.18	372	625
+    9.18    372    625
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.98	387	479
+    9.98    387    479
 
 # sparsity 9
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.41	31	238
+    10.41    31    238
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.15	637	1070
+    10.15    637    1070
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1145	1413
+    32.00    1145    1413
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.15	221	267
+    12.15    221    267
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.15	252	284
+    12.15    252    284
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.15	609	849
+    10.15    609    849
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.03	54	624
+    10.03    54    624
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.06	60	650
+    10.06    60    650
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.03	54	649
+    10.03    54    649
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.00	5	653
+    10.00    5    653
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.03	7	666
+    10.03    7    666
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.00	5	659
+    10.00    5    659
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.75	207	739
+    9.75    207    739
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.75	206	743
+    9.75    206    743
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.72	73	313
+    11.72    73    313
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.17	369	611
+    10.17    369    611
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.98	381	474
+    10.98    381    474
 
 # sparsity 10
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.57	29	236
+    11.57    29    236
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.28	626	1033
+    11.28    626    1033
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1141	1328
+    32.00    1141    1328
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.18	219	276
+    13.18    219    276
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.18	254	294
+    13.18    254    294
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.28	610	848
+    11.28    610    848
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.19	50	617
+    11.19    50    617
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.23	56	638
+    11.23    56    638
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.19	50	640
+    11.19    50    640
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.13	4	640
+    11.13    4    640
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.17	6	655
+    11.17    6    655
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.13	4	647
+    11.13    4    647
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.87	207	736
+    10.87    207    736
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.87	204	734
+    10.87    204    734
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.19	68	311
+    13.19    68    311
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.30	369	612
+    11.30    369    612
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.13	386	477
+    12.13    386    477
 
 # sparsity 11
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.41	29	229
+    12.41    29    229
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.01	634	1046
+    12.01    634    1046
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1148	1365
+    32.00    1148    1365
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.84	208	261
+    13.84    208    261
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.84	241	277
+    13.84    241    277
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.01	605	832
+    12.01    605    832
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.07	49	650
+    12.07    49    650
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.10	54	674
+    12.10    54    674
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.07	49	675
+    12.07    49    675
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.93	4	604
+    11.93    4    604
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.97	6	618
+    11.97    6    618
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.93	4	611
+    11.93    4    611
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.60	206	724
+    11.60    206    724
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.60	203	724
+    11.60    203    724
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.98	66	291
+    13.98    66    291
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.04	367	603
+    12.04    367    603
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.86	385	478
+    12.86    385    478
 
 # sparsity 12
 # generating random data...
 # generating random data... ok.
 # kamikaze PForDelta
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.48	28	236
+    13.48    28    236
 
 # IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.96	634	1051
+    12.96    634    1051
 
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1150	1307
+    32.00    1150    1307
 
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.69	202	258
+    14.69    202    258
 
 # IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.69	235	272
+    14.69    235    272
 
 # BinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.96	610	849
+    12.96    610    849
 
 # NewPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.17	48	698
+    13.17    48    698
 
 # NewPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.20	52	714
+    13.20    52    714
 
 # NewPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.17	48	720
+    13.17    48    720
 
 # OptPFD + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.96	4	588
+    12.96    4    588
 
 # OptPFDS9 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.00	6	602
+    13.00    6    602
 
 # OptPFDS16 + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.96	4	597
+    12.96    4    597
 
 # IntegratedFastPFOR + IntegratedVariableByte (Integrated)
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.55	206	726
+    12.55    206    726
 
 # FastPFOR + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.55	203	725
+    12.55    203    725
 
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	15.40	63	269
+    15.40    63    269
 
 # XorBinaryPacking + VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.99	368	613
+    12.99    368    613
 
 # DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.81	384	476
+    13.81    384    476
 
 
 Results were written into a CSV file: benchmark-20131112T105209.csv
diff --git a/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt b/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
index 4159637..7e35696 100644
--- a/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
+++ b/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
@@ -1,7 +1,7 @@
 # benchmark based on the ClusterData model from:
-# 	 Vo Ngoc Anh and Alistair Moffat. 
-#	 Index compression using 64-bit words.
-# 	 Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
+#      Vo Ngoc Anh and Alistair Moffat. 
+#     Index compression using 64-bit words.
+#      Softw. Pract. Exper.40, 2 (February 2010), 131-147. 
 
 # Results will be written into a CSV file: benchmark-20140918T011322.csv
 
@@ -10,504 +10,504 @@
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.04	840	1619
+    3.04    840    1619
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1169	1698
+    32.00    1169    1698
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	195	369
+    8.00    195    369
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.04	651	1148
+    3.04    651    1148
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.96	129	865
+    2.96    129    865
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.98	158	877
+    2.98    158    877
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.96	130	879
+    2.96    130    879
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.95	25	930
+    2.95    25    930
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.97	27	951
+    2.97    27    951
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.95	25	935
+    2.95    25    935
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.82	235	928
+    2.82    235    928
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.93	255	740
+    2.93    255    740
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	2.77	147	395
+    2.77    147    395
 # sparsity 2
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	831	1555
+    3.82    831    1555
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1183	1800
+    32.00    1183    1800
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.00	220	372
+    8.00    220    372
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.82	659	1139
+    3.82    659    1139
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.72	116	855
+    3.72    116    855
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.75	136	851
+    3.75    136    851
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.72	115	853
+    3.72    115    853
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.71	19	895
+    3.71    19    895
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.74	22	917
+    3.74    22    917
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.71	19	900
+    3.71    19    900
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.59	230	908
+    3.59    230    908
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.74	195	654
+    3.74    195    654
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	3.49	111	366
+    3.49    111    366
 # sparsity 3
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.78	817	1519
+    4.78    817    1519
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1019	1759
+    32.00    1019    1759
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.02	238	370
+    8.02    238    370
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.78	680	1121
+    4.78    680    1121
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.67	98	825
+    4.67    98    825
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.70	123	840
+    4.70    123    840
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.67	102	834
+    4.67    102    834
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.66	15	861
+    4.66    15    861
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.69	18	895
+    4.69    18    895
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.66	14	871
+    4.66    14    871
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.54	231	904
+    4.54    231    904
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.84	153	589
+    4.84    153    589
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	4.40	83	339
+    4.40    83    339
 # sparsity 4
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.65	788	1505
+    5.65    788    1505
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1142	1757
+    32.00    1142    1757
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.09	242	363
+    8.09    242    363
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.65	636	1113
+    5.65    636    1113
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.52	92	828
+    5.52    92    828
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.56	112	826
+    5.56    112    826
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.52	94	826
+    5.52    94    826
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.51	12	854
+    5.51    12    854
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.54	15	883
+    5.54    15    883
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.51	12	858
+    5.51    12    858
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.39	218	886
+    5.39    218    886
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.80	136	566
+    5.80    136    566
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	5.32	68	319
+    5.32    68    319
 # sparsity 5
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.31	804	1490
+    6.31    804    1490
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1105	1860
+    32.00    1105    1860
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.40	245	330
+    8.40    245    330
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.31	673	1121
+    6.31    673    1121
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.19	87	832
+    6.19    87    832
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.22	107	844
+    6.22    107    844
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.19	88	830
+    6.19    88    830
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.17	10	851
+    6.17    10    851
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.20	14	883
+    6.20    14    883
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.17	10	852
+    6.17    10    852
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.07	217	875
+    6.07    217    875
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.51	130	513
+    6.51    130    513
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.13	60	307
+    6.13    60    307
 # sparsity 6
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.99	742	1431
+    6.99    742    1431
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1163	1660
+    32.00    1163    1660
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.99	205	290
+    8.99    205    290
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.99	637	1107
+    6.99    637    1107
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.87	82	821
+    6.87    82    821
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.90	100	830
+    6.90    100    830
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.87	82	818
+    6.87    82    818
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.85	9	834
+    6.85    9    834
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.88	12	865
+    6.88    12    865
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.85	9	836
+    6.85    9    836
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.75	224	877
+    6.75    224    877
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	7.33	118	485
+    7.33    118    485
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	6.98	54	296
+    6.98    54    296
 # sparsity 7
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.31	770	1463
+    8.31    770    1463
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1074	1832
+    32.00    1074    1832
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.01	203	240
+    10.01    203    240
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.31	664	1105
+    8.31    664    1105
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.18	73	796
+    8.18    73    796
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.22	88	808
+    8.22    88    808
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.18	73	792
+    8.18    73    792
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.16	7	819
+    8.16    7    819
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.20	10	849
+    8.20    10    849
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.16	7	810
+    8.16    7    810
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.05	217	851
+    8.05    217    851
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.01	103	430
+    9.01    103    430
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.61	47	277
+    8.61    47    277
 # sparsity 8
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.88	800	1414
+    8.88    800    1414
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1078	1718
+    32.00    1078    1718
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.91	211	227
+    10.91    211    227
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.88	671	1083
+    8.88    671    1083
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.76	70	804
+    8.76    70    804
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.80	84	814
+    8.80    84    814
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.76	70	800
+    8.76    70    800
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.73	7	807
+    8.73    7    807
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.77	9	792
+    8.77    9    792
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.73	7	801
+    8.73    7    801
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	8.64	211	837
+    8.64    211    837
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.94	96	417
+    9.94    96    417
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.51	44	268
+    9.51    44    268
 # sparsity 9
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.19	834	1442
+    10.19    834    1442
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1200	1632
+    32.00    1200    1632
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.16	206	212
+    12.16    206    212
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.19	675	1092
+    10.19    675    1092
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.07	64	804
+    10.07    64    804
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.10	76	814
+    10.10    76    814
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.07	63	802
+    10.07    63    802
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.04	6	810
+    10.04    6    810
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.08	9	841
+    10.08    9    841
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.04	6	808
+    10.04    6    808
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	9.94	222	858
+    9.94    222    858
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.79	88	397
+    11.79    88    397
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.40	38	253
+    11.40    38    253
 # sparsity 10
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.10	814	1406
+    11.10    814    1406
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1215	1820
+    32.00    1215    1820
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.07	207	208
+    13.07    207    208
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.10	681	1073
+    11.10    681    1073
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.00	60	800
+    11.00    60    800
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.04	72	809
+    11.04    72    809
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	11.00	60	796
+    11.00    60    796
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.95	6	785
+    10.95    6    785
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.99	8	815
+    10.99    8    815
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.95	6	782
+    10.95    6    782
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	10.84	226	860
+    10.84    226    860
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.93	84	389
+    12.93    84    389
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.58	36	245
+    12.58    36    245
 # sparsity 11
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.31	814	1392
+    12.31    814    1392
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1171	1846
+    32.00    1171    1846
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.14	172	201
+    14.14    172    201
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.31	668	1071
+    12.31    668    1071
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.42	56	825
+    12.42    56    825
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.45	67	832
+    12.45    67    832
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.42	56	821
+    12.42    56    821
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.22	5	729
+    12.22    5    729
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.27	8	758
+    12.27    8    758
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.22	5	731
+    12.22    5    731
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.07	222	836
+    12.07    222    836
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.31	81	377
+    14.31    81    377
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.05	33	238
+    14.05    33    238
 # sparsity 12
 # generating random data...
 # generating random data... ok.
 # IntegratedBinaryPacking + IntegratedVariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.97	805	1375
+    12.97    805    1375
 # JustCopy
 # bits per int, compress speed (mis), decompression speed (mis) 
-	32.00	1160	1737
+    32.00    1160    1737
 # VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	14.72	186	193
+    14.72    186    193
 # BinaryPacking+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.97	656	1037
+    12.97    656    1037
 # NewPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.22	56	886
+    13.22    56    886
 # NewPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.24	67	891
+    13.24    67    891
 # NewPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.22	56	883
+    13.22    56    883
 # OptPFD+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.98	5	704
+    12.98    5    704
 # OptPFDS9+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	13.02	8	740
+    13.02    8    740
 # OptPFDS16+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.98	5	704
+    12.98    5    704
 # FastPFOR+VariableByte
 # bits per int, compress speed (mis), decompression speed (mis) 
-	12.73	223	845
+    12.73    223    845
 # Simple9
 # bits per int, compress speed (mis), decompression speed (mis) 
-	15.35	78	347
+    15.35    78    347
 # Simple16
 # bits per int, compress speed (mis), decompression speed (mis) 
-	15.15	31	225
+    15.15    31    225
 
 Results were written into a CSV file: benchmark-20140918T011322.csv
diff --git a/build.xml b/build.xml
index 974a14c..d02cddd 100644
--- a/build.xml
+++ b/build.xml
@@ -8,6 +8,16 @@
     <depend srcdir="src/main/java" destdir="target/classes" />
     <javac includeantruntime="false" destdir="target/classes">
       <src path="src/main/java" />
+      <exclude name="me/lemire/integercompression/vector/*.java" />
+      <exclude name="module-info.java" />
+    </javac>
+  </target>
+
+  <target name="vector-fastpfor">
+    <mkdir dir="target/classes" />
+    <depend srcdir="src/main/java" destdir="target/classes" />
+    <javac includeantruntime="false" destdir="target/classes" release="19">
+      <src path="src/main/java" />
     </javac>
   </target>
 
diff --git a/example.java b/example.java
index 6569ebd..75dfb05 100644
--- a/example.java
+++ b/example.java
@@ -88,8 +88,7 @@ public static void basicExample() {
     /**
      * Like the basicExample, but we store the input array size manually.
      */
-    @Test
-    public void basicExampleHeadless() {
+    public static void basicExampleHeadless() {
         int[] data = new int[2342351];
         System.out.println("Compressing " + data.length + " integers in one go using the headless approach");
         // data should be sorted for best
@@ -105,11 +104,7 @@ public void basicExampleHeadless() {
         // be processed using variable byte
         SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
                 new IntegratedVariableByte());
-        // output vector should be large enough...
-        int[] compressed = new int[data.length + 1024];
-        // compressed might not be large enough in some cases
-        // if you get java.lang.ArrayIndexOutOfBoundsException, try
-        // allocating more memory
+        int[] compressed = new int[1 + codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length)]; // +1: slot 0 holds data.length, compression starts at offset 1
 
         /**
          *
@@ -118,7 +113,7 @@ public void basicExampleHeadless() {
          */
         IntWrapper inputoffset = new IntWrapper(0);
         IntWrapper outputoffset = new IntWrapper(1);
-        compressed[0] = data.length; // we manually store how many integers we
+        compressed[0] = data.length; // we manually store how many integers
         codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0));
         // got it!
         // inputoffset should be at data.length but outputoffset tells
@@ -268,10 +263,12 @@ public static void headlessDemo() {
         int[] uncompressed1 = {1,2,1,3,1};
         int[] uncompressed2 = {3,2,4,6,1};
 
-        int[] compressed = new int[uncompressed1.length+uncompressed2.length+1024];
-
         SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
 
+        int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed1.length)
+                                  + codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed2.length);
+        int[] compressed = new int[maxCompressedLength];
+
         // compressing
         IntWrapper outPos = new IntWrapper();
 
diff --git a/examples/vector/Example.java b/examples/vector/Example.java
new file mode 100644
index 0000000..e8d2455
--- /dev/null
+++ b/examples/vector/Example.java
@@ -0,0 +1,91 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+import java.util.Arrays;
+import me.lemire.integercompression.FastPFOR;
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.integercompression.Composition;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.VariableByte;
+import me.lemire.integercompression.vector.VectorFastPFOR;
+
+/**
+ * Round-trip demo: compresses an int array with either VectorFastPFOR or
+ * FastPFOR (each composed with VariableByte), decompresses it again and
+ * verifies that the recovered data equals the input.
+ */
+public class Example {
+  private static final String USAGE =
+      "usage: Example <compressor>  (0 = VectorFastPFOR, non-zero = FastPFOR)";
+
+  /**
+   * Runs the round trip and prints the compressed and recovered sizes.
+   *
+   * @param args args[0] selects the compressor: 0 for VectorFastPFOR,
+   *             any other integer for FastPFOR
+   * @throws IllegalArgumentException if the selector is missing or not an integer
+   * @throws RuntimeException if the recovered data differs from the input
+   */
+  public static void main(String[] args) {
+    if (args.length == 0)
+      throw new IllegalArgumentException(USAGE);
+
+    // pass 0 for the vector compressor, non-zero for the default compressor
+    int compressorToUse;
+    try {
+      compressorToUse = Integer.parseInt(args[0]);
+    } catch (NumberFormatException e) {
+      // NumberFormatException is an IllegalArgumentException, so the thrown
+      // type stays compatible; the message now says what was expected
+      throw new IllegalArgumentException(USAGE, e);
+    }
+
+    final int N = 1310720;
+    int[] data = new int[N];
+
+    // 2-bit data
+    for (int k = 0; k < N; k += 1)
+      data[k] = 3;
+
+    // a few large values
+    for (int k = 0; k < N; k += 5)
+      data[k] = 100;
+    for (int k = 0; k < N; k += 533)
+      data[k] = 10000;
+
+    int[] compressed = new int[N + 1024];
+
+    IntegerCODEC codec = new Composition(
+        compressorToUse == 0 ? new VectorFastPFOR() : new FastPFOR(),
+        new VariableByte());
+
+    IntWrapper inputoffset = new IntWrapper(0);
+    IntWrapper outputoffset = new IntWrapper(0);
+
+    codec.compress(data, inputoffset, data.length, compressed, outputoffset);
+
+    System.out.println("compressed unsorted integers from " +
+                       data.length * 4 / 1024 + "KB to " +
+                       outputoffset.intValue() * 4 / 1024 + "KB");
+
+    // shrink the buffer to the number of ints reported in outputoffset
+    compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+    int[] recovered = new int[N];
+    IntWrapper recoffset = new IntWrapper(0);
+
+    codec.uncompress(compressed, new IntWrapper(0), compressed.length,
+                     recovered, recoffset);
+
+    System.out.println("compressed length = " + compressed.length +
+                       ", uncompressed length = " + recoffset.intValue());
+
+    if (Arrays.equals(data, recovered))
+      System.out.println("data is recovered without loss");
+    else
+      throw new RuntimeException("bug"); // could use assert
+
+    System.out.println();
+  }
+}
diff --git a/examples/vector/README.md b/examples/vector/README.md
new file mode 100644
index 0000000..cbcbfeb
--- /dev/null
+++ b/examples/vector/README.md
@@ -0,0 +1,12 @@
+Compile
+-------
+```
+javac -cp <path/to/javafastpfor.jar> Example.java
+```
+
+Run
+---
+```
+java --add-modules jdk.incubator.vector -cp <path/to/javafastpfor.jar>:. Example 0
+```
+
diff --git a/jitpack.yml b/jitpack.yml
new file mode 100644
index 0000000..255e0f4
--- /dev/null
+++ b/jitpack.yml
@@ -0,0 +1,5 @@
+jdk:
+  - openjdk21
+before_install:
+  - sdk install java 21-open
+  - sdk use java 21-open
diff --git a/pom.xml b/pom.xml
index 7a36b12..33db8e6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,12 +2,14 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>me.lemire.integercompression</groupId>
   <artifactId>JavaFastPFOR</artifactId>
-  <version>0.1.13-SNAPSHOT</version>
+  <version>0.3.11-SNAPSHOT</version>
   <packaging>jar</packaging>
   <properties>
-    <maven.compiler.source>1.6</maven.compiler.source>
-    <maven.compiler.target>1.6</maven.compiler.target>
+    <maven.compiler.source>21</maven.compiler.source>
+    <maven.compiler.target>21</maven.compiler.target>
+    <maven.compiler.release>21</maven.compiler.release>
     <encoding>UTF-8</encoding>
+    <njord.version>0.8.5</njord.version>
   </properties>
   <licenses>
     <license>
@@ -18,18 +20,31 @@
     </license>
   </licenses>
   <scm>
-    <connection>scm:git:git@github.com:lemire/JavaFastPFOR.git</connection>
-    <url>scm:git:git@github.com:lemire/JavaFastPFOR.git</url>
-    <developerConnection>scm:git:git@github.com:lemire/JavaFastPFOR.git</developerConnection>
+    <connection>scm:git:https://github.com/fast-pack/JavaFastPFOR.git</connection>
+    <url>scm:git:https://github.com/fast-pack/JavaFastPFOR.git</url>
+    <developerConnection>scm:git:https://github.com/fast-pack/JavaFastPFOR.git</developerConnection>
+    <tag>HEAD</tag>
   </scm>
+
+  <distributionManagement>
+    <snapshotRepository>
+      <id>sonatype-central-portal</id>
+      <name>Sonatype Central Portal</name>
+      <url>https://central.sonatype.com/repository/maven-snapshots/</url>
+    </snapshotRepository>
+    <repository>
+      <id>sonatype-central-portal</id>
+      <name>Sonatype Central Portal</name>
+      <url>https://repo.maven.apache.org/maven2/</url>
+    </repository>
+  </distributionManagement>
+
   <developers>
     <developer>
       <id>lemire</id>
       <name>Daniel Lemire</name>
-      <email>lemire@gmail.com</email>
+      <email>daniel@lemire.me</email>
       <url>http://lemire.me/en/</url>
-      <organization>LICEF Research Center</organization>
-      <organizationUrl>http://licef.ca</organizationUrl>
       <roles>
         <role>architect</role>
         <role>developer</role>
@@ -45,21 +60,64 @@
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.10</version>
+      <version>4.13.1</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+      <version>0.9.35</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
   <issueManagement>
     <system>GitHub Issue Tracking</system>
-    <url>https://github.com/lemire/JavaFastPFOR/issues</url>
+    <url>https://github.com/fast-pack/JavaFastPFOR/issues</url>
   </issueManagement>
-  <parent>
-    <groupId>org.sonatype.oss</groupId>
-    <artifactId>oss-parent</artifactId>
-    <version>9</version>
-  </parent>
   <build>
+    <extensions>
+      <extension>
+        <groupId>eu.maveniverse.maven.njord</groupId>
+        <artifactId>extension3</artifactId>
+        <version>${njord.version}</version>
+      </extension>
+    </extensions>
     <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.12.1</version>
+        <configuration>
+            <source>21</source>
+            <target>21</target>
+        </configuration>
+        <executions>
+          <execution>
+            <id>default-compile</id>
+            <phase>compile</phase>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+            <configuration>
+              <excludes>
+                <exclude>me/lemire/integercompression/vector/*</exclude>
+                <exclude>module-info.java</exclude>
+              </excludes>
+            </configuration>
+          </execution>
+          <!-- The vector module is experimental, currently only for advanced users. -->
+          <!--<execution>
+            <id>vector-fastpfor</id>
+            <phase>compile</phase>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+             <configuration>
+              <release>29</release>
+            </configuration>
+          </execution>       -->
+        </executions>
+      </plugin>
       <plugin>
         <groupId>org.apache.felix</groupId>
         <artifactId>maven-bundle-plugin</artifactId>
@@ -78,31 +136,14 @@
         <version>1.1</version>
         <configuration>
           <mainClass>me.lemire.integercompression.benchmarktools.Benchmark</mainClass>
-          <!--
-    <mainClass>me.lemire.integercompression.benchmarktools.BenchmarkOffsettedSeries</mainClass>
-    -->
         </configuration>
       </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-gpg-plugin</artifactId>
-        <version>1.4</version>
-        <executions>
-          <execution>
-            <id>sign-artifacts</id>
-            <phase>verify</phase>
-            <goals>
-              <goal>sign</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-javadoc-plugin</artifactId>
-        <version>2.8</version>
+        <version>3.6.3</version>
         <configuration>
-            <excludePackageNames>com.kamikaze.pfordelta:me.lemire.integercompression.benchmarktools</excludePackageNames>
+            <excludePackageNames>me.lemire.integercompression.vector;com.kamikaze.pfordelta:me.lemire.integercompression.benchmarktools</excludePackageNames>
        </configuration>
         <executions>
           <execution>
@@ -129,7 +170,7 @@
       <plugin>
         <groupId>org.jacoco</groupId>
         <artifactId>jacoco-maven-plugin</artifactId>
-        <version>0.7.8</version>
+        <version>0.8.13</version>
         <configuration>
          <excludes>
             <exclude>me/lemire/integercompression/Kamikaze</exclude>
@@ -147,16 +188,79 @@
         </executions>
       </plugin>
       <plugin>
-        <groupId>org.eluder.coveralls</groupId>
-        <artifactId>coveralls-maven-plugin</artifactId>
-        <version>3.2.1</version>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-release-plugin</artifactId>
+        <version>3.0.1</version>
+        <configuration>
+          <goals>deploy</goals>
+          <autoVersionSubmodules>true</autoVersionSubmodules>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-gpg-plugin</artifactId>
+        <version>3.2.8</version>
+        <executions>
+          <execution>
+            <id>sign-artifacts</id>
+            <phase>verify</phase>
+            <goals>
+              <goal>sign</goal>
+            </goals>
+          </execution>
+        </executions>
       </plugin>
     </plugins>
+
+        <pluginManagement>
+            <plugins>
+                <plugin>
+                    <groupId>eu.maveniverse.maven.plugins</groupId>
+                    <artifactId>njord</artifactId>
+                    <version>${njord.version}</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-clean-plugin</artifactId>
+                    <version>2.5</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-deploy-plugin</artifactId>
+                    <version>2.8.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-install-plugin</artifactId>
+                    <version>2.5.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-jar-plugin</artifactId>
+                    <version>2.4</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-javadoc-plugin</artifactId>
+                    <version>2.9.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-resources-plugin</artifactId>
+                    <version>2.6</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-site-plugin</artifactId>
+                    <version>3.3</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-source-plugin</artifactId>
+                    <version>2.2.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-surefire-plugin</artifactId>
+                    <version>2.17</version>
+                </plugin>
+            </plugins>
+        </pluginManagement>
   </build>
   <name>JavaFastPFOR</name>
-  <url>https://github.com/lemire/JavaFastPFOR/</url>
+  <url>https://github.com/fast-pack/JavaFastPFOR/</url>
   <description>
-It is a library to compress and uncompress arrays of integers
-very fast. The assumption is that most (but not all) values in
-your array use less than 32 bits. </description>
+A library to compress and uncompress arrays of integers
+very quickly. </description>
   </project>
diff --git a/src/main/java/me/lemire/integercompression/BinaryPacking.java b/src/main/java/me/lemire/integercompression/BinaryPacking.java
index 8d5ff90..ce37ff0 100644
--- a/src/main/java/me/lemire/integercompression/BinaryPacking.java
+++ b/src/main/java/me/lemire/integercompression/BinaryPacking.java
@@ -37,8 +37,9 @@
  * @author Daniel Lemire
  */
 public final class BinaryPacking implements IntegerCODEC, SkippableIntegerCODEC {
-        final static int BLOCK_SIZE = 32;
-    
+        public final static int BLOCK_SIZE = 32;
+        private static final int MAX_BIT_WIDTH = Integer.SIZE;
+
         @Override
         public void compress(int[] in, IntWrapper inpos, int inlength,
                 int[] out, IntWrapper outpos) {
@@ -131,7 +132,16 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
             outpos.add(outlength);
             inpos.set(tmpinpos);
         }
-        
+
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // returns a size estimate in ints: header ints + block ints
+            int blockCount = inlength / BLOCK_SIZE; // whole blocks only; a trailing partial block is not counted
+            int headersSizeInInts = blockCount / Integer.BYTES + (blockCount % Integer.BYTES); // blockCount / 4 plus the leftover blockCount % 4; presumably one header int per group of 4 blocks and one per leftover block - confirm against headlessCompress
+            int blocksSizeInInts = blockCount * MAX_BIT_WIDTH; // MAX_BIT_WIDTH (Integer.SIZE) ints per block
+            compressedPositions.add(blockCount * BLOCK_SIZE); // advance by the number of input ints covered by whole blocks
+            return headersSizeInInts + blocksSizeInInts;
+        }
+
         @Override
         public String toString() {
                 return this.getClass().getSimpleName();
diff --git a/src/main/java/me/lemire/integercompression/BitPacking.java b/src/main/java/me/lemire/integercompression/BitPacking.java
index e83c9e0..8652be4 100644
--- a/src/main/java/me/lemire/integercompression/BitPacking.java
+++ b/src/main/java/me/lemire/integercompression/BitPacking.java
@@ -1690,7 +1690,7 @@ protected static void fastpack9(final int[] in, int inpos,
         }
 
         /**
-         * Unpack 32 integers
+         * Pack 32 integers without masking
          * 
          * @param in
          *                source array
@@ -3005,7 +3005,7 @@ protected static void fastpackwithoutmask9(final int[] in, int inpos,
         }
 
         /**
-         * Pack the 32 integers
+         * Unpack the 32 integers
          * 
          * @param in
          *                source array
diff --git a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
index 47d4f57..6e8f903 100644
--- a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
@@ -18,9 +18,9 @@ public interface ByteIntegerCODEC {
          * Compress data from an array to another array.
          * 
          * Both inpos and outpos are modified to represent how much data was
-         * read and written to if 12 ints (inlength = 12) are compressed to 3
+         * read and written to. If 12 ints (inlength = 12) are compressed to 3
          * bytes, then inpos will be incremented by 12 while outpos will be
-         * incremented by 3 we use IntWrapper to pass the values by reference.
+         * incremented by 3. We use IntWrapper to pass the values by reference.
          * 
          * @param in
          *                input array
diff --git a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
index 4b2f896..2f8c709 100644
--- a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
+++ b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
@@ -105,7 +105,7 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,
 
                 int ip = inPos.get();
                 int op = outPos.get();
-                int vbcNum = 0, vbcShift = 24; // Varialbe Byte Context.
+                int vbcNum = 0, vbcShift = 24; // Variable Byte Context.
                 final int inPosLast = ip + inLen;
                 while (ip < inPosLast) {
                         // Fetch a byte value.
@@ -134,6 +134,9 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,
          * In case you need a different way to allocate buffers, you can override this method
          * with a custom behavior. The default implementation allocates a new Java direct
          * {@link ByteBuffer} on each invocation.
+         * 
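+         * @param sizeInBytes size of the buffer to allocate, in bytes
+         * @return the allocated buffer (a new direct {@link ByteBuffer} by default)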
          */
         protected ByteBuffer makeBuffer(int sizeInBytes) {
                 return ByteBuffer.allocateDirect(sizeInBytes);
diff --git a/src/main/java/me/lemire/integercompression/FastPFOR.java b/src/main/java/me/lemire/integercompression/FastPFOR.java
index 36226c0..5475496 100644
--- a/src/main/java/me/lemire/integercompression/FastPFOR.java
+++ b/src/main/java/me/lemire/integercompression/FastPFOR.java
@@ -40,6 +40,13 @@
  */
 public class FastPFOR implements IntegerCODEC,SkippableIntegerCODEC {
         final static int OVERHEAD_OF_EACH_EXCEPT = 8;
+        private static final int OVERHEAD_OF_EACH_PAGE_IN_INTS = 36; // 1 int for the header
+                                                                     // 1 int for the byte array size
+                                                                     // 1 int for the bitmap
+                                                                     // 1 int for byte array padding (to align to 4 bytes)
+                                                                     // 32 to have enough space to bit-pack the exceptions
+        private static final int OVERHEAD_OF_EACH_BLOCK_IN_INTS = 1; // 1 byte for the number of bits allocated per truncated integer
+                                                                     // 1 byte for the number of exceptions
         /**
          *
          */
@@ -65,7 +72,7 @@ public class FastPFOR implements IntegerCODEC,SkippableIntegerCODEC {
          * @param pagesize
          *                the desired page size (recommended value is FastPFOR.DEFAULT_PAGE_SIZE)
          */
-        private FastPFOR(int pagesize) {
+        FastPFOR(int pagesize) {
             pageSize = pagesize;
             // Initiate arrrays.
             byteContainer = makeBuffer(3 * pageSize
@@ -230,6 +237,18 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 }
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // returns a size estimate in ints: per-page overhead + per-block cost + a constant
+            inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); // presumably rounds down to a multiple of BLOCK_SIZE - confirm in Util
+
+            int pageCount = (inlength + pageSize - 1) / pageSize; // ceiling division: a partial page counts as a whole page
+            int blockCount = inlength / BLOCK_SIZE;
+
+            // getBestBFromData limits the memory used for exceptions so that the total size of the block does not exceed BLOCK_SIZE integers.
+            int blockSizeInInts = OVERHEAD_OF_EACH_BLOCK_IN_INTS + BLOCK_SIZE;
+            return OVERHEAD_OF_EACH_PAGE_IN_INTS * pageCount + blockSizeInInts * blockCount + 24; // NOTE(review): the extra 24 ints are unexplained here, and compressedPositions is never updated (BinaryPacking's version advances it) - confirm both are intended
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 final int initpos = inpos.get();
@@ -336,6 +355,9 @@ public String toString() {
          * In case you need a different way to allocate buffers, you can override this method
          * with a custom behavior. The default implementation allocates a new Java direct
          * {@link ByteBuffer} on each invocation.
+         * 
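+         * @param sizeInBytes size of the buffer to allocate, in bytes
+         * @return the allocated buffer (a new direct {@link ByteBuffer} by default)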
          */
         protected ByteBuffer makeBuffer(int sizeInBytes) {
             return ByteBuffer.allocateDirect(sizeInBytes);
diff --git a/src/main/java/me/lemire/integercompression/FastPFOR128.java b/src/main/java/me/lemire/integercompression/FastPFOR128.java
index b124072..0557c62 100644
--- a/src/main/java/me/lemire/integercompression/FastPFOR128.java
+++ b/src/main/java/me/lemire/integercompression/FastPFOR128.java
@@ -23,6 +23,13 @@
  */
 public class FastPFOR128 implements IntegerCODEC,SkippableIntegerCODEC {
         final static int OVERHEAD_OF_EACH_EXCEPT = 8;
+        private static final int OVERHEAD_OF_EACH_PAGE_IN_INTS = 36; // 1 int for the header
+                                                                     // 1 int for the byte array size
+                                                                     // 1 int for the bitmap
+                                                                     // 1 int for byte array padding (to align to 4 bytes)
+                                                                     // 32 to have enough space to bit-pack the exceptions
+        private static final int OVERHEAD_OF_EACH_BLOCK_IN_INTS = 1; // 1 byte for the number of bits allocated per truncated integer
+                                                                     // 1 byte for the number of exceptions
         /**
          *
          */
@@ -209,6 +216,18 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 }
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // returns a size estimate in ints: per-page overhead + per-block cost + a constant
+            inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); // presumably rounds down to a multiple of BLOCK_SIZE - confirm in Util
+
+            int pageCount = (inlength + pageSize - 1) / pageSize; // ceiling division: a partial page counts as a whole page
+            int blockCount = inlength / BLOCK_SIZE;
+
+            // getBestBFromData limits the memory used for exceptions so that the total size of the block does not exceed BLOCK_SIZE integers.
+            int blockSizeInInts = OVERHEAD_OF_EACH_BLOCK_IN_INTS + BLOCK_SIZE;
+            return OVERHEAD_OF_EACH_PAGE_IN_INTS * pageCount + blockSizeInInts * blockCount + 24; // NOTE(review): the extra 24 ints are unexplained here, and compressedPositions is never updated (BinaryPacking's version advances it) - confirm both are intended
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 final int initpos = inpos.get();
@@ -317,6 +336,9 @@ public String toString() {
          * In case you need a different way to allocate buffers, you can override this method
          * with a custom behavior. The default implementation allocates a new Java direct
          * {@link ByteBuffer} on each invocation.
+         * 
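+         * @param sizeInBytes size of the buffer to allocate, in bytes
+         * @return the allocated buffer (a new direct {@link ByteBuffer} by default)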
          */
         protected ByteBuffer makeBuffer(int sizeInBytes) {
             return ByteBuffer.allocateDirect(sizeInBytes);
diff --git a/src/main/java/me/lemire/integercompression/GroupSimple9.java b/src/main/java/me/lemire/integercompression/GroupSimple9.java
index 0ce10ce..bd8acfa 100644
--- a/src/main/java/me/lemire/integercompression/GroupSimple9.java
+++ b/src/main/java/me/lemire/integercompression/GroupSimple9.java
@@ -13,3540 +13,3546 @@
 
 public final class GroupSimple9 implements IntegerCODEC, SkippableIntegerCODEC {
 
-	private static final int[][] M = { { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, { 9, 10, 11, 12, 13, 14, 15, 16, 17 },
-			{ 18, 19, 20, 21, 22, 23, 24, 25, 26 }, { 27, 28, 29, 30, 31, 32, 33, 34, 35 },
-			{ 36, 37, 38, 39, 40, 41, 42, 43, 44 }, { 45, 46, 47, 48, 49, 50, 51, 52, 53 },
-			{ 54, 55, 56, 57, 58, 59, 60, 61, 62 }, { 63, 64, 65, 66, 67, 68, 69, 70, 71 },
-			{ 72, 73, 74, 75, 76, 77, 78, 79, 80 } };
-
-	@Override
-	public void compress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		out[outpos.get()] = inlength;
-		outpos.increment();
-		headlessCompress(in, inpos, inlength, out, outpos);
-	}
-
-	private void encode0(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + (in[inf + i]);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode1(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode2(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 28 + i];// 第二个28位是低位存储的，所以浪费的1比特在最顶端。
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode3(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode4(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode5(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode6(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode7(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode8(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 24; i++)
-			out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 28 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode9(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode10(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++) {
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-
-		}
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode11(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode12(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode13(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode14(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode15(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode16(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode17(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 12; i++)
-			out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 14 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode18(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 9 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode19(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 9 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode20(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 9 + i];
-
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode21(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode22(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode23(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode24(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode25(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode26(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 8; i++)
-			out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 9 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode27(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode28(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode29(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode30(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode31(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode32(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode33(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode34(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode35(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 6; i++)
-			out[outf + 0] = (out[outf + 0] << 4) + in[inf + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6 + i];
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 7 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode36(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode37(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode38(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode39(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode40(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode41(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode42(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode43(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode44(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 4; i++)
-			out[outf + 0] = (out[outf + 0] << 5) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 4) + (in[inf + 4] >>> 1);
-		out[outf + 1] = (out[outf + 1] << 1) + ((in[inf + 4] << 31) >>> 31);
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 5 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode45(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode46(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode47(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode48(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode49(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode50(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode51(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode52(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode53(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 3; i++)
-			out[outf + 0] = (out[outf + 0] << 7) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 3) + (in[inf + 3] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 3] << 28) >>> 28);
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 4 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode54(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode55(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode56(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode57(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode58(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode59(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode60(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode61(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode62(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		for (int i = 0; i < 2; i++)
-			out[outf + 0] = (out[outf + 0] << 9) + in[inf + i];
-		out[outf + 0] = (out[outf + 0] << 6) + (in[inf + 2] >>> 3);
-		out[outf + 1] = (out[outf + 1] << 3) + ((in[inf + 2] << 29) >>> 29);
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 3 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode63(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode64(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode65(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode66(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode67(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode68(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode69(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode70(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode71(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 14) + in[inf];
-		out[outf + 0] = (out[outf + 0] << 10) + (in[inf + 1] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf + 1] << 28) >>> 28);
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 2 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode72(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 28; i++)
-			out[outf + 1] = (out[outf + 1] << 1) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode73(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 14; i++)
-			out[outf + 1] = (out[outf + 1] << 2) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode74(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 9; i++)
-			out[outf + 1] = (out[outf + 1] << 3) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode75(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 7; i++)
-			out[outf + 1] = (out[outf + 1] << 4) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode76(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 5; i++)
-			out[outf + 1] = (out[outf + 1] << 5) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode77(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 4; i++)
-			out[outf + 1] = (out[outf + 1] << 7) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode78(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 3; i++)
-			out[outf + 1] = (out[outf + 1] << 9) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode79(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 2; i++)
-			out[outf + 1] = (out[outf + 1] << 14) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	private void encode80(final int[] in, final int inf, final int code, final int[] out,
-			final int outf) {
-		out[outf + 0] = (out[outf + 0] << 24) + (in[inf] >>> 4);
-		out[outf + 1] = (out[outf + 1] << 4) + ((in[inf] << 28) >>> 28);
-		for (int i = 0; i < 1; i++)
-			out[outf + 1] = (out[outf + 1] << 28) + in[inf + 1 + i];
-		out[outf + 0] = code << 24 | out[outf + 0];
-		
-	}
-
-	@Override
-	public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		final int outlength = in[inpos.get()];
-		inpos.increment();
-		headlessUncompress(in, inpos, inlength, out, outpos, outlength);
-	}
-
-	
-	
-	private void decode80(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode79(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number :2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode78(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 27);
-		// number : 3, bitwidth :9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode77(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode76(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 25);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode75(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode74(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 27);
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode73(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode72(int val, int valn, int[] out, int currentPos) {
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (val << 8) >>> 4 | (valn >>> 28);
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode71(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode70(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode69(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 27);
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode68(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode67(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 25);
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode66(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode65(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 27);
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode64(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode63(int val, int valn, int[] out, int currentPos) {
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (val << 8) >>> 18;
-		out[currentPos++] = (val << 22) >>> 18 | (valn >>> 28);
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode62(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode61(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode60(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 27);
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode59(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode58(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 25);
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode57(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode56(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 27);
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode55(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode54(int val, int valn, int[] out, int currentPos) {
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (val << 8) >>> 23;
-		out[currentPos++] = (val << 17) >>> 23;
-		out[currentPos++] = (val << 26) >>> 23 | (valn >>> 28);
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode53(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode52(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode51(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 27);
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode50(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode49(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 25);
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode48(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode47(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 27);
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode46(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode45(int val, int valn, int[] out, int currentPos) {
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (val << 8) >>> 25;
-		out[currentPos++] = (val << 15) >>> 25;
-		out[currentPos++] = (val << 22) >>> 25;
-		out[currentPos++] = (val << 29) >>> 25 | (valn >>> 28);
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode44(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode43(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode42(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 27);
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode41(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode40(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 25);
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode39(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode38(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 27);
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode37(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode36(int val, int valn, int[] out, int currentPos) {
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (val << 8) >>> 27;
-		out[currentPos++] = (val << 13) >>> 27;
-		out[currentPos++] = (val << 18) >>> 27;
-		out[currentPos++] = (val << 23) >>> 27;
-		out[currentPos++] = (val << 28) >>> 27 | (valn >>> 28);
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode35(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode34(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode33(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 1) >>> 28;
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode32(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode31(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 3) >>> 28;
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode30(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode29(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 1) >>> 28;
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode28(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode27(int val, int valn, int[] out, int currentPos) {
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (val << 8) >>> 28;
-		out[currentPos++] = (val << 12) >>> 28;
-		out[currentPos++] = (val << 16) >>> 28;
-		out[currentPos++] = (val << 20) >>> 28;
-		out[currentPos++] = (val << 24) >>> 28;
-		out[currentPos++] = (val << 28) >>> 28;
-		out[currentPos++] = (valn << 0) >>> 28;
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode26(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode25(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-	
-	private void decode24(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 2) >>> 29;
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-	
-	private void decode23(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode22(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 4) >>> 29;
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-	}
-
-	private void decode21(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-	}
-
-	private void decode20(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 2) >>> 29;
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-	}
-
-	private void decode19(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode18(int val, int valn, int[] out, int currentPos) {
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (val << 8) >>> 29;
-		out[currentPos++] = (val << 11) >>> 29;
-		out[currentPos++] = (val << 14) >>> 29;
-		out[currentPos++] = (val << 17) >>> 29;
-		out[currentPos++] = (val << 20) >>> 29;
-		out[currentPos++] = (val << 23) >>> 29;
-		out[currentPos++] = (val << 26) >>> 29;
-		out[currentPos++] = (val << 29) >>> 29;
-		out[currentPos++] = (valn << 1) >>> 29;
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode17(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode16(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode15(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 1) >>> 30;
-		out[currentPos++] = (valn << 3) >>> 30;
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode14(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode13(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 3) >>> 30;
-		out[currentPos++] = (valn << 5) >>> 30;
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;
-		
-	}
-
-	private void decode12(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;
-		
-	}
-
-	private void decode11(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 1) >>> 30;
-		out[currentPos++] = (valn << 3) >>> 30;
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;
-		
-	}
-
-	private void decode10(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;
-	}
-
-	private void decode9(int val, int valn, int[] out, int currentPos) {
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (val << 8) >>> 30;
-		out[currentPos++] = (val << 10) >>> 30;
-		out[currentPos++] = (val << 12) >>> 30;
-		out[currentPos++] = (val << 14) >>> 30;
-		out[currentPos++] = (val << 16) >>> 30;
-		out[currentPos++] = (val << 18) >>> 30;
-		out[currentPos++] = (val << 20) >>> 30;
-		out[currentPos++] = (val << 22) >>> 30; // 10
-		out[currentPos++] = (val << 24) >>> 30;
-		out[currentPos++] = (val << 26) >>> 30;
-		out[currentPos++] = (val << 28) >>> 30;
-		out[currentPos++] = (val << 30) >>> 30;
-		out[currentPos++] = (valn << 0) >>> 30;
-		out[currentPos++] = (valn << 2) >>> 30;
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-	private void decode8(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 1, bitwidth : 28
-		out[currentPos++] = (valn << 4) >>> 4;
-	}
-
-	private void decode7(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 2, bitwidth : 14
-		out[currentPos++] = (valn << 4) >>> 18;
-		out[currentPos++] = (valn << 18) >>> 18;
-	}
-
-	private void decode6(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		out[currentPos++] = (valn << 4) >>> 31;
-		// number : 3, bitwidth : 9
-		out[currentPos++] = (valn << 5) >>> 23;
-		out[currentPos++] = (valn << 14) >>> 23;
-		out[currentPos++] = (valn << 23) >>> 23;
-	}
-
-	private void decode5(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 4, bitwidth : 7
-		out[currentPos++] = (valn << 4) >>> 25;
-		out[currentPos++] = (valn << 11) >>> 25;
-		out[currentPos++] = (valn << 18) >>> 25;
-		out[currentPos++] = (valn << 25) >>> 25;
-	}
-
-	private void decode4(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;// 头部3bit
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		// number : 5, bitwidth : 5
-		out[currentPos++] = (valn << 7) >>> 27;
-		out[currentPos++] = (valn << 12) >>> 27;
-		out[currentPos++] = (valn << 17) >>> 27;
-		out[currentPos++] = (valn << 22) >>> 27;
-		out[currentPos++] = (valn << 27) >>> 27;	
-	}
-
-	private void decode3(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 7, bitwidth : 4
-		out[currentPos++] = (valn << 4) >>> 28;
-		out[currentPos++] = (valn << 8) >>> 28;
-		out[currentPos++] = (valn << 12) >>> 28;
-		out[currentPos++] = (valn << 16) >>> 28;
-		out[currentPos++] = (valn << 20) >>> 28;
-		out[currentPos++] = (valn << 24) >>> 28;
-		out[currentPos++] = (valn << 28) >>> 28;		
-	}
-
-	private void decode2(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;// 头部1bit
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		out[currentPos++] = (valn << 4) >>> 31;
-		// number : 9, bitwidth : 3
-		out[currentPos++] = (valn << 5) >>> 29;
-		out[currentPos++] = (valn << 8) >>> 29;
-		out[currentPos++] = (valn << 11) >>> 29;
-		out[currentPos++] = (valn << 14) >>> 29;
-		out[currentPos++] = (valn << 17) >>> 29;
-		out[currentPos++] = (valn << 20) >>> 29;
-		out[currentPos++] = (valn << 23) >>> 29;
-		out[currentPos++] = (valn << 26) >>> 29;
-		out[currentPos++] = (valn << 29) >>> 29;	
-	}
-
-	private void decode1(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31;// 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 14, bitwidth : 2
-		out[currentPos++] = (valn << 4) >>> 30;
-		out[currentPos++] = (valn << 6) >>> 30;
-		out[currentPos++] = (valn << 8) >>> 30;
-		out[currentPos++] = (valn << 10) >>> 30;
-		out[currentPos++] = (valn << 12) >>> 30;
-		out[currentPos++] = (valn << 14) >>> 30;
-		out[currentPos++] = (valn << 16) >>> 30;
-		out[currentPos++] = (valn << 18) >>> 30;
-		out[currentPos++] = (valn << 20) >>> 30;
-		out[currentPos++] = (valn << 22) >>> 30; // 10
-		out[currentPos++] = (valn << 24) >>> 30;
-		out[currentPos++] = (valn << 26) >>> 30;
-		out[currentPos++] = (valn << 28) >>> 30;
-		out[currentPos++] = (valn << 30) >>> 30;		
-	}
-
-	private void decode0(int val, int valn, int[] out, int currentPos) {
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (val << 8) >>> 31;
-		out[currentPos++] = (val << 9) >>> 31;
-		out[currentPos++] = (val << 10) >>> 31;
-		out[currentPos++] = (val << 11) >>> 31;
-		out[currentPos++] = (val << 12) >>> 31;
-		out[currentPos++] = (val << 13) >>> 31; // 10
-		out[currentPos++] = (val << 14) >>> 31;
-		out[currentPos++] = (val << 15) >>> 31;
-		out[currentPos++] = (val << 16) >>> 31;
-		out[currentPos++] = (val << 17) >>> 31;
-		out[currentPos++] = (val << 18) >>> 31;
-		out[currentPos++] = (val << 19) >>> 31;
-		out[currentPos++] = (val << 20) >>> 31;
-		out[currentPos++] = (val << 21) >>> 31;
-		out[currentPos++] = (val << 22) >>> 31;
-		out[currentPos++] = (val << 23) >>> 31; // 20
-		out[currentPos++] = (val << 24) >>> 31;
-		out[currentPos++] = (val << 25) >>> 31;
-		out[currentPos++] = (val << 26) >>> 31;
-		out[currentPos++] = (val << 27) >>> 31;
-		out[currentPos++] = (val << 28) >>> 31;
-		out[currentPos++] = (val << 29) >>> 31;
-		out[currentPos++] = (val << 30) >>> 31;
-		out[currentPos++] = (val << 31) >>> 31;
-		out[currentPos++] = valn >>> 31;
-		out[currentPos++] = (valn << 1) >>> 31;
-		out[currentPos++] = (valn << 2) >>> 31;
-		out[currentPos++] = (valn << 3) >>> 31;
-		// number : 28, bitwidth : 1
-		out[currentPos++] = (valn << 4) >>> 31;
-		out[currentPos++] = (valn << 5) >>> 31;
-		out[currentPos++] = (valn << 6) >>> 31;
-		out[currentPos++] = (valn << 7) >>> 31;
-		out[currentPos++] = (valn << 8) >>> 31;
-		out[currentPos++] = (valn << 9) >>> 31;
-		out[currentPos++] = (valn << 10) >>> 31;
-		out[currentPos++] = (valn << 11) >>> 31;
-		out[currentPos++] = (valn << 12) >>> 31;
-		out[currentPos++] = (valn << 13) >>> 31; // 10
-		out[currentPos++] = (valn << 14) >>> 31;
-		out[currentPos++] = (valn << 15) >>> 31;
-		out[currentPos++] = (valn << 16) >>> 31;
-		out[currentPos++] = (valn << 17) >>> 31;
-		out[currentPos++] = (valn << 18) >>> 31;
-		out[currentPos++] = (valn << 19) >>> 31;
-		out[currentPos++] = (valn << 20) >>> 31;
-		out[currentPos++] = (valn << 21) >>> 31;
-		out[currentPos++] = (valn << 22) >>> 31;
-		out[currentPos++] = (valn << 23) >>> 31; // 20
-		out[currentPos++] = (valn << 24) >>> 31;
-		out[currentPos++] = (valn << 25) >>> 31;
-		out[currentPos++] = (valn << 26) >>> 31;
-		out[currentPos++] = (valn << 27) >>> 31;
-		out[currentPos++] = (valn << 28) >>> 31;
-		out[currentPos++] = (valn << 29) >>> 31;
-		out[currentPos++] = (valn << 30) >>> 31;
-		out[currentPos++] = (valn << 31) >>> 31;
-	}
-
-
-	private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
-
-	private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
-
-	@Override
-	public String toString() {
-		return this.getClass().getSimpleName();
-	}
-
-	@Override
-	public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		int tmpoutpos = outpos.get();
-		int currentPos = inpos.get();
-		int selector1 = 0;
-		int selector2 = 0;
-		final int finalin = currentPos + inlength;
-		while (currentPos < finalin - 28 * 2) {
-			int nextCurrentPos = currentPos;
-					mainloop1: for (selector1=0; selector1 <= 8; selector1++) {
-				int compressedNum = codeNum[selector1];
-				//if (finalin <= nextCurrentPos + compressedNum - 1)
-				//	compressedNum = finalin - nextCurrentPos;
-				int b = bitLength[selector1];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (Util.smallerorequalthan(max, in[nextCurrentPos + i]))
-						continue mainloop1;
-				}
-				nextCurrentPos += compressedNum;
-				break;
-			}
-			mainloop2: for (selector2 = 0; selector2 <= 8; selector2++) {
-				int compressedNum = codeNum[selector2];
-				//if (finalin <= nextCurrentPos + compressedNum - 1)
-				//	compressedNum = finalin - nextCurrentPos;
-				int b = bitLength[selector2];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (Util.smallerorequalthan(max, in[nextCurrentPos + i]))
-						continue mainloop2;
-				}
-				nextCurrentPos += compressedNum;
-				break;
-			}
-			int code = M[selector1][selector2];
-			out[tmpoutpos] = 0;
-			out[tmpoutpos + 1] = 0;
-			switch (code) {
-			case 0:
-				encode0(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 1:
-				encode1(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 2:
-				encode2(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 3:
-				encode3(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 4:
-				encode4(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 5:
-				encode5(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 6:
-				encode6(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 7:
-				encode7(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 8:
-				encode8(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 9:
-				encode9(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 10:
-				encode10(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 11:
-				encode11(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 12:
-				encode12(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 13:
-				encode13(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 14:
-				encode14(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 15:
-				encode15(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 16:
-				encode16(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 17:
-				encode17(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 18:
-				encode18(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 19:
-				encode19(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 20:
-				encode20(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 21:
-				encode21(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 22:
-				encode22(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 23:
-				encode23(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 24:
-				encode24(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 25:
-				encode25(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 26:
-				encode26(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 27:
-				encode27(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 28:
-				encode28(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 29:
-				encode29(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 30:
-				encode30(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 31:
-				encode31(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 32:
-				encode32(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 33:
-				encode33(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 34:
-				encode34(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 35:
-				encode35(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 36:
-				encode36(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 37:
-				encode37(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 38:
-				encode38(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 39:
-				encode39(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 40:
-				encode40(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 41:
-				encode41(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 42:
-				encode42(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 43:
-				encode43(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 44:
-				encode44(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 45:
-				encode45(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 46:
-				encode46(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 47:
-				encode47(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 48:
-				encode48(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 49:
-				encode49(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 50:
-				encode50(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 51:
-				encode51(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 52:
-				encode52(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 53:
-				encode53(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 54:
-				encode54(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 55:
-				encode55(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 56:
-				encode56(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 57:
-				encode57(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 58:
-				encode58(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 59:
-				encode59(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 60:
-				encode60(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 61:
-				encode61(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 62:
-				encode62(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 63:
-				encode63(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 64:
-				encode64(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 65:
-				encode65(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 66:
-				encode66(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 67:
-				encode67(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 68:
-				encode68(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 69:
-				encode69(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 70:
-				encode70(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 71:
-				encode71(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 72:
-				encode72(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 73:
-				encode73(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 74:
-				encode74(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 75:
-				encode75(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 76:
-				encode76(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 77:
-				encode77(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 78:
-				encode78(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 79:
-				encode79(in, currentPos, code, out, tmpoutpos);
-				break;
-			case 80:
-				encode80(in, currentPos, code, out, tmpoutpos);
-				break;
-			default:
-				throw new RuntimeException("unsupported code");
-			}// end switch
-			tmpoutpos += 2;
-			currentPos = nextCurrentPos;
-		}
-
-		outer: while (currentPos < finalin) {
-			mainloop: for (int selector = 0; selector < 8; selector++) {
-				int res = 0;
-				int compressedNum = codeNum[selector];
-				if (finalin <= currentPos + compressedNum - 1)
-					compressedNum = finalin - currentPos;
-				int b = bitLength[selector];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (Util.smallerorequalthan(max, in[currentPos + i]))
-						continue mainloop;
-					res = (res << b) + in[currentPos + i];
-				}
-				if (compressedNum != codeNum[selector]) {
-					res <<= (codeNum[selector] - compressedNum) * b;
-				}
-				res |= selector << 28;
-				out[tmpoutpos++] = res;
-
-				currentPos += compressedNum;
-				continue outer;
-			}
-			final int selector = 8;
-			out[tmpoutpos++] = in[currentPos++] | (selector << 28);
-		}
-		inpos.set(currentPos);
-		outpos.set(tmpoutpos);
-	}
-
-	@Override
-	public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
-		int currentPos = outpos.get();
-		int tmpinpos = inpos.get();
-		final int finalout = currentPos + num;
-		while (currentPos < finalout - 2 * 28) {
-
-			int val = in[tmpinpos++];
-			int valn = in[tmpinpos++];
-			int header = val >>> 24;
-			switch (header) {
-			case 0: {
-				decode0(val, valn, out, currentPos);
-				currentPos+=56;
-				break;
-			}
-			case 1: { 
-				decode1(val, valn, out, currentPos);
-				currentPos+=42;
-				break;
-			}
-			case 2: {
-				decode2(val, valn, out, currentPos);
-				currentPos+=37;
-				break;
-			}
-			case 3: {
-				decode3(val, valn, out, currentPos);
-				currentPos+=35;		
-				break;
-			}
-			case 4: {
-				decode4(val, valn, out, currentPos);
-				currentPos+=33;
-				break;
-			}
-			case 5: {
-				decode5(val, valn, out, currentPos);
-				currentPos+=32;
-				break;
-			}
-			case 6: {
-				decode6(val, valn, out, currentPos);
-				currentPos+=31;
-				break;
-			}
-			case 7: {
-				decode7(val, valn, out, currentPos);
-				currentPos+=30;
-				break;
-			}
-			case 8: {
-				decode8(val, valn, out, currentPos);
-				currentPos+=29;
-				break;
-			}
-			case 9: {
-				decode9(val, valn, out, currentPos);
-				currentPos+=42;	
-				break;
-			}
-			case 10: {
-				decode10(val, valn, out, currentPos);
-				currentPos+=28;	
-				break;
-			}
-			case 11: { 
-				decode11(val, valn, out, currentPos);
-				currentPos+=23;
-				break;
-			}
-			case 12: {
-				decode12(val, valn, out, currentPos);
-				currentPos+=21;
-				break;
-			}
-			case 13: {
-				decode13(val, valn, out, currentPos);
-				currentPos+=19;
-				break;
-			}
-			case 14: {
-				decode14(val, valn, out, currentPos);
-				currentPos+=18;
-				break;
-			}
-			case 15: {
-				decode15(val, valn, out, currentPos);
-				currentPos+=17;
-				break;
-			}
-			case 16: {
-				decode16(val, valn, out, currentPos);
-				currentPos+=16;
-				break;
-			}
-			case 17: {
-				decode17(val, valn, out, currentPos);
-				currentPos+=15;
-				break;
-			}
-			case 18: {
-				decode18(val, valn, out, currentPos);
-				currentPos+=37;
-				break;
-			}
-			case 19: {
-				decode19(val, valn, out, currentPos);
-				currentPos+=23;
-				break;
-			}
-			case 20: {
-				decode20(val, valn, out, currentPos);
-				currentPos+=18;
-				break;
-			}
-			case 21: {
-				decode21(val, valn, out, currentPos);
-				currentPos+=16;
-				break;
-			}
-			case 22: { 
-				decode22(val, valn, out, currentPos);
-				currentPos+=14;
-				break;
-			}
-			case 23: {
-				decode23(val, valn, out, currentPos);
-				currentPos+=13;
-				break;
-			}
-			case 24: {
-				decode24(val, valn, out, currentPos);
-				currentPos+=12;
-				break;
-			}
-			case 25: {
-				decode25(val, valn, out, currentPos);
-				currentPos+=11;
-				break;
-			}
-			case 26: {
-				decode26(val, valn, out, currentPos);
-				currentPos+=10;
-				break;
-			}
-			case 27: {
-				decode27(val, valn, out, currentPos);
-				currentPos+=35;
-				break;
-			}
-			case 28: {
-				decode28(val, valn, out, currentPos);
-				currentPos+=21;
-				break;
-			}
-			case 29: { 
-				decode29(val, valn, out, currentPos);
-				currentPos+=16;
-				break;
-			}
-
-			case 30: {
-				decode30(val, valn, out, currentPos);
-				currentPos+=14;
-				break;
-			}
-			case 31: { 
-				decode31(val, valn, out, currentPos);
-				currentPos+=12;
-				break;
-			}
-			case 32: {
-				decode32(val, valn, out, currentPos);
-				currentPos+=11;
-				break;
-			}
-			case 33: {
-				decode33(val, valn, out, currentPos);
-				currentPos+=10;
-				break;
-			}
-			case 34: {
-				decode34(val, valn, out, currentPos);
-				currentPos+=9;
-				break;
-			}
-			case 35: {
-				decode35(val, valn, out, currentPos);
-				currentPos+=8;
-				break;
-			}
-			case 36: {
-				decode36(val, valn, out, currentPos);
-				currentPos+=33;
-				break;
-			}
-			case 37: {
-				decode37(val, valn, out, currentPos);
-				currentPos+=19;
-				break;
-			}
-			case 38: {
-				decode38(val, valn, out, currentPos);
-				currentPos+=14;
-				break;
-			}
-			case 39: {
-				decode39(val, valn, out, currentPos);
-				currentPos+=12;
-				break;
-			}
-			case 40: {
-				decode40(val, valn, out, currentPos);
-				currentPos+=10;
-				break;
-			}
-			case 41: {
-				decode41(val, valn, out, currentPos);
-				currentPos+=9;
-				break;
-			}
-			case 42: { 
-				decode42(val, valn, out, currentPos);
-				currentPos+=8;
-				break;
-			}
-			case 43: { 
-				decode43(val, valn, out, currentPos);
-				currentPos+=7;
-				break;
-			}
-			case 44: {
-				decode44(val, valn, out, currentPos);
-				currentPos+=6;
-				break;
-			}
-			case 45: {
-				decode45(val, valn, out, currentPos);
-				currentPos+=32;
-				break;
-			}
-			case 46: {
-				decode46(val, valn, out, currentPos);
-				currentPos+=18;
-				break;
-			}
-			case 47: { 
-				decode47(val, valn, out, currentPos);
-				currentPos+=13;
-				break;
-			}
-			case 48: {
-				decode48(val, valn, out, currentPos);
-				currentPos+=11;
-				break;
-			}
-			case 49: {
-				decode49(val, valn, out, currentPos);
-				currentPos+=9;
-				break;
-			}
-			case 50: {
-				decode50(val, valn, out, currentPos);
-				currentPos+=8;
-				break;
-			}
-			case 51: {
-				decode51(val, valn, out, currentPos);
-				currentPos+=7;
-				break;
-			}
-			case 52: { 
-				decode52(val, valn, out, currentPos);
-				currentPos+=6;
-				break;
-			}
-			case 53: {
-				decode53(val, valn, out, currentPos);
-				currentPos+=5;
-				break;
-			}
-			case 54: {
-				decode54(val, valn, out, currentPos);
-				currentPos+=31;
-				break;
-			}
-			case 55: {
-				decode55(val, valn, out, currentPos);
-				currentPos+=17;
-				break;
-			}
-			case 56: {
-				decode56(val, valn, out, currentPos);
-				currentPos+=12;
-				break;
-			}
-			case 57: {
-				decode57(val, valn, out, currentPos);
-				currentPos+=10;
-				break;
-			}
-			case 58: { 
-				decode58(val, valn, out, currentPos);
-				currentPos+=8;
-				break;
-			}
-			case 59: {
-				decode59(val, valn, out, currentPos);
-				currentPos+=7;
-				break;
-			}
-			case 60: {
-				decode60(val, valn, out, currentPos);
-				currentPos+=6;
-				break;
-			}
-			case 61: { 
-				decode61(val, valn, out, currentPos);
-				currentPos+=5;
-				break;
-			}
-			case 62: {
-				decode62(val, valn, out, currentPos);
-				currentPos+=4;
-				break;
-			}
-			case 63: {
-				decode63(val, valn, out, currentPos);
-				currentPos+=30;
-				break;
-			}
-			case 64: {
-				decode64(val, valn, out, currentPos);
-				currentPos+=16;
-				break;
-			}
-			case 65: { 
-				decode65(val, valn, out, currentPos);
-				currentPos+=11;
-				break;
-			}
-			case 66: { 
-				decode66(val, valn, out, currentPos);
-				currentPos+=9;
-				break;
-			}
-			case 67: {
-				decode67(val, valn, out, currentPos);
-				currentPos+=7;
-				break;
-			}
-			case 68: { 
-				decode68(val, valn, out, currentPos);
-				currentPos+=6;
-				break;
-			}
-			case 69: { 
-				decode69(val, valn, out, currentPos);
-				currentPos+=5;
-				break;
-			}
-			case 70: {
-				decode70(val, valn, out, currentPos);
-				currentPos+=4;
-				break;
-			}
-			case 71: {
-				decode71(val, valn, out, currentPos);
-				currentPos+=3;
-				break;
-			}
-			case 72: { 
-				decode72(val, valn, out, currentPos);
-				currentPos+=29;
-				break;
-			}
-			case 73: {
-				decode73(val, valn, out, currentPos);
-				currentPos+=15;
-				break;
-			}
-			case 74: {
-				decode74(val, valn, out, currentPos);
-				currentPos+=10;
-				break;
-			}
-			case 75: {
-				decode75(val, valn, out, currentPos);
-				currentPos+=8;
-				break;
-			}
-			case 76: {
-				decode76(val, valn, out, currentPos);
-				currentPos+=6;
-				break;
-			}
-			case 77: {
-				decode77(val, valn, out, currentPos);
-				currentPos+=5;
-				break;
-			}
-			case 78: {
-				decode78(val, valn, out, currentPos);
-				currentPos+=4;
-				break;
-			}
-			case 79: {
-				decode79(val, valn, out, currentPos);
-				currentPos+=3;
-				break;
-			}
-			case 80: {
-				decode80(val, valn, out, currentPos);
-				currentPos+=2;
-				break;
-			}
-			default:
-				throw new RuntimeException("Wrong code: " + header);
-			}// end switch
-		} // end while
-
-		while (currentPos < finalout) {
-			int val = in[tmpinpos++];
-			int header = val >>> 28;
-			switch (header) {
-			case 0: { // number : 28, bitwidth : 1
-				final int howmany = finalout - currentPos < 28 ? finalout - currentPos : 28;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (k + 4)) >>> 31;
-				}
-				break;
-			}
-			case 1: { // number : 14, bitwidth : 2
-				final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (2 * k + 4)) >>> 30;
-				}
-				break;
-			}
-			case 2: { // number : 9, bitwidth : 3
-				final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (3 * k + 5)) >>> 29;
-				}
-				break;
-			}
-			case 3: { // number : 7, bitwidth : 4
-				final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (4 * k + 4)) >>> 28;
-				}
-				break;
-			}
-			case 4: { // number : 5, bitwidth : 5
-				final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (5 * k + 7)) >>> 27;
-				}
-				break;
-			}
-			case 5: { // number : 4, bitwidth : 7
-				final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (7 * k + 4)) >>> 25;
-				}
-				break;
-			}
-			case 6: { // number : 3, bitwidth : 9
-				final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (9 * k + 5)) >>> 23;
-				}
-				break;
-			}
-			case 7: { // number : 2, bitwidth : 14
-				final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (14 * k + 4)) >>> 18;
-				}
-				break;
-			}
-			case 8: { // number : 1, bitwidth : 28
-				out[currentPos++] = (val << 4) >>> 4;
-				break;
-			}
-			default: {
-				throw new RuntimeException("shouldn't happen");
-			}
-			}
-		}
-
-		outpos.set(finalout);
-		inpos.set(tmpinpos);
-		
-	}
-}
\ No newline at end of file
+    private static final int[][] M = { { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, { 9, 10, 11, 12, 13, 14, 15, 16, 17 }, // 9 x 9 code table
+            { 18, 19, 20, 21, 22, 23, 24, 25, 26 }, { 27, 28, 29, 30, 31, 32, 33, 34, 35 },
+            { 36, 37, 38, 39, 40, 41, 42, 43, 44 }, { 45, 46, 47, 48, 49, 50, 51, 52, 53 },
+            { 54, 55, 56, 57, 58, 59, 60, 61, 62 }, { 63, 64, 65, 66, 67, 68, 69, 70, 71 },
+            { 72, 73, 74, 75, 76, 77, 78, 79, 80 } }; // M[i][j] == 9 * i + j, so each (i, j) with i, j in 0..8 maps to a distinct code in 0..80; presumably i and j are the layouts of the first and second packed group, matching the encodeN numbering
+
+    @Override
+    public void compress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) { // Length-prefixed entry point: writes inlength as a header word, then delegates to headlessCompress.
+        if (inlength == 0)
+            return; // empty input: nothing is written, not even the length word
+        out[outpos.get()] = inlength; // header word = number of input ints
+        outpos.increment(); // step past the header (assumes increment() advances by one - TODO confirm)
+        headlessCompress(in, inpos, inlength, out, outpos);
+    }
+
+    private void encode0(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 28 x 1-bit values (56 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + (in[inf + i]);
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 28; i++) // in[inf + 28 .. inf + 55] -> out[outf + 1], 1 bit each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode1(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 14 x 2-bit values (42 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 14; i++) // in[inf + 28 .. inf + 41] -> out[outf + 1], 2 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode2(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 9 x 3-bit values (37 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 9; i++) // in[inf + 28 .. inf + 36] -> out[outf + 1], 3 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 28 + i];// The second 28-bit slot is stored in the low bits, so its 1 wasted bit ends up at the very top.
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode3(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 7 x 4-bit values (35 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 7; i++) // in[inf + 28 .. inf + 34] -> out[outf + 1], 4 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode4(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 5 x 5-bit values (33 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 5; i++) // in[inf + 28 .. inf + 32] -> out[outf + 1], 5 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode5(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 4 x 7-bit values (32 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 4; i++) // in[inf + 28 .. inf + 31] -> out[outf + 1], 7 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode6(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 3 x 9-bit values (31 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 3; i++) // in[inf + 28 .. inf + 30] -> out[outf + 1], 9 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode7(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 2 x 14-bit values (30 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 2; i++) // in[inf + 28 .. inf + 29] -> out[outf + 1], 14 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode8(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 28 x 1-bit values, then 1 x 28-bit value (29 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 24; i++) // in[inf .. inf + 23] -> out[outf], 1 bit each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 1) + in[inf + i];
+        for (int i = 0; i < 4; i++) // in[inf + 24 .. inf + 27] -> start of out[outf + 1], 1 bit each
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 24 + i];
+        for (int i = 0; i < 1; i++) // in[inf + 28] -> out[outf + 1], 28 bits, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 28) + in[inf + 28 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode9(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 28 x 1-bit values (42 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 28; i++) // in[inf + 14 .. inf + 41] -> out[outf + 1], 1 bit each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode10(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 14 x 2-bit values (28 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) { // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+
+        }
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 14; i++) // in[inf + 14 .. inf + 27] -> out[outf + 1], 2 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode11(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 9 x 3-bit values (23 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 9; i++) // in[inf + 14 .. inf + 22] -> out[outf + 1], 3 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode12(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 7 x 4-bit values (21 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 7; i++) // in[inf + 14 .. inf + 20] -> out[outf + 1], 4 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode13(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 5 x 5-bit values (19 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 5; i++) // in[inf + 14 .. inf + 18] -> out[outf + 1], 5 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode14(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 4 x 7-bit values (18 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 4; i++) // in[inf + 14 .. inf + 17] -> out[outf + 1], 7 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode15(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 3 x 9-bit values (17 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 3; i++) // in[inf + 14 .. inf + 16] -> out[outf + 1], 9 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode16(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 2 x 14-bit values (16 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 2; i++) // in[inf + 14 .. inf + 15] -> out[outf + 1], 14 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode17(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 14 x 2-bit values, then 1 x 28-bit value (15 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 12; i++) // in[inf .. inf + 11] -> out[outf], 2 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 2) + in[inf + i];
+        for (int i = 0; i < 2; i++) // in[inf + 12 .. inf + 13] -> start of out[outf + 1], 2 bits each
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 12 + i];
+        for (int i = 0; i < 1; i++) // in[inf + 14] -> out[outf + 1], 28 bits, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 28) + in[inf + 14 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode18(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 28 x 1-bit values (37 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 28; i++) // in[inf + 9 .. inf + 36] -> out[outf + 1], 1 bit each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + 9 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode19(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 14 x 2-bit values (23 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 14; i++) // in[inf + 9 .. inf + 22] -> out[outf + 1], 2 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + 9 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode20(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 9 x 3-bit values (18 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 9; i++) // in[inf + 9 .. inf + 17] -> out[outf + 1], 3 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 9 + i];
+
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode21(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 7 x 4-bit values (16 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 7; i++) // in[inf + 9 .. inf + 15] -> out[outf + 1], 4 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + 9 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode22(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 5 x 5-bit values (14 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 5; i++) // in[inf + 9 .. inf + 13] -> out[outf + 1], 5 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + 9 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode23(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 4 x 7-bit values (13 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 4; i++) // in[inf + 9 .. inf + 12] -> out[outf + 1], 7 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + 9 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    private void encode24(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) { // Packs 9 x 3-bit values, then 3 x 9-bit values (12 ints from in[inf]), into out[outf] and out[outf + 1].
+        for (int i = 0; i < 8; i++) // in[inf .. inf + 7] -> out[outf], 3 bits each (24 bits)
+            out[outf + 0] = (out[outf + 0] << 3) + in[inf + i];
+        for (int i = 0; i < 1; i++) // in[inf + 8] -> start of out[outf + 1], 3 bits
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8 + i];
+        for (int i = 0; i < 3; i++) // in[inf + 9 .. inf + 11] -> out[outf + 1], 9 bits each, below the bits already there
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + 9 + i];
+        out[outf + 0] = code << 24 | out[outf + 0]; // code goes into bits 24..31; NOTE(review): words are built on their current contents and inputs are not masked - presumably the caller zeroes both words and pre-checks widths
+        
+    }
+
+    /** Word 0: code << 24 | 8 inputs packed at a 3-bit stride.
+     *  Word 1: 1 input at a 3-bit stride, then 2 inputs at a 14-bit stride. */
+    private void encode25(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 8; ++k)
+            out[outf] = (out[outf] << 3) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8];
+        for (int k = 9; k < 11; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 8 inputs packed at a 3-bit stride.
+     *  Word 1: 1 input at a 3-bit stride, then 1 input at a 28-bit stride.
+     *  NOTE(review): both output slots are shifted, not overwritten; presumably zero on entry, confirm. */
+    private void encode26(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 8; ++k)
+            out[outf] = (out[outf] << 3) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 3) + in[inf + 8];
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 9];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 28 inputs at a 1-bit stride. */
+    private void encode27(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 35; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 14 inputs at a 2-bit stride. */
+    private void encode28(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 21; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 9 inputs at a 3-bit stride. */
+    private void encode29(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 16; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 7 inputs at a 4-bit stride. */
+    private void encode30(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 14; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 5 inputs at a 5-bit stride. */
+    private void encode31(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 12; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 4 inputs at a 7-bit stride. */
+    private void encode32(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 11; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 3 inputs at a 9-bit stride. */
+    private void encode33(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 10; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 2 inputs at a 14-bit stride. */
+    private void encode34(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        for (int k = 7; k < 9; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 6 inputs packed at a 4-bit stride.
+     *  Word 1: 1 input at a 4-bit stride, then 1 input at a 28-bit stride.
+     *  NOTE(review): both output slots are shifted, not overwritten; presumably zero on entry, confirm. */
+    private void encode35(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 6; ++k)
+            out[outf] = (out[outf] << 4) + in[inf + k];
+        out[outf + 1] = (out[outf + 1] << 4) + in[inf + 6];
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 7];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 28 x 1-bit inputs. */
+    private void encode36(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 33; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 14 x 2-bit inputs. */
+    private void encode37(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 19; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 9 x 3-bit inputs. */
+    private void encode38(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 14; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 7 x 4-bit inputs. */
+    private void encode39(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 12; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 5 x 5-bit inputs. */
+    private void encode40(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 10; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 4 x 7-bit inputs. */
+    private void encode41(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 9; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 3 x 9-bit inputs. */
+    private void encode42(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 8; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit | (in[inf + 4] >>> 1); word 1: (in[inf + 4] & 1), then 2 x 14-bit inputs. */
+    private void encode43(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        for (int k = 5; k < 7; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 4 x 5-bit inputs | (in[inf + 4] >>> 1).
+     *  Word 1: (in[inf + 4] & 1), then 1 input at a 28-bit stride. */
+    private void encode44(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 4; ++k)
+            out[outf] = (out[outf] << 5) + in[inf + k];
+        out[outf] = (out[outf] << 4) + (in[inf + 4] >>> 1);
+        out[outf + 1] = (out[outf + 1] << 1) + (in[inf + 4] & 1);
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 5];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 28 x 1-bit inputs. */
+    private void encode45(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 32; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 14 x 2-bit inputs. */
+    private void encode46(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 18; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 9 x 3-bit inputs. */
+    private void encode47(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 13; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 7 x 4-bit inputs. */
+    private void encode48(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 11; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 5 x 5-bit inputs. */
+    private void encode49(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 9; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 4 x 7-bit inputs. */
+    private void encode50(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 8; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 3 x 9-bit inputs. */
+    private void encode51(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 7; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit | (in[inf + 3] >>> 4); word 1: (in[inf + 3] & 0xF), then 2 x 14-bit inputs. */
+    private void encode52(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        for (int k = 4; k < 6; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 3 x 7-bit inputs | (in[inf + 3] >>> 4).
+     *  Word 1: (in[inf + 3] & 0xF), then 1 input at a 28-bit stride. */
+    private void encode53(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 3; ++k)
+            out[outf] = (out[outf] << 7) + in[inf + k];
+        out[outf] = (out[outf] << 3) + (in[inf + 3] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 3] & 0xF);
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 4];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 28 x 1-bit inputs. */
+    private void encode54(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 31; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 14 x 2-bit inputs. */
+    private void encode55(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 17; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 9 x 3-bit inputs. */
+    private void encode56(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 12; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 7 x 4-bit inputs. */
+    private void encode57(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 10; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 5 x 5-bit inputs. */
+    private void encode58(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 8; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 4 x 7-bit inputs. */
+    private void encode59(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 7; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 3 x 9-bit inputs. */
+    private void encode60(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 6; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit | (in[inf + 2] >>> 3); word 1: (in[inf + 2] & 7), then 2 x 14-bit inputs. */
+    private void encode61(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        for (int k = 3; k < 5; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | 2 x 9-bit inputs | (in[inf + 2] >>> 3).
+     *  Word 1: (in[inf + 2] & 7), then 1 input at a 28-bit stride. */
+    private void encode62(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        for (int k = 0; k < 2; ++k)
+            out[outf] = (out[outf] << 9) + in[inf + k];
+        out[outf] = (out[outf] << 6) + (in[inf + 2] >>> 3);
+        out[outf + 1] = (out[outf + 1] << 3) + (in[inf + 2] & 7);
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 3];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] shifted up 10 | (in[inf + 1] >>> 4).
+     *  Word 1: (in[inf + 1] & 0xF), then 28 inputs at a 1-bit stride. */
+    private void encode63(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 30; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 14 x 2-bit inputs. */
+    private void encode64(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 16; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 9 x 3-bit inputs. */
+    private void encode65(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 11; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 7 x 4-bit inputs. */
+    private void encode66(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 9; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 5 x 5-bit inputs. */
+    private void encode67(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 7; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 4 x 7-bit inputs. */
+    private void encode68(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 6; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 3 x 9-bit inputs. */
+    private void encode69(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 5; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4); word 1: (in[inf + 1] & 0xF), then 2 x 14-bit inputs. */
+    private void encode70(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        for (int k = 2; k < 4; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | in[inf] << 10 | (in[inf + 1] >>> 4).
+     *  Word 1: (in[inf + 1] & 0xF), then 1 input at a 28-bit stride. */
+    private void encode71(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 14) + in[inf];
+        out[outf] = (out[outf] << 10) + (in[inf + 1] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf + 1] & 0xF);
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 2];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4).
+     *  Word 1: (in[inf] & 0xF), then 28 inputs at a 1-bit stride. */
+    private void encode72(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 29; ++k)
+            out[outf + 1] = (out[outf + 1] << 1) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 14 x 2-bit inputs. */
+    private void encode73(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 15; ++k)
+            out[outf + 1] = (out[outf + 1] << 2) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 9 x 3-bit inputs. */
+    private void encode74(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 10; ++k)
+            out[outf + 1] = (out[outf + 1] << 3) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 7 x 4-bit inputs. */
+    private void encode75(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 8; ++k)
+            out[outf + 1] = (out[outf + 1] << 4) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 5 x 5-bit inputs. */
+    private void encode76(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 6; ++k)
+            out[outf + 1] = (out[outf + 1] << 5) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 4 x 7-bit inputs. */
+    private void encode77(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 5; ++k)
+            out[outf + 1] = (out[outf + 1] << 7) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 3 x 9-bit inputs. */
+    private void encode78(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 4; ++k)
+            out[outf + 1] = (out[outf + 1] << 9) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4); word 1: (in[inf] & 0xF), then 2 x 14-bit inputs. */
+    private void encode79(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        for (int k = 1; k < 3; ++k)
+            out[outf + 1] = (out[outf + 1] << 14) + in[inf + k];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Word 0: code << 24 | (in[inf] >>> 4).
+     *  Word 1: (in[inf] & 0xF), then 1 input at a 28-bit stride. */
+    private void encode80(final int[] in, final int inf, final int code, final int[] out,
+            final int outf) {
+        out[outf] = (out[outf] << 24) + (in[inf] >>> 4);
+        out[outf + 1] = (out[outf + 1] << 4) + (in[inf] & 0xF);
+        out[outf + 1] = (out[outf + 1] << 28) + in[inf + 1];
+        out[outf] = (code << 24) | out[outf];
+    }
+
+    /** Reads the output length stored at in[inpos], steps inpos past it, then delegates to headlessUncompress. */
+    @Override
+    public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        if (inlength == 0) return; // nothing to read, not even the length word
+        final int storedLength = in[inpos.get()];
+        inpos.increment();
+        headlessUncompress(in, inpos, inlength, out, outpos, storedLength);
+    }
+
+    
+    
+    /** Writes two values: low 24 bits of val joined with the top 4 bits of valn, then the low 28 bits of valn. */
+    private void decode80(int val, int valn, int[] out, int currentPos) {
+        // masking val drops its top byte before the 24 payload bits are moved up by 4
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        out[currentPos + 1] = valn & 0x0FFFFFFF;
+    }
+
+    /** Writes three values: low 24 bits of val joined with the top 4 bits of valn, then two 14-bit fields of valn. */
+    private void decode79(int val, int valn, int[] out, int currentPos) {
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        // 2 fields, 14 bits each, most significant first
+        out[currentPos + 1] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 2] = valn & 0x3FFF;
+    }
+
+    /** Writes four values: low 24 bits of val shifted up 4, OR'd with valn >>> 27; then three 9-bit fields of valn. */
+    private void decode78(int val, int valn, int[] out, int currentPos) {
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 27);
+        // 3 fields, 9 bits each, most significant first
+        out[currentPos + 1] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 2] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 3] = valn & 0x1FF;
+    }
+
+    /** Writes five values: low 24 bits of val joined with the top 4 bits of valn, then four 7-bit fields of valn. */
+    private void decode77(int val, int valn, int[] out, int currentPos) {
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        // 4 fields, 7 bits each, most significant first
+        out[currentPos + 1] = (valn >>> 21) & 0x7F;
+        out[currentPos + 2] = (valn >>> 14) & 0x7F;
+        out[currentPos + 3] = (valn >>> 7) & 0x7F;
+        out[currentPos + 4] = valn & 0x7F;
+    }
+
+    /** Writes six values: low 24 bits of val shifted up 4, OR'd with valn >>> 25; then five 5-bit fields of valn. */
+    private void decode76(int val, int valn, int[] out, int currentPos) {
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 25);
+        // 5 fields, 5 bits each, most significant first
+        out[currentPos + 1] = (valn >>> 20) & 0x1F;
+        out[currentPos + 2] = (valn >>> 15) & 0x1F;
+        out[currentPos + 3] = (valn >>> 10) & 0x1F;
+        out[currentPos + 4] = (valn >>> 5) & 0x1F;
+        out[currentPos + 5] = valn & 0x1F;
+    }
+
+    /** Writes eight values: low 24 bits of val joined with the top 4 bits of valn, then seven 4-bit fields of valn. */
+    private void decode75(int val, int valn, int[] out, int currentPos) {
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        // 7 fields, 4 bits each, most significant first
+        out[currentPos + 1] = (valn >>> 24) & 0xF;
+        out[currentPos + 2] = (valn >>> 20) & 0xF;
+        out[currentPos + 3] = (valn >>> 16) & 0xF;
+        out[currentPos + 4] = (valn >>> 12) & 0xF;
+        out[currentPos + 5] = (valn >>> 8) & 0xF;
+        out[currentPos + 6] = (valn >>> 4) & 0xF;
+        out[currentPos + 7] = valn & 0xF;
+    }
+
+    private void decode74(int val, int valn, int[] out, int currentPos) {
+        // 1 x 28 bits: val[23..0] << 4 OR-ed with valn >>> 27 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 27);
+        // 9 x 3 bits from valn[26..0], most significant field first.
+        out[currentPos + 1] = (valn >>> 24) & 0x7;
+        out[currentPos + 2] = (valn >>> 21) & 0x7;
+        out[currentPos + 3] = (valn >>> 18) & 0x7;
+        out[currentPos + 4] = (valn >>> 15) & 0x7;
+        out[currentPos + 5] = (valn >>> 12) & 0x7;
+        out[currentPos + 6] = (valn >>> 9) & 0x7;
+        out[currentPos + 7] = (valn >>> 6) & 0x7;
+        out[currentPos + 8] = (valn >>> 3) & 0x7;
+        out[currentPos + 9] = valn & 0x7;
+    }
+
+    private void decode73(int val, int valn, int[] out, int currentPos) {
+        // 1 x 28 bits: val[23..0] << 4 joined with valn[31..28].
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        // 14 x 2 bits from valn[27..0], most significant pair first.
+        out[currentPos + 1] = (valn >>> 26) & 0x3;
+        out[currentPos + 2] = (valn >>> 24) & 0x3;
+        out[currentPos + 3] = (valn >>> 22) & 0x3;
+        out[currentPos + 4] = (valn >>> 20) & 0x3;
+        out[currentPos + 5] = (valn >>> 18) & 0x3;
+        out[currentPos + 6] = (valn >>> 16) & 0x3;
+        out[currentPos + 7] = (valn >>> 14) & 0x3;
+        out[currentPos + 8] = (valn >>> 12) & 0x3;
+        out[currentPos + 9] = (valn >>> 10) & 0x3;
+        out[currentPos + 10] = (valn >>> 8) & 0x3;
+        out[currentPos + 11] = (valn >>> 6) & 0x3;
+        out[currentPos + 12] = (valn >>> 4) & 0x3;
+        out[currentPos + 13] = (valn >>> 2) & 0x3;
+        out[currentPos + 14] = valn & 0x3;
+    }
+
+    private void decode72(int val, int valn, int[] out, int currentPos) {
+        // 1 x 28 bits: val[23..0] << 4 joined with valn[31..28].
+        out[currentPos] = ((val & 0x00FFFFFF) << 4) | (valn >>> 28);
+        // 28 x 1 bit from valn[27..0], most significant bit first.
+        out[currentPos + 1] = (valn >>> 27) & 1;
+        out[currentPos + 2] = (valn >>> 26) & 1;
+        out[currentPos + 3] = (valn >>> 25) & 1;
+        out[currentPos + 4] = (valn >>> 24) & 1;
+        out[currentPos + 5] = (valn >>> 23) & 1;
+        out[currentPos + 6] = (valn >>> 22) & 1;
+        out[currentPos + 7] = (valn >>> 21) & 1;
+        out[currentPos + 8] = (valn >>> 20) & 1;
+        out[currentPos + 9] = (valn >>> 19) & 1;
+        out[currentPos + 10] = (valn >>> 18) & 1;
+        out[currentPos + 11] = (valn >>> 17) & 1;
+        out[currentPos + 12] = (valn >>> 16) & 1;
+        out[currentPos + 13] = (valn >>> 15) & 1;
+        out[currentPos + 14] = (valn >>> 14) & 1;
+        out[currentPos + 15] = (valn >>> 13) & 1;
+        out[currentPos + 16] = (valn >>> 12) & 1;
+        out[currentPos + 17] = (valn >>> 11) & 1;
+        out[currentPos + 18] = (valn >>> 10) & 1;
+        out[currentPos + 19] = (valn >>> 9) & 1;
+        out[currentPos + 20] = (valn >>> 8) & 1;
+        out[currentPos + 21] = (valn >>> 7) & 1;
+        out[currentPos + 22] = (valn >>> 6) & 1;
+        out[currentPos + 23] = (valn >>> 5) & 1;
+        out[currentPos + 24] = (valn >>> 4) & 1;
+        out[currentPos + 25] = (valn >>> 3) & 1;
+        out[currentPos + 26] = (valn >>> 2) & 1;
+        out[currentPos + 27] = (valn >>> 1) & 1;
+        out[currentPos + 28] = valn & 1;
+    }
+
+    private void decode71(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 1 x 28 bits: valn[27..0].
+        out[currentPos + 2] = valn & 0x0FFFFFFF;
+    }
+
+    private void decode70(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 2 x 14 bits: valn[27..14], valn[13..0].
+        out[currentPos + 2] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 3] = valn & 0x3FFF;
+    }
+
+    private void decode69(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 OR-ed with valn >>> 27 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 27);
+        // 3 x 9 bits: valn[26..18], valn[17..9], valn[8..0].
+        out[currentPos + 2] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 3] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 4] = valn & 0x1FF;
+    }
+
+    private void decode68(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 4 x 7 bits: valn[27..21], valn[20..14], valn[13..7], valn[6..0].
+        out[currentPos + 2] = (valn >>> 21) & 0x7F;
+        out[currentPos + 3] = (valn >>> 14) & 0x7F;
+        out[currentPos + 4] = (valn >>> 7) & 0x7F;
+        out[currentPos + 5] = valn & 0x7F;
+    }
+
+    private void decode67(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 OR-ed with valn >>> 25 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 25);
+        // 5 x 5 bits from valn[24..0], most significant field first.
+        out[currentPos + 2] = (valn >>> 20) & 0x1F;
+        out[currentPos + 3] = (valn >>> 15) & 0x1F;
+        out[currentPos + 4] = (valn >>> 10) & 0x1F;
+        out[currentPos + 5] = (valn >>> 5) & 0x1F;
+        out[currentPos + 6] = valn & 0x1F;
+    }
+
+    private void decode66(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 7 x 4 bits from valn[27..0], most significant nibble first.
+        out[currentPos + 2] = (valn >>> 24) & 0xF;
+        out[currentPos + 3] = (valn >>> 20) & 0xF;
+        out[currentPos + 4] = (valn >>> 16) & 0xF;
+        out[currentPos + 5] = (valn >>> 12) & 0xF;
+        out[currentPos + 6] = (valn >>> 8) & 0xF;
+        out[currentPos + 7] = (valn >>> 4) & 0xF;
+        out[currentPos + 8] = valn & 0xF;
+    }
+
+    private void decode65(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 OR-ed with valn >>> 27 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 27);
+        // 9 x 3 bits from valn[26..0], most significant field first.
+        out[currentPos + 2] = (valn >>> 24) & 0x7;
+        out[currentPos + 3] = (valn >>> 21) & 0x7;
+        out[currentPos + 4] = (valn >>> 18) & 0x7;
+        out[currentPos + 5] = (valn >>> 15) & 0x7;
+        out[currentPos + 6] = (valn >>> 12) & 0x7;
+        out[currentPos + 7] = (valn >>> 9) & 0x7;
+        out[currentPos + 8] = (valn >>> 6) & 0x7;
+        out[currentPos + 9] = (valn >>> 3) & 0x7;
+        out[currentPos + 10] = valn & 0x7;
+    }
+
+    private void decode64(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 14 x 2 bits from valn[27..0], most significant pair first.
+        out[currentPos + 2] = (valn >>> 26) & 0x3;
+        out[currentPos + 3] = (valn >>> 24) & 0x3;
+        out[currentPos + 4] = (valn >>> 22) & 0x3;
+        out[currentPos + 5] = (valn >>> 20) & 0x3;
+        out[currentPos + 6] = (valn >>> 18) & 0x3;
+        out[currentPos + 7] = (valn >>> 16) & 0x3;
+        out[currentPos + 8] = (valn >>> 14) & 0x3;
+        out[currentPos + 9] = (valn >>> 12) & 0x3;
+        out[currentPos + 10] = (valn >>> 10) & 0x3;
+        out[currentPos + 11] = (valn >>> 8) & 0x3;
+        out[currentPos + 12] = (valn >>> 6) & 0x3;
+        out[currentPos + 13] = (valn >>> 4) & 0x3;
+        out[currentPos + 14] = (valn >>> 2) & 0x3;
+        out[currentPos + 15] = valn & 0x3;
+    }
+
+    private void decode63(int val, int valn, int[] out, int currentPos) {
+        // 2 x 14 bits: val[23..10], then val[9..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 10) & 0x3FFF;
+        out[currentPos + 1] = ((val & 0x3FF) << 4) | (valn >>> 28);
+        // 28 x 1 bit from valn[27..0], most significant bit first.
+        out[currentPos + 2] = (valn >>> 27) & 1;
+        out[currentPos + 3] = (valn >>> 26) & 1;
+        out[currentPos + 4] = (valn >>> 25) & 1;
+        out[currentPos + 5] = (valn >>> 24) & 1;
+        out[currentPos + 6] = (valn >>> 23) & 1;
+        out[currentPos + 7] = (valn >>> 22) & 1;
+        out[currentPos + 8] = (valn >>> 21) & 1;
+        out[currentPos + 9] = (valn >>> 20) & 1;
+        out[currentPos + 10] = (valn >>> 19) & 1;
+        out[currentPos + 11] = (valn >>> 18) & 1;
+        out[currentPos + 12] = (valn >>> 17) & 1;
+        out[currentPos + 13] = (valn >>> 16) & 1;
+        out[currentPos + 14] = (valn >>> 15) & 1;
+        out[currentPos + 15] = (valn >>> 14) & 1;
+        out[currentPos + 16] = (valn >>> 13) & 1;
+        out[currentPos + 17] = (valn >>> 12) & 1;
+        out[currentPos + 18] = (valn >>> 11) & 1;
+        out[currentPos + 19] = (valn >>> 10) & 1;
+        out[currentPos + 20] = (valn >>> 9) & 1;
+        out[currentPos + 21] = (valn >>> 8) & 1;
+        out[currentPos + 22] = (valn >>> 7) & 1;
+        out[currentPos + 23] = (valn >>> 6) & 1;
+        out[currentPos + 24] = (valn >>> 5) & 1;
+        out[currentPos + 25] = (valn >>> 4) & 1;
+        out[currentPos + 26] = (valn >>> 3) & 1;
+        out[currentPos + 27] = (valn >>> 2) & 1;
+        out[currentPos + 28] = (valn >>> 1) & 1;
+        out[currentPos + 29] = valn & 1;
+    }
+
+    private void decode62(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 1 x 28 bits: valn[27..0].
+        out[currentPos + 3] = valn & 0x0FFFFFFF;
+    }
+
+    private void decode61(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 2 x 14 bits: valn[27..14], valn[13..0].
+        out[currentPos + 3] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 4] = valn & 0x3FFF;
+    }
+
+    private void decode60(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 27 (assumes valn bits 31..30 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 27);
+        // 3 x 9 bits: valn[26..18], valn[17..9], valn[8..0].
+        out[currentPos + 3] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 4] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 5] = valn & 0x1FF;
+    }
+
+    private void decode59(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 4 x 7 bits: valn[27..21], valn[20..14], valn[13..7], valn[6..0].
+        out[currentPos + 3] = (valn >>> 21) & 0x7F;
+        out[currentPos + 4] = (valn >>> 14) & 0x7F;
+        out[currentPos + 5] = (valn >>> 7) & 0x7F;
+        out[currentPos + 6] = valn & 0x7F;
+    }
+
+    private void decode58(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 25 (assumes valn bits 31..28 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 25);
+        // 5 x 5 bits from valn[24..0], most significant field first.
+        out[currentPos + 3] = (valn >>> 20) & 0x1F;
+        out[currentPos + 4] = (valn >>> 15) & 0x1F;
+        out[currentPos + 5] = (valn >>> 10) & 0x1F;
+        out[currentPos + 6] = (valn >>> 5) & 0x1F;
+        out[currentPos + 7] = valn & 0x1F;
+    }
+
+    private void decode57(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 7 x 4 bits from valn[27..0], most significant nibble first.
+        out[currentPos + 3] = (valn >>> 24) & 0xF;
+        out[currentPos + 4] = (valn >>> 20) & 0xF;
+        out[currentPos + 5] = (valn >>> 16) & 0xF;
+        out[currentPos + 6] = (valn >>> 12) & 0xF;
+        out[currentPos + 7] = (valn >>> 8) & 0xF;
+        out[currentPos + 8] = (valn >>> 4) & 0xF;
+        out[currentPos + 9] = valn & 0xF;
+    }
+
+    private void decode56(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 27 (assumes valn bits 31..30 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 27);
+        // 9 x 3 bits from valn[26..0], most significant field first.
+        out[currentPos + 3] = (valn >>> 24) & 0x7;
+        out[currentPos + 4] = (valn >>> 21) & 0x7;
+        out[currentPos + 5] = (valn >>> 18) & 0x7;
+        out[currentPos + 6] = (valn >>> 15) & 0x7;
+        out[currentPos + 7] = (valn >>> 12) & 0x7;
+        out[currentPos + 8] = (valn >>> 9) & 0x7;
+        out[currentPos + 9] = (valn >>> 6) & 0x7;
+        out[currentPos + 10] = (valn >>> 3) & 0x7;
+        out[currentPos + 11] = valn & 0x7;
+    }
+
+    private void decode55(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 14 x 2 bits from valn[27..0], most significant pair first.
+        out[currentPos + 3] = (valn >>> 26) & 0x3;
+        out[currentPos + 4] = (valn >>> 24) & 0x3;
+        out[currentPos + 5] = (valn >>> 22) & 0x3;
+        out[currentPos + 6] = (valn >>> 20) & 0x3;
+        out[currentPos + 7] = (valn >>> 18) & 0x3;
+        out[currentPos + 8] = (valn >>> 16) & 0x3;
+        out[currentPos + 9] = (valn >>> 14) & 0x3;
+        out[currentPos + 10] = (valn >>> 12) & 0x3;
+        out[currentPos + 11] = (valn >>> 10) & 0x3;
+        out[currentPos + 12] = (valn >>> 8) & 0x3;
+        out[currentPos + 13] = (valn >>> 6) & 0x3;
+        out[currentPos + 14] = (valn >>> 4) & 0x3;
+        out[currentPos + 15] = (valn >>> 2) & 0x3;
+        out[currentPos + 16] = valn & 0x3;
+    }
+
+    private void decode54(int val, int valn, int[] out, int currentPos) {
+        // 3 x 9 bits: val[23..15], val[14..6], then val[5..0] << 3 OR-ed with valn >>> 28 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 15) & 0x1FF;
+        out[currentPos + 1] = (val >>> 6) & 0x1FF;
+        out[currentPos + 2] = ((val & 0x3F) << 3) | (valn >>> 28);
+        // 28 x 1 bit from valn[27..0], most significant bit first.
+        out[currentPos + 3] = (valn >>> 27) & 1;
+        out[currentPos + 4] = (valn >>> 26) & 1;
+        out[currentPos + 5] = (valn >>> 25) & 1;
+        out[currentPos + 6] = (valn >>> 24) & 1;
+        out[currentPos + 7] = (valn >>> 23) & 1;
+        out[currentPos + 8] = (valn >>> 22) & 1;
+        out[currentPos + 9] = (valn >>> 21) & 1;
+        out[currentPos + 10] = (valn >>> 20) & 1;
+        out[currentPos + 11] = (valn >>> 19) & 1;
+        out[currentPos + 12] = (valn >>> 18) & 1;
+        out[currentPos + 13] = (valn >>> 17) & 1;
+        out[currentPos + 14] = (valn >>> 16) & 1;
+        out[currentPos + 15] = (valn >>> 15) & 1;
+        out[currentPos + 16] = (valn >>> 14) & 1;
+        out[currentPos + 17] = (valn >>> 13) & 1;
+        out[currentPos + 18] = (valn >>> 12) & 1;
+        out[currentPos + 19] = (valn >>> 11) & 1;
+        out[currentPos + 20] = (valn >>> 10) & 1;
+        out[currentPos + 21] = (valn >>> 9) & 1;
+        out[currentPos + 22] = (valn >>> 8) & 1;
+        out[currentPos + 23] = (valn >>> 7) & 1;
+        out[currentPos + 24] = (valn >>> 6) & 1;
+        out[currentPos + 25] = (valn >>> 5) & 1;
+        out[currentPos + 26] = (valn >>> 4) & 1;
+        out[currentPos + 27] = (valn >>> 3) & 1;
+        out[currentPos + 28] = (valn >>> 2) & 1;
+        out[currentPos + 29] = (valn >>> 1) & 1;
+        out[currentPos + 30] = valn & 1;
+    }
+
+    private void decode53(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 1 x 28 bits: valn[27..0].
+        out[currentPos + 4] = valn & 0x0FFFFFFF;
+    }
+
+    private void decode52(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 2 x 14 bits: valn[27..14], valn[13..0].
+        out[currentPos + 4] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 5] = valn & 0x3FFF;
+    }
+
+    private void decode51(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 OR-ed with valn >>> 27 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 27);
+        // 3 x 9 bits: valn[26..18], valn[17..9], valn[8..0].
+        out[currentPos + 4] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 5] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 6] = valn & 0x1FF;
+    }
+
+    private void decode50(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 4 x 7 bits: valn[27..21], valn[20..14], valn[13..7], valn[6..0].
+        out[currentPos + 4] = (valn >>> 21) & 0x7F;
+        out[currentPos + 5] = (valn >>> 14) & 0x7F;
+        out[currentPos + 6] = (valn >>> 7) & 0x7F;
+        out[currentPos + 7] = valn & 0x7F;
+    }
+
+    private void decode49(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 OR-ed with valn >>> 25 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 25);
+        // 5 x 5 bits from valn[24..0], most significant field first.
+        out[currentPos + 4] = (valn >>> 20) & 0x1F;
+        out[currentPos + 5] = (valn >>> 15) & 0x1F;
+        out[currentPos + 6] = (valn >>> 10) & 0x1F;
+        out[currentPos + 7] = (valn >>> 5) & 0x1F;
+        out[currentPos + 8] = valn & 0x1F;
+    }
+
+    private void decode48(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 7 x 4 bits from valn[27..0], most significant nibble first.
+        out[currentPos + 4] = (valn >>> 24) & 0xF;
+        out[currentPos + 5] = (valn >>> 20) & 0xF;
+        out[currentPos + 6] = (valn >>> 16) & 0xF;
+        out[currentPos + 7] = (valn >>> 12) & 0xF;
+        out[currentPos + 8] = (valn >>> 8) & 0xF;
+        out[currentPos + 9] = (valn >>> 4) & 0xF;
+        out[currentPos + 10] = valn & 0xF;
+    }
+
+    private void decode47(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 OR-ed with valn >>> 27 (assumes valn bit 31 is 0 — TODO confirm).
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 27);
+        // 9 x 3 bits from valn[26..0], most significant field first.
+        out[currentPos + 4] = (valn >>> 24) & 0x7;
+        out[currentPos + 5] = (valn >>> 21) & 0x7;
+        out[currentPos + 6] = (valn >>> 18) & 0x7;
+        out[currentPos + 7] = (valn >>> 15) & 0x7;
+        out[currentPos + 8] = (valn >>> 12) & 0x7;
+        out[currentPos + 9] = (valn >>> 9) & 0x7;
+        out[currentPos + 10] = (valn >>> 6) & 0x7;
+        out[currentPos + 11] = (valn >>> 3) & 0x7;
+        out[currentPos + 12] = valn & 0x7;
+    }
+
+    private void decode46(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 14 x 2 bits from valn[27..0], most significant pair first.
+        out[currentPos + 4] = (valn >>> 26) & 0x3;
+        out[currentPos + 5] = (valn >>> 24) & 0x3;
+        out[currentPos + 6] = (valn >>> 22) & 0x3;
+        out[currentPos + 7] = (valn >>> 20) & 0x3;
+        out[currentPos + 8] = (valn >>> 18) & 0x3;
+        out[currentPos + 9] = (valn >>> 16) & 0x3;
+        out[currentPos + 10] = (valn >>> 14) & 0x3;
+        out[currentPos + 11] = (valn >>> 12) & 0x3;
+        out[currentPos + 12] = (valn >>> 10) & 0x3;
+        out[currentPos + 13] = (valn >>> 8) & 0x3;
+        out[currentPos + 14] = (valn >>> 6) & 0x3;
+        out[currentPos + 15] = (valn >>> 4) & 0x3;
+        out[currentPos + 16] = (valn >>> 2) & 0x3;
+        out[currentPos + 17] = valn & 0x3;
+    }
+
+    private void decode45(int val, int valn, int[] out, int currentPos) {
+        // 4 x 7 bits: val[23..17], val[16..10], val[9..3], then val[2..0] << 4 joined with valn[31..28].
+        out[currentPos] = (val >>> 17) & 0x7F;
+        out[currentPos + 1] = (val >>> 10) & 0x7F;
+        out[currentPos + 2] = (val >>> 3) & 0x7F;
+        out[currentPos + 3] = ((val & 0x7) << 4) | (valn >>> 28);
+        // 28 x 1 bit from valn[27..0], most significant bit first.
+        out[currentPos + 4] = (valn >>> 27) & 1;
+        out[currentPos + 5] = (valn >>> 26) & 1;
+        out[currentPos + 6] = (valn >>> 25) & 1;
+        out[currentPos + 7] = (valn >>> 24) & 1;
+        out[currentPos + 8] = (valn >>> 23) & 1;
+        out[currentPos + 9] = (valn >>> 22) & 1;
+        out[currentPos + 10] = (valn >>> 21) & 1;
+        out[currentPos + 11] = (valn >>> 20) & 1;
+        out[currentPos + 12] = (valn >>> 19) & 1;
+        out[currentPos + 13] = (valn >>> 18) & 1;
+        out[currentPos + 14] = (valn >>> 17) & 1;
+        out[currentPos + 15] = (valn >>> 16) & 1;
+        out[currentPos + 16] = (valn >>> 15) & 1;
+        out[currentPos + 17] = (valn >>> 14) & 1;
+        out[currentPos + 18] = (valn >>> 13) & 1;
+        out[currentPos + 19] = (valn >>> 12) & 1;
+        out[currentPos + 20] = (valn >>> 11) & 1;
+        out[currentPos + 21] = (valn >>> 10) & 1;
+        out[currentPos + 22] = (valn >>> 9) & 1;
+        out[currentPos + 23] = (valn >>> 8) & 1;
+        out[currentPos + 24] = (valn >>> 7) & 1;
+        out[currentPos + 25] = (valn >>> 6) & 1;
+        out[currentPos + 26] = (valn >>> 5) & 1;
+        out[currentPos + 27] = (valn >>> 4) & 1;
+        out[currentPos + 28] = (valn >>> 3) & 1;
+        out[currentPos + 29] = (valn >>> 2) & 1;
+        out[currentPos + 30] = (valn >>> 1) & 1;
+        out[currentPos + 31] = valn & 1;
+    }
+
+    private void decode44(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 1 x 28 bits: valn[27..0].
+        out[currentPos + 5] = valn & 0x0FFFFFFF;
+    }
+
+    private void decode43(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 2 x 14 bits: valn[27..14], valn[13..0].
+        out[currentPos + 5] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 6] = valn & 0x3FFF;
+    }
+
+    private void decode42(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 27 (assumes valn bits 31..28 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 27);
+        // 3 x 9 bits: valn[26..18], valn[17..9], valn[8..0].
+        out[currentPos + 5] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 6] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 7] = valn & 0x1FF;
+    }
+
+    private void decode41(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 4 x 7 bits: valn[27..21], valn[20..14], valn[13..7], valn[6..0].
+        out[currentPos + 5] = (valn >>> 21) & 0x7F;
+        out[currentPos + 6] = (valn >>> 14) & 0x7F;
+        out[currentPos + 7] = (valn >>> 7) & 0x7F;
+        out[currentPos + 8] = valn & 0x7F;
+    }
+
+    private void decode40(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 25 (assumes valn bits 31..26 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 25);
+        // 5 x 5 bits from valn[24..0], most significant field first.
+        out[currentPos + 5] = (valn >>> 20) & 0x1F;
+        out[currentPos + 6] = (valn >>> 15) & 0x1F;
+        out[currentPos + 7] = (valn >>> 10) & 0x1F;
+        out[currentPos + 8] = (valn >>> 5) & 0x1F;
+        out[currentPos + 9] = valn & 0x1F;
+    }
+
+    private void decode39(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 7 x 4 bits from valn[27..0], most significant nibble first.
+        out[currentPos + 5] = (valn >>> 24) & 0xF;
+        out[currentPos + 6] = (valn >>> 20) & 0xF;
+        out[currentPos + 7] = (valn >>> 16) & 0xF;
+        out[currentPos + 8] = (valn >>> 12) & 0xF;
+        out[currentPos + 9] = (valn >>> 8) & 0xF;
+        out[currentPos + 10] = (valn >>> 4) & 0xF;
+        out[currentPos + 11] = valn & 0xF;
+    }
+
+    private void decode38(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 27 (assumes valn bits 31..28 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 27);
+        // 9 x 3 bits from valn[26..0], most significant field first.
+        out[currentPos + 5] = (valn >>> 24) & 0x7;
+        out[currentPos + 6] = (valn >>> 21) & 0x7;
+        out[currentPos + 7] = (valn >>> 18) & 0x7;
+        out[currentPos + 8] = (valn >>> 15) & 0x7;
+        out[currentPos + 9] = (valn >>> 12) & 0x7;
+        out[currentPos + 10] = (valn >>> 9) & 0x7;
+        out[currentPos + 11] = (valn >>> 6) & 0x7;
+        out[currentPos + 12] = (valn >>> 3) & 0x7;
+        out[currentPos + 13] = valn & 0x7;
+    }
+
+    private void decode37(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 14 x 2 bits from valn[27..0], most significant pair first.
+        out[currentPos + 5] = (valn >>> 26) & 0x3;
+        out[currentPos + 6] = (valn >>> 24) & 0x3;
+        out[currentPos + 7] = (valn >>> 22) & 0x3;
+        out[currentPos + 8] = (valn >>> 20) & 0x3;
+        out[currentPos + 9] = (valn >>> 18) & 0x3;
+        out[currentPos + 10] = (valn >>> 16) & 0x3;
+        out[currentPos + 11] = (valn >>> 14) & 0x3;
+        out[currentPos + 12] = (valn >>> 12) & 0x3;
+        out[currentPos + 13] = (valn >>> 10) & 0x3;
+        out[currentPos + 14] = (valn >>> 8) & 0x3;
+        out[currentPos + 15] = (valn >>> 6) & 0x3;
+        out[currentPos + 16] = (valn >>> 4) & 0x3;
+        out[currentPos + 17] = (valn >>> 2) & 0x3;
+        out[currentPos + 18] = valn & 0x3;
+    }
+
+    private void decode36(int val, int valn, int[] out, int currentPos) {
+        // 5 x 5 bits: val[23..19], val[18..14], val[13..9], val[8..4], then val[3..0] << 1 OR-ed with valn >>> 28 (assumes valn bits 31..29 are 0 — TODO confirm).
+        out[currentPos] = (val >>> 19) & 0x1F;
+        out[currentPos + 1] = (val >>> 14) & 0x1F;
+        out[currentPos + 2] = (val >>> 9) & 0x1F;
+        out[currentPos + 3] = (val >>> 4) & 0x1F;
+        out[currentPos + 4] = ((val & 0xF) << 1) | (valn >>> 28);
+        // 28 x 1 bit from valn[27..0], most significant bit first.
+        out[currentPos + 5] = (valn >>> 27) & 1;
+        out[currentPos + 6] = (valn >>> 26) & 1;
+        out[currentPos + 7] = (valn >>> 25) & 1;
+        out[currentPos + 8] = (valn >>> 24) & 1;
+        out[currentPos + 9] = (valn >>> 23) & 1;
+        out[currentPos + 10] = (valn >>> 22) & 1;
+        out[currentPos + 11] = (valn >>> 21) & 1;
+        out[currentPos + 12] = (valn >>> 20) & 1;
+        out[currentPos + 13] = (valn >>> 19) & 1;
+        out[currentPos + 14] = (valn >>> 18) & 1;
+        out[currentPos + 15] = (valn >>> 17) & 1;
+        out[currentPos + 16] = (valn >>> 16) & 1;
+        out[currentPos + 17] = (valn >>> 15) & 1;
+        out[currentPos + 18] = (valn >>> 14) & 1;
+        out[currentPos + 19] = (valn >>> 13) & 1;
+        out[currentPos + 20] = (valn >>> 12) & 1;
+        out[currentPos + 21] = (valn >>> 11) & 1;
+        out[currentPos + 22] = (valn >>> 10) & 1;
+        out[currentPos + 23] = (valn >>> 9) & 1;
+        out[currentPos + 24] = (valn >>> 8) & 1;
+        out[currentPos + 25] = (valn >>> 7) & 1;
+        out[currentPos + 26] = (valn >>> 6) & 1;
+        out[currentPos + 27] = (valn >>> 5) & 1;
+        out[currentPos + 28] = (valn >>> 4) & 1;
+        out[currentPos + 29] = (valn >>> 3) & 1;
+        out[currentPos + 30] = (valn >>> 2) & 1;
+        out[currentPos + 31] = (valn >>> 1) & 1;
+        out[currentPos + 32] = valn & 1;
+    }
+
+    private void decode35(int val, int valn, int[] out, int currentPos) {
+        // 7 x 4 bits: the six nibbles of val[23..0], then valn[31..28].
+        out[currentPos] = (val >>> 20) & 0xF;
+        out[currentPos + 1] = (val >>> 16) & 0xF;
+        out[currentPos + 2] = (val >>> 12) & 0xF;
+        out[currentPos + 3] = (val >>> 8) & 0xF;
+        out[currentPos + 4] = (val >>> 4) & 0xF;
+        out[currentPos + 5] = val & 0xF;
+        out[currentPos + 6] = valn >>> 28;
+        // 1 x 28 bits: valn[27..0].
+        out[currentPos + 7] = valn & 0x0FFFFFFF;
+    }
+
+    private void decode34(int val, int valn, int[] out, int currentPos) {
+        // 7 x 4 bits: the six nibbles of val[23..0], then valn[31..28].
+        out[currentPos] = (val >>> 20) & 0xF;
+        out[currentPos + 1] = (val >>> 16) & 0xF;
+        out[currentPos + 2] = (val >>> 12) & 0xF;
+        out[currentPos + 3] = (val >>> 8) & 0xF;
+        out[currentPos + 4] = (val >>> 4) & 0xF;
+        out[currentPos + 5] = val & 0xF;
+        out[currentPos + 6] = valn >>> 28;
+        // 2 x 14 bits: valn[27..14], valn[13..0].
+        out[currentPos + 7] = (valn >>> 14) & 0x3FFF;
+        out[currentPos + 8] = valn & 0x3FFF;
+    }
+
+    private void decode33(int val, int valn, int[] out, int currentPos) {
+        // 7 x 4 bits: the six nibbles of val[23..0], then valn[30..27].
+        out[currentPos] = (val >>> 20) & 0xF;
+        out[currentPos + 1] = (val >>> 16) & 0xF;
+        out[currentPos + 2] = (val >>> 12) & 0xF;
+        out[currentPos + 3] = (val >>> 8) & 0xF;
+        out[currentPos + 4] = (val >>> 4) & 0xF;
+        out[currentPos + 5] = val & 0xF;
+        out[currentPos + 6] = (valn >>> 27) & 0xF;
+        // 3 x 9 bits: valn[26..18], valn[17..9], valn[8..0].
+        out[currentPos + 7] = (valn >>> 18) & 0x1FF;
+        out[currentPos + 8] = (valn >>> 9) & 0x1FF;
+        out[currentPos + 9] = valn & 0x1FF;
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then four 7-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read), the seventh from bits 31..28 of {@code valn}, and the 7-bit
+     * fields from bits 27..0 of {@code valn}. Values are stored in
+     * {@code out} starting at {@code currentPos}.
+     */
+    private void decode32(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = valn >>> 28;
+        // four 7-bit fields
+        for (int shift = 21; shift >= 0; shift -= 7) {
+            out[pos++] = (valn >>> shift) & 0x7F;
+        }
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then five 5-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read), the seventh from bits 28..25 of {@code valn}, and the 5-bit
+     * fields from bits 24..0 of {@code valn}; bits 31..29 of {@code valn}
+     * are not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode31(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = (valn >>> 25) & 0xF;
+        // five 5-bit fields
+        for (int shift = 20; shift >= 0; shift -= 5) {
+            out[pos++] = (valn >>> shift) & 0x1F;
+        }
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then another seven 4-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read); the remaining eight are the nibbles of {@code valn}, most
+     * significant first. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode30(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = valn >>> 28;
+        // seven 4-bit fields
+        for (int shift = 24; shift >= 0; shift -= 4) {
+            out[pos++] = (valn >>> shift) & 0xF;
+        }
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then nine 3-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read), the seventh from bits 30..27 of {@code valn}, and the 3-bit
+     * fields from bits 26..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode29(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = (valn >>> 27) & 0xF;
+        // nine 3-bit fields
+        for (int shift = 24; shift >= 0; shift -= 3) {
+            out[pos++] = (valn >>> shift) & 0x7;
+        }
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then fourteen 2-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read), the seventh from bits 31..28 of {@code valn}, and the 2-bit
+     * fields from bits 27..0 of {@code valn}. Values are stored in
+     * {@code out} starting at {@code currentPos}.
+     */
+    private void decode28(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = valn >>> 28;
+        // fourteen 2-bit fields
+        for (int shift = 26; shift >= 0; shift -= 2) {
+            out[pos++] = (valn >>> shift) & 0x3;
+        }
+    }
+
+    /**
+     * Unpacks seven 4-bit values and then twenty-eight 1-bit values.
+     * The first six come from bits 23..0 of {@code val} (bits 31..24 are not
+     * read), the seventh from bits 31..28 of {@code valn}, and the single
+     * bits from bits 27..0 of {@code valn}, most significant first. Values
+     * are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode27(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // six 4-bit fields of val, most significant first
+        for (int shift = 20; shift >= 0; shift -= 4) {
+            out[pos++] = (val >>> shift) & 0xF;
+        }
+        out[pos++] = valn >>> 28;
+        // twenty-eight 1-bit fields
+        for (int bit = 27; bit >= 0; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then one 28-bit value.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the last
+     * from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not read.
+     * Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode26(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // one 28-bit field
+        out[pos] = valn & 0x0FFFFFFF;
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then two 14-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the 14-bit
+     * pair from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode25(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // two 14-bit fields
+        out[pos++] = (valn >>> 14) & 0x3FFF;
+        out[pos] = valn & 0x3FFF;
+    }
+    
+    /**
+     * Unpacks nine 3-bit values and then three 9-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 29..27 of {@code valn}, and the 9-bit
+     * triple from bits 26..0 of {@code valn}; bits 31..30 of {@code valn}
+     * are not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode24(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 27) & 0x7;
+        // three 9-bit fields
+        out[pos++] = (valn >>> 18) & 0x1FF;
+        out[pos++] = (valn >>> 9) & 0x1FF;
+        out[pos] = valn & 0x1FF;
+    }
+    
+    /**
+     * Unpacks nine 3-bit values and then four 7-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the 7-bit
+     * fields from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode23(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // four 7-bit fields
+        for (int shift = 21; shift >= 0; shift -= 7) {
+            out[pos++] = (valn >>> shift) & 0x7F;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then five 5-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 27..25 of {@code valn}, and the 5-bit
+     * fields from bits 24..0 of {@code valn}; bits 31..28 of {@code valn}
+     * are not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode22(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 25) & 0x7;
+        // five 5-bit fields
+        for (int shift = 20; shift >= 0; shift -= 5) {
+            out[pos++] = (valn >>> shift) & 0x1F;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then seven 4-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the 4-bit
+     * fields from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode21(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // seven 4-bit fields
+        for (int shift = 24; shift >= 0; shift -= 4) {
+            out[pos++] = (valn >>> shift) & 0xF;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then another nine 3-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 29..27 of {@code valn}, and the second
+     * nine from bits 26..0 of {@code valn}; bits 31..30 of {@code valn} are
+     * not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode20(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        // ninth field of the first group plus nine more: bits 29..0 of valn
+        for (int shift = 27; shift >= 0; shift -= 3) {
+            out[pos++] = (valn >>> shift) & 0x7;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then fourteen 2-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the 2-bit
+     * fields from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode19(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // fourteen 2-bit fields
+        for (int shift = 26; shift >= 0; shift -= 2) {
+            out[pos++] = (valn >>> shift) & 0x3;
+        }
+    }
+
+    /**
+     * Unpacks nine 3-bit values and then twenty-eight 1-bit values.
+     * The first eight come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the ninth from bits 30..28 of {@code valn}, and the single
+     * bits from bits 27..0 of {@code valn}; bit 31 of {@code valn} is not
+     * read. Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode18(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // eight 3-bit fields of val, most significant first
+        for (int shift = 21; shift >= 0; shift -= 3) {
+            out[pos++] = (val >>> shift) & 0x7;
+        }
+        out[pos++] = (valn >>> 28) & 0x7;
+        // twenty-eight 1-bit fields
+        for (int bit = 27; bit >= 0; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then one 28-bit value.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 31..28 of {@code valn}, and the last
+     * from bits 27..0 of {@code valn}. Values are stored in {@code out}
+     * starting at {@code currentPos}.
+     */
+    private void decode17(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = valn >>> 30;
+        out[pos++] = (valn >>> 28) & 0x3;
+        // one 28-bit field
+        out[pos] = valn & 0x0FFFFFFF;
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then two 14-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 31..28 of {@code valn}, and the
+     * 14-bit pair from bits 27..0 of {@code valn}. Values are stored in
+     * {@code out} starting at {@code currentPos}.
+     */
+    private void decode16(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = valn >>> 30;
+        out[pos++] = (valn >>> 28) & 0x3;
+        // two 14-bit fields
+        out[pos++] = (valn >>> 14) & 0x3FFF;
+        out[pos] = valn & 0x3FFF;
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then three 9-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 30..27 of {@code valn}, and the
+     * 9-bit triple from bits 26..0 of {@code valn}; bit 31 of {@code valn}
+     * is not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode15(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = (valn >>> 29) & 0x3;
+        out[pos++] = (valn >>> 27) & 0x3;
+        // three 9-bit fields
+        out[pos++] = (valn >>> 18) & 0x1FF;
+        out[pos++] = (valn >>> 9) & 0x1FF;
+        out[pos] = valn & 0x1FF;
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then four 7-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 31..28 of {@code valn}, and the
+     * 7-bit fields from bits 27..0 of {@code valn}. Values are stored in
+     * {@code out} starting at {@code currentPos}.
+     */
+    private void decode14(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = valn >>> 30;
+        out[pos++] = (valn >>> 28) & 0x3;
+        // four 7-bit fields
+        for (int shift = 21; shift >= 0; shift -= 7) {
+            out[pos++] = (valn >>> shift) & 0x7F;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then five 5-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 28..25 of {@code valn}, and the
+     * 5-bit fields from bits 24..0 of {@code valn}; bits 31..29 of
+     * {@code valn} are not read. Values are stored in {@code out} starting
+     * at {@code currentPos}.
+     */
+    private void decode13(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = (valn >>> 27) & 0x3;
+        out[pos++] = (valn >>> 25) & 0x3;
+        // five 5-bit fields
+        for (int shift = 20; shift >= 0; shift -= 5) {
+            out[pos++] = (valn >>> shift) & 0x1F;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then seven 4-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 31..28 of {@code valn}, and the
+     * 4-bit fields from bits 27..0 of {@code valn}. Values are stored in
+     * {@code out} starting at {@code currentPos}.
+     */
+    private void decode12(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = valn >>> 30;
+        out[pos++] = (valn >>> 28) & 0x3;
+        // seven 4-bit fields
+        for (int shift = 24; shift >= 0; shift -= 4) {
+            out[pos++] = (valn >>> shift) & 0xF;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then nine 3-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 30..27 of {@code valn}, and the
+     * 3-bit fields from bits 26..0 of {@code valn}; bit 31 of {@code valn}
+     * is not read. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode11(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = (valn >>> 29) & 0x3;
+        out[pos++] = (valn >>> 27) & 0x3;
+        // nine 3-bit fields
+        for (int shift = 24; shift >= 0; shift -= 3) {
+            out[pos++] = (valn >>> shift) & 0x7;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then another fourteen 2-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read); the remaining sixteen are the 2-bit fields of {@code valn},
+     * most significant first. Values are stored in {@code out} starting at
+     * {@code currentPos}.
+     */
+    private void decode10(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        // last two fields of the first group, then fourteen more
+        for (int shift = 30; shift >= 0; shift -= 2) {
+            out[pos++] = (valn >>> shift) & 0x3;
+        }
+    }
+
+    /**
+     * Unpacks fourteen 2-bit values and then twenty-eight 1-bit values.
+     * The first twelve come from bits 23..0 of {@code val} (bits 31..24 are
+     * not read), the next two from bits 31..28 of {@code valn}, and the
+     * single bits from bits 27..0 of {@code valn}, most significant first.
+     * Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode9(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twelve 2-bit fields of val, most significant first
+        for (int shift = 22; shift >= 0; shift -= 2) {
+            out[pos++] = (val >>> shift) & 0x3;
+        }
+        out[pos++] = valn >>> 30;
+        out[pos++] = (valn >>> 28) & 0x3;
+        // twenty-eight 1-bit fields
+        for (int bit = 27; bit >= 0; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+    }
+
+    /**
+     * Unpacks twenty-eight 1-bit values and then one 28-bit value.
+     * The first twenty-four bits come from bits 23..0 of {@code val}
+     * (bits 31..24 are not read), the next four from bits 31..28 of
+     * {@code valn}, and the last value from bits 27..0 of {@code valn}.
+     * Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode8(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twenty-four single bits of val, most significant first
+        for (int bit = 23; bit >= 0; bit--) {
+            out[pos++] = (val >>> bit) & 0x1;
+        }
+        // four single bits from the top nibble of valn
+        for (int bit = 31; bit >= 28; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+        // one 28-bit field
+        out[pos] = valn & 0x0FFFFFFF;
+    }
+
+    /**
+     * Unpacks twenty-eight 1-bit values and then two 14-bit values.
+     * The first twenty-four bits come from bits 23..0 of {@code val}
+     * (bits 31..24 are not read), the next four from bits 31..28 of
+     * {@code valn}, and the 14-bit pair from bits 27..0 of {@code valn}.
+     * Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode7(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twenty-four single bits of val, most significant first
+        for (int bit = 23; bit >= 0; bit--) {
+            out[pos++] = (val >>> bit) & 0x1;
+        }
+        // four single bits from the top nibble of valn
+        for (int bit = 31; bit >= 28; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+        // two 14-bit fields
+        out[pos++] = (valn >>> 14) & 0x3FFF;
+        out[pos] = valn & 0x3FFF;
+    }
+
+    /**
+     * Unpacks twenty-eight 1-bit values and then three 9-bit values.
+     * The first twenty-four bits come from bits 23..0 of {@code val}
+     * (bits 31..24 are not read), the next four from bits 30..27 of
+     * {@code valn}, and the 9-bit triple from bits 26..0 of {@code valn};
+     * bit 31 of {@code valn} is not read. Values are stored in {@code out}
+     * starting at {@code currentPos}.
+     */
+    private void decode6(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twenty-four single bits of val, most significant first
+        for (int bit = 23; bit >= 0; bit--) {
+            out[pos++] = (val >>> bit) & 0x1;
+        }
+        // four single bits taken from bits 30..27 of valn
+        for (int bit = 30; bit >= 27; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+        // three 9-bit fields
+        out[pos++] = (valn >>> 18) & 0x1FF;
+        out[pos++] = (valn >>> 9) & 0x1FF;
+        out[pos] = valn & 0x1FF;
+    }
+
+    /**
+     * Unpacks twenty-eight 1-bit values and then four 7-bit values.
+     * The first twenty-four bits come from bits 23..0 of {@code val}
+     * (bits 31..24 are not read), the next four from bits 31..28 of
+     * {@code valn}, and the 7-bit fields from bits 27..0 of {@code valn}.
+     * Values are stored in {@code out} starting at {@code currentPos}.
+     */
+    private void decode5(int val, int valn, int[] out, int currentPos) {
+        int pos = currentPos;
+        // twenty-four single bits of val, most significant first
+        for (int bit = 23; bit >= 0; bit--) {
+            out[pos++] = (val >>> bit) & 0x1;
+        }
+        // four single bits from the top nibble of valn
+        for (int bit = 31; bit >= 28; bit--) {
+            out[pos++] = (valn >>> bit) & 0x1;
+        }
+        // four 7-bit fields
+        for (int shift = 21; shift >= 0; shift -= 7) {
+            out[pos++] = (valn >>> shift) & 0x7F;
+        }
+    }
+
+    /**
+     * Unpacks the 33 values stored under header code 4: twenty-eight 1-bit
+     * values followed by five 5-bit values.
+     *
+     * @param val        first packed word; its top 8 bits (the header read by
+     *                   the caller) are skipped, the low 24 bits hold the first
+     *                   24 one-bit values
+     * @param valn       second packed word; its top 3 bits are not read, the
+     *                   next 4 bits hold the remaining one-bit values and the
+     *                   low 25 bits the 5-bit values
+     * @param out        destination array
+     * @param currentPos index in {@code out} of the first value written
+     */
+    private void decode4(int val, int valn, int[] out, int currentPos) {
+        int dst = currentPos;
+        // 24 one-bit values: bits 23..0 of val
+        for (int shift = 8; shift < 32; shift++) {
+            out[dst++] = (val << shift) >>> 31;
+        }
+        // 4 one-bit values: bits 28..25 of valn (the top 3 bits are unused)
+        for (int shift = 3; shift < 7; shift++) {
+            out[dst++] = (valn << shift) >>> 31;
+        }
+        // 5 five-bit values: bits 24..0 of valn
+        for (int shift = 7; shift < 32; shift += 5) {
+            out[dst++] = (valn << shift) >>> 27;
+        }
+    }
+
+    /**
+     * Unpacks the 35 values stored under header code 3: twenty-eight 1-bit
+     * values followed by seven 4-bit values.
+     *
+     * @param val        first packed word; its top 8 bits (the header read by
+     *                   the caller) are skipped, the low 24 bits hold the first
+     *                   24 one-bit values
+     * @param valn       second packed word; its top 4 bits hold the remaining
+     *                   four one-bit values, its low 28 bits the 4-bit values
+     * @param out        destination array
+     * @param currentPos index in {@code out} of the first value written
+     */
+    private void decode3(int val, int valn, int[] out, int currentPos) {
+        int dst = currentPos;
+        // 24 one-bit values: bits 23..0 of val
+        for (int shift = 8; shift < 32; shift++) {
+            out[dst++] = (val << shift) >>> 31;
+        }
+        // 4 one-bit values: bits 31..28 of valn
+        for (int shift = 0; shift < 4; shift++) {
+            out[dst++] = (valn << shift) >>> 31;
+        }
+        // 7 four-bit values: bits 27..0 of valn
+        for (int shift = 4; shift < 32; shift += 4) {
+            out[dst++] = (valn << shift) >>> 28;
+        }
+    }
+
+    /**
+     * Unpacks the 37 values stored under header code 2: twenty-eight 1-bit
+     * values followed by nine 3-bit values.
+     *
+     * @param val        first packed word; its top 8 bits (the header read by
+     *                   the caller) are skipped, the low 24 bits hold the first
+     *                   24 one-bit values
+     * @param valn       second packed word; its top bit is not read, the next
+     *                   4 bits hold the remaining one-bit values and the low
+     *                   27 bits the 3-bit values
+     * @param out        destination array
+     * @param currentPos index in {@code out} of the first value written
+     */
+    private void decode2(int val, int valn, int[] out, int currentPos) {
+        int dst = currentPos;
+        // 24 one-bit values: bits 23..0 of val
+        for (int shift = 8; shift < 32; shift++) {
+            out[dst++] = (val << shift) >>> 31;
+        }
+        // 4 one-bit values: bits 30..27 of valn (the top bit is unused)
+        for (int shift = 1; shift < 5; shift++) {
+            out[dst++] = (valn << shift) >>> 31;
+        }
+        // 9 three-bit values: bits 26..0 of valn
+        for (int shift = 5; shift < 32; shift += 3) {
+            out[dst++] = (valn << shift) >>> 29;
+        }
+    }
+
+    /**
+     * Unpacks the 42 values stored under header code 1: twenty-eight 1-bit
+     * values followed by fourteen 2-bit values.
+     *
+     * @param val        first packed word; its top 8 bits (the header read by
+     *                   the caller) are skipped, the low 24 bits hold the first
+     *                   24 one-bit values
+     * @param valn       second packed word; its top 4 bits hold the remaining
+     *                   four one-bit values, its low 28 bits the 2-bit values
+     * @param out        destination array
+     * @param currentPos index in {@code out} of the first value written
+     */
+    private void decode1(int val, int valn, int[] out, int currentPos) {
+        int dst = currentPos;
+        // 24 one-bit values: bits 23..0 of val
+        for (int shift = 8; shift < 32; shift++) {
+            out[dst++] = (val << shift) >>> 31;
+        }
+        // 4 one-bit values: bits 31..28 of valn
+        for (int shift = 0; shift < 4; shift++) {
+            out[dst++] = (valn << shift) >>> 31;
+        }
+        // 14 two-bit values: bits 27..0 of valn
+        for (int shift = 4; shift < 32; shift += 2) {
+            out[dst++] = (valn << shift) >>> 30;
+        }
+    }
+
+    /**
+     * Unpacks the 56 values stored under header code 0: two runs of
+     * twenty-eight 1-bit values.
+     *
+     * @param val        first packed word; its top 8 bits (the header read by
+     *                   the caller) are skipped, the low 24 bits hold the first
+     *                   24 one-bit values
+     * @param valn       second packed word; all 32 bits are one-bit values
+     *                   (the last 4 of the first run, then the 28 of the second)
+     * @param out        destination array
+     * @param currentPos index in {@code out} of the first value written
+     */
+    private void decode0(int val, int valn, int[] out, int currentPos) {
+        int dst = currentPos;
+        // 24 one-bit values: bits 23..0 of val
+        for (int shift = 8; shift < 32; shift++) {
+            out[dst++] = (val << shift) >>> 31;
+        }
+        // 32 one-bit values: bits 31..0 of valn, most significant first
+        for (int shift = 0; shift < 32; shift++) {
+            out[dst++] = (valn << shift) >>> 31;
+        }
+    }
+
+
+    private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+
+    private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+
+    /**
+     * @return the simple (unqualified) name of this object's runtime class
+     */
+    @Override
+    public String toString() {
+        final Class<?> runtimeType = getClass();
+        return runtimeType.getSimpleName();
+    }
+
+    /**
+     * Compresses {@code inlength} integers from {@code in}, starting at
+     * {@code inpos}, into {@code out} starting at {@code outpos}.
+     *
+     * While more than 56 input values remain, two consecutive groups are
+     * packed into a pair of output words: a selector is chosen for each group,
+     * the pair is mapped to a code through {@code M}, and the matching
+     * {@code encodeN} method writes both words. The remaining values are then
+     * packed one group per output word, with the selector stored in the top
+     * 4 bits of that word.
+     *
+     * @param in       source values
+     * @param inpos    index of the first value to read; set to the index after
+     *                 the last value consumed
+     * @param inlength number of values to compress
+     * @param out      destination array
+     * @param outpos   index of the first word to write; set to the index after
+     *                 the last word written
+     * @throws RuntimeException if a value is rejected by every selector,
+     *                          including the 28-bit one, and therefore cannot
+     *                          be represented in this format
+     */
+    @Override
+    public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        int tmpoutpos = outpos.get();
+        int currentPos = inpos.get();
+        final int finalin = currentPos + inlength;
+        // Grouped phase: the loop bound keeps more than 2 * 28 values
+        // available, so both selector searches may read a full group.
+        while (currentPos < finalin - 28 * 2) {
+            final int selector1 = findSelector(in, currentPos);
+            if (selector1 < 0)
+                throw new RuntimeException(
+                        "Too big a number: value in group at index " + currentPos + " does not fit in 28 bits");
+            final int secondStart = currentPos + codeNum[selector1];
+            final int selector2 = findSelector(in, secondStart);
+            if (selector2 < 0)
+                throw new RuntimeException(
+                        "Too big a number: value in group at index " + secondStart + " does not fit in 28 bits");
+            final int nextCurrentPos = secondStart + codeNum[selector2];
+            int code = M[selector1][selector2];
+            // Clear both words of the pair before the encoder fills them in.
+            out[tmpoutpos] = 0;
+            out[tmpoutpos + 1] = 0;
+            switch (code) {
+            case 0:
+                encode0(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 1:
+                encode1(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 2:
+                encode2(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 3:
+                encode3(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 4:
+                encode4(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 5:
+                encode5(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 6:
+                encode6(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 7:
+                encode7(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 8:
+                encode8(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 9:
+                encode9(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 10:
+                encode10(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 11:
+                encode11(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 12:
+                encode12(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 13:
+                encode13(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 14:
+                encode14(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 15:
+                encode15(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 16:
+                encode16(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 17:
+                encode17(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 18:
+                encode18(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 19:
+                encode19(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 20:
+                encode20(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 21:
+                encode21(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 22:
+                encode22(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 23:
+                encode23(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 24:
+                encode24(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 25:
+                encode25(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 26:
+                encode26(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 27:
+                encode27(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 28:
+                encode28(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 29:
+                encode29(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 30:
+                encode30(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 31:
+                encode31(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 32:
+                encode32(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 33:
+                encode33(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 34:
+                encode34(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 35:
+                encode35(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 36:
+                encode36(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 37:
+                encode37(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 38:
+                encode38(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 39:
+                encode39(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 40:
+                encode40(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 41:
+                encode41(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 42:
+                encode42(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 43:
+                encode43(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 44:
+                encode44(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 45:
+                encode45(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 46:
+                encode46(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 47:
+                encode47(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 48:
+                encode48(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 49:
+                encode49(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 50:
+                encode50(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 51:
+                encode51(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 52:
+                encode52(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 53:
+                encode53(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 54:
+                encode54(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 55:
+                encode55(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 56:
+                encode56(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 57:
+                encode57(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 58:
+                encode58(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 59:
+                encode59(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 60:
+                encode60(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 61:
+                encode61(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 62:
+                encode62(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 63:
+                encode63(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 64:
+                encode64(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 65:
+                encode65(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 66:
+                encode66(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 67:
+                encode67(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 68:
+                encode68(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 69:
+                encode69(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 70:
+                encode70(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 71:
+                encode71(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 72:
+                encode72(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 73:
+                encode73(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 74:
+                encode74(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 75:
+                encode75(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 76:
+                encode76(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 77:
+                encode77(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 78:
+                encode78(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 79:
+                encode79(in, currentPos, code, out, tmpoutpos);
+                break;
+            case 80:
+                encode80(in, currentPos, code, out, tmpoutpos);
+                break;
+            default:
+                throw new RuntimeException("unsupported code");
+            }// end switch
+            tmpoutpos += 2;
+            currentPos = nextCurrentPos;
+        }
+
+        // Tail phase: one output word per group, selector in the top 4 bits.
+        outer: while (currentPos < finalin) {
+            mainloop: for (int selector = 0; selector < 8; selector++) {
+                int res = 0;
+                int compressedNum = codeNum[selector];
+                // Fewer values left than this selector packs: take what remains.
+                if (finalin <= currentPos + compressedNum - 1)
+                    compressedNum = finalin - currentPos;
+                int b = bitLength[selector];
+                int max = 1 << b;
+                int i = 0;
+                for (; i < compressedNum; i++) {
+                    if (Util.smallerorequalthan(max, in[currentPos + i]))
+                        continue mainloop;
+                    res = (res << b) + in[currentPos + i];
+                }
+                // Left-align a short final group so its values sit in the
+                // same bit positions as in a full group.
+                if (compressedNum != codeNum[selector]) {
+                    res <<= (codeNum[selector] - compressedNum) * b;
+                }
+                res |= selector << 28;
+                out[tmpoutpos++] = res;
+
+                currentPos += compressedNum;
+                continue outer;
+            }
+            final int selector = 8;
+            // Without this check an oversized value would overwrite the
+            // selector bits and silently produce undecodable output.
+            if (Util.smallerorequalthan(1 << bitLength[selector], in[currentPos]))
+                throw new RuntimeException(
+                        "Too big a number: value at index " + currentPos + " does not fit in 28 bits");
+            out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+        }
+        inpos.set(currentPos);
+        outpos.set(tmpoutpos);
+    }
+
+    /**
+     * Finds the first selector (0..8) whose full group of
+     * {@code codeNum[selector]} values starting at {@code pos} all pass the
+     * fit test for {@code bitLength[selector]} bits.
+     *
+     * @param in  source values
+     * @param pos index of the first value of the group
+     * @return the selector, or -1 if every selector rejects a value
+     */
+    private static int findSelector(int[] in, int pos) {
+        search: for (int selector = 0; selector <= 8; selector++) {
+            final int count = codeNum[selector];
+            final int max = 1 << bitLength[selector];
+            for (int i = 0; i < count; i++) {
+                if (Util.smallerorequalthan(max, in[pos + i]))
+                    continue search;
+            }
+            return selector;
+        }
+        return -1;
+    }
+
+    @Override
+    public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
+        int currentPos = outpos.get();
+        int tmpinpos = inpos.get();
+        final int finalout = currentPos + num;
+        while (currentPos < finalout - 2 * 28) {
+
+            int val = in[tmpinpos++];
+            int valn = in[tmpinpos++];
+            int header = val >>> 24;
+            switch (header) {
+            case 0: {
+                decode0(val, valn, out, currentPos);
+                currentPos+=56;
+                break;
+            }
+            case 1: { 
+                decode1(val, valn, out, currentPos);
+                currentPos+=42;
+                break;
+            }
+            case 2: {
+                decode2(val, valn, out, currentPos);
+                currentPos+=37;
+                break;
+            }
+            case 3: {
+                decode3(val, valn, out, currentPos);
+                currentPos+=35;        
+                break;
+            }
+            case 4: {
+                decode4(val, valn, out, currentPos);
+                currentPos+=33;
+                break;
+            }
+            case 5: {
+                decode5(val, valn, out, currentPos);
+                currentPos+=32;
+                break;
+            }
+            case 6: {
+                decode6(val, valn, out, currentPos);
+                currentPos+=31;
+                break;
+            }
+            case 7: {
+                decode7(val, valn, out, currentPos);
+                currentPos+=30;
+                break;
+            }
+            case 8: {
+                decode8(val, valn, out, currentPos);
+                currentPos+=29;
+                break;
+            }
+            case 9: {
+                decode9(val, valn, out, currentPos);
+                currentPos+=42;    
+                break;
+            }
+            case 10: {
+                decode10(val, valn, out, currentPos);
+                currentPos+=28;    
+                break;
+            }
+            case 11: { 
+                decode11(val, valn, out, currentPos);
+                currentPos+=23;
+                break;
+            }
+            case 12: {
+                decode12(val, valn, out, currentPos);
+                currentPos+=21;
+                break;
+            }
+            case 13: {
+                decode13(val, valn, out, currentPos);
+                currentPos+=19;
+                break;
+            }
+            case 14: {
+                decode14(val, valn, out, currentPos);
+                currentPos+=18;
+                break;
+            }
+            case 15: {
+                decode15(val, valn, out, currentPos);
+                currentPos+=17;
+                break;
+            }
+            case 16: {
+                decode16(val, valn, out, currentPos);
+                currentPos+=16;
+                break;
+            }
+            case 17: {
+                decode17(val, valn, out, currentPos);
+                currentPos+=15;
+                break;
+            }
+            case 18: {
+                decode18(val, valn, out, currentPos);
+                currentPos+=37;
+                break;
+            }
+            case 19: {
+                decode19(val, valn, out, currentPos);
+                currentPos+=23;
+                break;
+            }
+            case 20: {
+                decode20(val, valn, out, currentPos);
+                currentPos+=18;
+                break;
+            }
+            case 21: {
+                decode21(val, valn, out, currentPos);
+                currentPos+=16;
+                break;
+            }
+            case 22: { 
+                decode22(val, valn, out, currentPos);
+                currentPos+=14;
+                break;
+            }
+            case 23: {
+                decode23(val, valn, out, currentPos);
+                currentPos+=13;
+                break;
+            }
+            case 24: {
+                decode24(val, valn, out, currentPos);
+                currentPos+=12;
+                break;
+            }
+            case 25: {
+                decode25(val, valn, out, currentPos);
+                currentPos+=11;
+                break;
+            }
+            case 26: {
+                decode26(val, valn, out, currentPos);
+                currentPos+=10;
+                break;
+            }
+            case 27: {
+                decode27(val, valn, out, currentPos);
+                currentPos+=35;
+                break;
+            }
+            case 28: {
+                decode28(val, valn, out, currentPos);
+                currentPos+=21;
+                break;
+            }
+            case 29: { 
+                decode29(val, valn, out, currentPos);
+                currentPos+=16;
+                break;
+            }
+
+            case 30: {
+                decode30(val, valn, out, currentPos);
+                currentPos+=14;
+                break;
+            }
+            case 31: { 
+                decode31(val, valn, out, currentPos);
+                currentPos+=12;
+                break;
+            }
+            case 32: {
+                decode32(val, valn, out, currentPos);
+                currentPos+=11;
+                break;
+            }
+            case 33: {
+                decode33(val, valn, out, currentPos);
+                currentPos+=10;
+                break;
+            }
+            case 34: {
+                decode34(val, valn, out, currentPos);
+                currentPos+=9;
+                break;
+            }
+            case 35: {
+                decode35(val, valn, out, currentPos);
+                currentPos+=8;
+                break;
+            }
+            case 36: {
+                decode36(val, valn, out, currentPos);
+                currentPos+=33;
+                break;
+            }
+            case 37: {
+                decode37(val, valn, out, currentPos);
+                currentPos+=19;
+                break;
+            }
+            case 38: {
+                decode38(val, valn, out, currentPos);
+                currentPos+=14;
+                break;
+            }
+            case 39: {
+                decode39(val, valn, out, currentPos);
+                currentPos+=12;
+                break;
+            }
+            case 40: {
+                decode40(val, valn, out, currentPos);
+                currentPos+=10;
+                break;
+            }
+            case 41: {
+                decode41(val, valn, out, currentPos);
+                currentPos+=9;
+                break;
+            }
+            case 42: { 
+                decode42(val, valn, out, currentPos);
+                currentPos+=8;
+                break;
+            }
+            case 43: { 
+                decode43(val, valn, out, currentPos);
+                currentPos+=7;
+                break;
+            }
+            case 44: {
+                decode44(val, valn, out, currentPos);
+                currentPos+=6;
+                break;
+            }
+            case 45: {
+                decode45(val, valn, out, currentPos);
+                currentPos+=32;
+                break;
+            }
+            case 46: {
+                decode46(val, valn, out, currentPos);
+                currentPos+=18;
+                break;
+            }
+            case 47: { 
+                decode47(val, valn, out, currentPos);
+                currentPos+=13;
+                break;
+            }
+            case 48: {
+                decode48(val, valn, out, currentPos);
+                currentPos+=11;
+                break;
+            }
+            case 49: {
+                decode49(val, valn, out, currentPos);
+                currentPos+=9;
+                break;
+            }
+            case 50: {
+                decode50(val, valn, out, currentPos);
+                currentPos+=8;
+                break;
+            }
+            case 51: {
+                decode51(val, valn, out, currentPos);
+                currentPos+=7;
+                break;
+            }
+            case 52: { 
+                decode52(val, valn, out, currentPos);
+                currentPos+=6;
+                break;
+            }
+            case 53: {
+                decode53(val, valn, out, currentPos);
+                currentPos+=5;
+                break;
+            }
+            case 54: {
+                decode54(val, valn, out, currentPos);
+                currentPos+=31;
+                break;
+            }
+            case 55: {
+                decode55(val, valn, out, currentPos);
+                currentPos+=17;
+                break;
+            }
+            case 56: {
+                decode56(val, valn, out, currentPos);
+                currentPos+=12;
+                break;
+            }
+            case 57: {
+                decode57(val, valn, out, currentPos);
+                currentPos+=10;
+                break;
+            }
+            case 58: { 
+                decode58(val, valn, out, currentPos);
+                currentPos+=8;
+                break;
+            }
+            case 59: {
+                decode59(val, valn, out, currentPos);
+                currentPos+=7;
+                break;
+            }
+            case 60: {
+                decode60(val, valn, out, currentPos);
+                currentPos+=6;
+                break;
+            }
+            case 61: { 
+                decode61(val, valn, out, currentPos);
+                currentPos+=5;
+                break;
+            }
+            case 62: {
+                decode62(val, valn, out, currentPos);
+                currentPos+=4;
+                break;
+            }
+            case 63: {
+                decode63(val, valn, out, currentPos);
+                currentPos+=30;
+                break;
+            }
+            case 64: {
+                decode64(val, valn, out, currentPos);
+                currentPos+=16;
+                break;
+            }
+            case 65: { 
+                decode65(val, valn, out, currentPos);
+                currentPos+=11;
+                break;
+            }
+            case 66: { 
+                decode66(val, valn, out, currentPos);
+                currentPos+=9;
+                break;
+            }
+            case 67: {
+                decode67(val, valn, out, currentPos);
+                currentPos+=7;
+                break;
+            }
+            case 68: { 
+                decode68(val, valn, out, currentPos);
+                currentPos+=6;
+                break;
+            }
+            case 69: { 
+                decode69(val, valn, out, currentPos);
+                currentPos+=5;
+                break;
+            }
+            case 70: {
+                decode70(val, valn, out, currentPos);
+                currentPos+=4;
+                break;
+            }
+            case 71: {
+                decode71(val, valn, out, currentPos);
+                currentPos+=3;
+                break;
+            }
+            case 72: { 
+                decode72(val, valn, out, currentPos);
+                currentPos+=29;
+                break;
+            }
+            case 73: {
+                decode73(val, valn, out, currentPos);
+                currentPos+=15;
+                break;
+            }
+            case 74: {
+                decode74(val, valn, out, currentPos);
+                currentPos+=10;
+                break;
+            }
+            case 75: {
+                decode75(val, valn, out, currentPos);
+                currentPos+=8;
+                break;
+            }
+            case 76: {
+                decode76(val, valn, out, currentPos);
+                currentPos+=6;
+                break;
+            }
+            case 77: {
+                decode77(val, valn, out, currentPos);
+                currentPos+=5;
+                break;
+            }
+            case 78: {
+                decode78(val, valn, out, currentPos);
+                currentPos+=4;
+                break;
+            }
+            case 79: {
+                decode79(val, valn, out, currentPos);
+                currentPos+=3;
+                break;
+            }
+            case 80: {
+                decode80(val, valn, out, currentPos);
+                currentPos+=2;
+                break;
+            }
+            default:
+                throw new RuntimeException("Wrong code: " + header);
+            }// end switch
+        } // end while
+
+        while (currentPos < finalout) {
+            int val = in[tmpinpos++];
+            int header = val >>> 28;
+            switch (header) {
+            case 0: { // number : 28, bitwidth : 1
+                final int howmany = finalout - currentPos < 28 ? finalout - currentPos : 28;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (k + 4)) >>> 31;
+                }
+                break;
+            }
+            case 1: { // number : 14, bitwidth : 2
+                final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+                }
+                break;
+            }
+            case 2: { // number : 9, bitwidth : 3
+                final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+                }
+                break;
+            }
+            case 3: { // number : 7, bitwidth : 4
+                final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+                }
+                break;
+            }
+            case 4: { // number : 5, bitwidth : 5
+                final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+                }
+                break;
+            }
+            case 5: { // number : 4, bitwidth : 7
+                final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+                }
+                break;
+            }
+            case 6: { // number : 3, bitwidth : 9
+                final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+                }
+                break;
+            }
+            case 7: { // number : 2, bitwidth : 14
+                final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+                }
+                break;
+            }
+            case 8: { // number : 1, bitwidth : 28
+                out[currentPos++] = (val << 4) >>> 4;
+                break;
+            }
+            default: {
+                throw new RuntimeException("shouldn't happen");
+            }
+            }
+        }
+
+        outpos.set(finalout);
+        inpos.set(tmpinpos);
+        
+    }
+
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // bound on the compressed size of inlength integers
+        compressedPositions.add(inlength); // all inlength integers are covered by this bound (assumes add() increments the wrapper - confirm)
+        return inlength; // bound used here: one 32-bit output word per input integer
+    }
+}
diff --git a/src/main/java/me/lemire/integercompression/IntCompressor.java b/src/main/java/me/lemire/integercompression/IntCompressor.java
index 87e7bde..30f755c 100644
--- a/src/main/java/me/lemire/integercompression/IntCompressor.java
+++ b/src/main/java/me/lemire/integercompression/IntCompressor.java
@@ -33,20 +33,14 @@ public IntCompressor() {
      * 
      * @param input array to be compressed
      * @return compressed array
-     * @throws UncompressibleInputException if the data is too poorly compressible
      */
     public  int[] compress(int[] input) {
-        int [] compressed = new int[input.length + input.length / 100 + 1024];
+        int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length); // upper bound supplied by the codec itself
+        int[] compressed = new int[maxCompressedLength + 1]; // +1 slot for the input length kept at index 0
+        // Index 0 carries the input length; headlessCompress writes the payload from index 1 onwards
         compressed[0] = input.length;
         IntWrapper outpos = new IntWrapper(1);
-        try {
-            codec.headlessCompress(input, new IntWrapper(0),
-                    input.length, compressed, outpos);
-        } catch (IndexOutOfBoundsException ioebe) {
-            throw new
-            UncompressibleInputException("Your input is too poorly compressible "
-                    + "with the current codec : "+codec);
-        }
+        codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos); // NOTE(review): relies on the codec's bound being a true worst case
         compressed = Arrays.copyOf(compressed,outpos.intValue());
         return compressed;
     }
@@ -58,6 +52,7 @@ public  int[] compress(int[] input) {
      * @return uncompressed array
      */
     public int[] uncompress(int[] compressed) {
+        // Read at index=0 the length of the input, hence enabling .headlessUncompress
         int[] decompressed = new int[compressed[0]];
         IntWrapper inpos = new IntWrapper(1);
         codec.headlessUncompress(compressed, inpos, 
diff --git a/src/main/java/me/lemire/integercompression/IntegerCODEC.java b/src/main/java/me/lemire/integercompression/IntegerCODEC.java
index 7929e48..1dd9a4c 100644
--- a/src/main/java/me/lemire/integercompression/IntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/IntegerCODEC.java
@@ -18,14 +18,14 @@ public interface IntegerCODEC {
          * Compress data from an array to another array.
          * 
          * Both inpos and outpos are modified to represent how much data was
-         * read and written to if 12 ints (inlength = 12) are compressed to 3
+         * read and written to. If 12 ints (inlength = 12) are compressed to 3
          * ints, then inpos will be incremented by 12 while outpos will be
-         * incremented by 3 we use IntWrapper to pass the values by reference.
+         * incremented by 3. We use IntWrapper to pass the values by reference.
          * 
          * @param in
          *                input array
          * @param inpos
-         *                location in the input array
+         *                where to start reading in the array
          * @param inlength
          *                how many integers to compress
          * @param out
@@ -52,7 +52,7 @@ public void compress(int[] in, IntWrapper inpos, int inlength,
          * @param out
          *                array where to write the compressed output
          * @param outpos
-         *                where to write the compressed output in out
+         *                where to start writing the uncompressed output in out
          */
         public void uncompress(int[] in, IntWrapper inpos, int inlength,
                 int[] out, IntWrapper outpos);
diff --git a/src/main/java/me/lemire/integercompression/JustCopy.java b/src/main/java/me/lemire/integercompression/JustCopy.java
index 709b86a..f57282c 100644
--- a/src/main/java/me/lemire/integercompression/JustCopy.java
+++ b/src/main/java/me/lemire/integercompression/JustCopy.java
@@ -42,6 +42,12 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
             
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // bound on the output size for inlength integers
+            compressedPositions.add(inlength); // all inlength integers are covered (assumes add() increments the wrapper - confirm)
+            return inlength; // the bound equals the input length: one output int per input int
+        }
+
         @Override
         public void compress(int[] in, IntWrapper inpos, int inlength,
                 int[] out, IntWrapper outpos) {
diff --git a/src/main/java/me/lemire/integercompression/Kamikaze.java b/src/main/java/me/lemire/integercompression/Kamikaze.java
index fd1ac82..4cab30b 100644
--- a/src/main/java/me/lemire/integercompression/Kamikaze.java
+++ b/src/main/java/me/lemire/integercompression/Kamikaze.java
@@ -38,6 +38,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
         }
     }
 
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // no bound is implemented for this codec yet
+        throw new UnsupportedOperationException("Calculating the max compressed length is not supported yet."); // NOTE(review): any caller that sizes its buffer through this method fails here
+    }
+
     @Override
     public String toString() {
         return "Kamikaze's PForDelta";
@@ -64,4 +69,4 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
         headlessUncompress(in, inpos, inlength, out, outpos, outlength);
 
     }
-}
\ No newline at end of file
+}
diff --git a/src/main/java/me/lemire/integercompression/NewPFD.java b/src/main/java/me/lemire/integercompression/NewPFD.java
index 6dd01aa..3da3002 100644
--- a/src/main/java/me/lemire/integercompression/NewPFD.java
+++ b/src/main/java/me/lemire/integercompression/NewPFD.java
@@ -132,6 +132,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
diff --git a/src/main/java/me/lemire/integercompression/NewPFDS16.java b/src/main/java/me/lemire/integercompression/NewPFDS16.java
index 98370d2..526b8fb 100644
--- a/src/main/java/me/lemire/integercompression/NewPFDS16.java
+++ b/src/main/java/me/lemire/integercompression/NewPFDS16.java
@@ -131,6 +131,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
diff --git a/src/main/java/me/lemire/integercompression/NewPFDS9.java b/src/main/java/me/lemire/integercompression/NewPFDS9.java
index c8389c1..bd802b6 100644
--- a/src/main/java/me/lemire/integercompression/NewPFDS9.java
+++ b/src/main/java/me/lemire/integercompression/NewPFDS9.java
@@ -130,6 +130,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
diff --git a/src/main/java/me/lemire/integercompression/OptPFD.java b/src/main/java/me/lemire/integercompression/OptPFD.java
index 8c90586..cfda92e 100644
--- a/src/main/java/me/lemire/integercompression/OptPFD.java
+++ b/src/main/java/me/lemire/integercompression/OptPFD.java
@@ -147,6 +147,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
diff --git a/src/main/java/me/lemire/integercompression/OptPFDS16.java b/src/main/java/me/lemire/integercompression/OptPFDS16.java
index 8574b10..95c4f62 100644
--- a/src/main/java/me/lemire/integercompression/OptPFDS16.java
+++ b/src/main/java/me/lemire/integercompression/OptPFDS16.java
@@ -147,6 +147,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
@@ -197,4 +208,4 @@ public String toString() {
                 return this.getClass().getSimpleName();
         }
 
-}
\ No newline at end of file
+}
diff --git a/src/main/java/me/lemire/integercompression/OptPFDS9.java b/src/main/java/me/lemire/integercompression/OptPFDS9.java
index 34f4206..0e2563b 100644
--- a/src/main/java/me/lemire/integercompression/OptPFDS9.java
+++ b/src/main/java/me/lemire/integercompression/OptPFDS9.java
@@ -146,6 +146,17 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
                 decodePage(in, inpos, out, outpos, mynvalue);
         }
 
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+                // Work in whole blocks: Util.greatestMultiple presumably rounds inlength down to a multiple of BLOCK_SIZE.
+                final int roundedLength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+                compressedPositions.add(roundedLength);
+                // Worst case per block: one header word, plus a payload that getBestBFromData keeps
+                // within BLOCK_SIZE integers by limiting the memory used for exceptions.
+                final int worstCasePerBlock = BLOCK_SIZE + 1;
+                return worstCasePerBlock * (roundedLength / BLOCK_SIZE);
+        }
+
         private void decodePage(int[] in, IntWrapper inpos, int[] out,
                 IntWrapper outpos, int thissize) {
                 int tmpoutpos = outpos.get();
@@ -197,4 +208,4 @@ public String toString() {
                 return this.getClass().getSimpleName();
         }
 
-}
\ No newline at end of file
+}
diff --git a/src/main/java/me/lemire/integercompression/S16.java b/src/main/java/me/lemire/integercompression/S16.java
index 08ffbc4..e40522d 100644
--- a/src/main/java/me/lemire/integercompression/S16.java
+++ b/src/main/java/me/lemire/integercompression/S16.java
@@ -15,191 +15,191 @@
  */
 public final class S16 {
 
-	/**
-	 * Compress an integer array using Simple16
-	 *
-	 * 
-	 * @param in
-	 *            array to compress
-	 * @param currentPos
-	 *            where to start reading
-	 * @param inlength
-	 *            how many integers to read
-	 * @param out
-	 *            output array
-	 * @param tmpoutpos
-	 *            location in the output array
-	 * @return the number of 32-bit words written (in compressed form)
-	 */
-	public static int compress(final int[] in, int currentPos, int inlength, final int out[], final int tmpoutpos) {
-		int outpos = tmpoutpos;
-		final int finalin = currentPos + inlength;
-		while (currentPos < finalin) {
-			int inoffset = compressblock(out, outpos++, in, currentPos, inlength);
-			if (inoffset == -1)
-				throw new RuntimeException("Too big a number");
-			currentPos += inoffset;
-			inlength -= inoffset;
-		}
-		return outpos - tmpoutpos;
-	}
-
-	/**
-	 * Estimate size of the compressed output.
-	 * 
-	 * @param in
-	 *            array to compress
-	 * @param currentPos
-	 *            where to start reading
-	 * @param inlength
-	 *            how many integers to read
-	 * @return estimated size of the output (in 32-bit integers)
-	 */
-	public static int estimatecompress(final int[] in, int currentPos, int inlength) {
-		final int finalin = currentPos + inlength;
-		int counter = 0;
-		while (currentPos < finalin) {
-			int inoffset = fakecompressblock(in, currentPos, inlength);
-			if (inoffset == -1)
-				throw new RuntimeException("Too big a number");
-			currentPos += inoffset;
-			inlength -= inoffset;
-			++counter;
-		}
-		return counter;
-	}
-
-	/**
-	 * Compress an integer array using Simple16
-	 * 
-	 * @param out
-	 *            the compressed output
-	 * @param outOffset
-	 *            the offset of the output in the number of integers
-	 * @param in
-	 *            the integer input array
-	 * @param inOffset
-	 *            the offset of the input in the number of integers
-	 * @param n
-	 *            the number of elements to be compressed
-	 * @return the size of the outputs in 32-bit integers
-	 * 
-	 */
-	public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
-		int numIdx, j, num, bits;
-		for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
-			out[outOffset] = numIdx << S16_BITSSIZE;
-			num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
-
-			for (j = 0, bits = 0; (j < num) && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
-				out[outOffset] |= (in[inOffset + j] << bits);
-				bits += S16_BITS[numIdx][j];
-				j++;
-			}
-
-			if (j == num) {
-				return num;
-			}
-		}
-
-		return -1;
-	}
-
-	private static final int fakecompressblock(int[] in, int inOffset, int n) {
-		int numIdx, j, num;
-		for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
-			num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
-
-			for (j = 0; (j < num) && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
-				j++;
-			}
-
-			if (j == num) {
-				return num;
-			}
-		}
-
-		return -1;
-	}
-
-	/**
-	 * Decompress an integer array using Simple16
-	 * 
-	 * @param out
-	 *            the decompressed output
-	 * @param outOffset
-	 *            the offset of the output in the number of integers
-	 * @param in
-	 *            the compressed input array
-	 * @param inOffset
-	 *            the offset of the input in the number of integers
-	 * @param n
-	 *            the number of elements to be compressed
-	 * @return the number of processed integers
-	 */
-	public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
-		int numIdx, j = 0, bits = 0;
-		numIdx = in[inOffset] >>> S16_BITSSIZE;
-		int num = S16_NUM[numIdx] < n ? S16_NUM[numIdx] : n;
-		for (j = 0, bits = 0; j < num; j++) {
-			out[outOffset + j] = (in[inOffset] >>> bits) & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
-			bits += S16_BITS[numIdx][j];
-		}
-		return num;
-	}
-
-	/**
-	 * Uncompressed data from an input array into an output array
-	 *
-	 * @param in
-	 *            input array (in compressed form)
-	 * @param tmpinpos
-	 *            starting location in the compressed input array
-	 * @param inlength
-	 *            how much data we wish the read (in 32-bit words)
-	 * @param out
-	 *            output array (in decompressed form)
-	 * @param currentPos
-	 *            current position in the output array
-	 * @param outlength
-	 *            available data in the output array
-	 */
-	public static void uncompress(final int[] in, int tmpinpos, final int inlength, final int[] out, int currentPos,
-			int outlength) {
-		final int finalpos = tmpinpos + inlength;
-		while (tmpinpos < finalpos) {
-			final int howmany = decompressblock(out, currentPos, in, tmpinpos, outlength);
-			outlength -= howmany;
-			currentPos += howmany;
-			tmpinpos += 1;
-		}
-
-	}
-
-	private static int[][] shiftme(int[][] x) {
-		int[][] answer = new int[x.length][];
-		for (int k = 0; k < x.length; ++k) {
-			answer[k] = new int[x[k].length];
-			for (int z = 0; z < answer[k].length; ++z)
-				answer[k][z] = 1 << x[k][z];
-		}
-		return answer;
-	}
-
-	private static final int S16_NUMSIZE = 16;
-	private static final int S16_BITSSIZE = 28;
-	// the possible number of bits used to represent one integer
-	private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
-	// the corresponding number of elements for each value of the number of
-	// bits
-	private static final int[][] S16_BITS = {
-			{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-			{ 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-			{ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
-			{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
-			{ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
-			{ 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
-			{ 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
-	private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS);
+    /**
+     * Compress an integer array using Simple16, one packed word per compressblock
+     * call; a RuntimeException is raised when compressblock reports -1.
+     *
+     * @param in
+     *            array holding the values to compress
+     * @param currentPos
+     *            index in the input array of the first value to read
+     * @param inlength
+     *            how many integers to read
+     * @param out
+     *            array receiving the packed 32-bit words
+     * @param tmpoutpos
+     *            index in the output array of the first word to write
+     * @return the number of 32-bit words written (in compressed form)
+     */
+    public static int compress(final int[] in, int currentPos, int inlength, final int out[], final int tmpoutpos) {
+        final int end = currentPos + inlength;
+        int writePos = tmpoutpos;
+        // one iteration == one output word; 'remaining' shrinks by the values that word absorbed
+        for (int remaining = inlength; currentPos < end; ++writePos) {
+            final int packed = compressblock(out, writePos, in, currentPos, remaining);
+            if (packed == -1)
+                throw new RuntimeException("Too big a number");
+            currentPos += packed;
+            remaining -= packed;
+        }
+        return writePos - tmpoutpos;
+    }
+
+    /**
+     * Count the 32-bit words needed to pack the input, without writing them.
+     *
+     * @param in
+     *            array holding the values that would be compressed
+     * @param currentPos
+     *            index of the first value to examine
+     * @param inlength
+     *            how many integers to examine
+     * @return number of 32-bit words the packed output would occupy
+     */
+    public static int estimatecompress(final int[] in, int currentPos, int inlength) {
+        final int end = currentPos + inlength;
+        int words = 0;
+        for (int remaining = inlength; currentPos < end; ++words) {
+            final int taken = fakecompressblock(in, currentPos, remaining);
+            if (taken == -1)
+                throw new RuntimeException("Too big a number");
+            // same bookkeeping as compress(), minus the actual write
+            currentPos += taken;
+            remaining -= taken;
+        }
+        return words;
+    }
+
+    /**
+     * Pack the leading integers into one Simple16 word, using the first layout
+     * that fits. Values are compared as unsigned, so negative inputs never fit.
+     * @param out
+     *            the compressed output
+     * @param outOffset
+     *            index in the output array of the single word to write
+     * @param in
+     *            the integer input array
+     * @param inOffset
+     *            index in the input array of the first value to pack
+     * @param n
+     *            the number of elements still available to be compressed
+     * @return how many input integers were packed into the word, or -1 when no
+     *         layout can hold the next value(s)
+     */
+    public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
+        int numIdx, j, num, bits;
+        for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
+            out[outOffset] = numIdx << S16_BITSSIZE; // restart the word with this layout's selector
+            num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
+            // Unsigned range check: with a signed '<' a negative value passed and its sign bits clobbered the word.
+            for (j = 0, bits = 0; (j < num) && (Integer.compareUnsigned(in[inOffset + j], SHIFTED_S16_BITS[numIdx][j]) < 0);) {
+                out[outOffset] |= (in[inOffset + j] << bits);
+                bits += S16_BITS[numIdx][j];
+                j++;
+            }
+            // j == num means every candidate value fitted its slot, so the word is complete
+            if (j == num) {
+                return num;
+            }
+        }
+
+        return -1;
+    }
+
+    /** Dry run of compressblock: same layout search, nothing is written. Returns the number of
+     *  values the first fitting layout would absorb, or -1 when no layout fits. */
+    private static final int fakecompressblock(int[] in, int inOffset, int n) {
+        int numIdx, j, num;
+        for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
+            num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
+            // Unsigned range check: a signed '<' would count negative values as fitting every slot.
+            for (j = 0; (j < num) && (Integer.compareUnsigned(in[inOffset + j], SHIFTED_S16_BITS[numIdx][j]) < 0);) {
+                j++;
+            }
+            if (j == num) {
+                return num;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Unpack a single Simple16 word into plain integers.
+     *
+     * @param out
+     *            array receiving the decoded values
+     * @param outOffset
+     *            index in the output array where the first decoded value goes
+     * @param in
+     *            the compressed input array
+     * @param inOffset
+     *            index in the input array of the word to decode
+     * @param n
+     *            upper bound on how many values to decode from this word
+     * @return the number of values actually written to the output array
+     */
+    public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
+        final int selector = in[inOffset] >>> S16_BITSSIZE;
+        final int[] widths = S16_BITS[selector];
+        final int count = Math.min(S16_NUM[selector], n);
+        for (int k = 0, shift = 0; k < count; ++k) {
+            out[outOffset + k] = (in[inOffset] >>> shift) & (-1 >>> (32 - widths[k])); // word re-read each pass, as before
+            shift += widths[k];
+        }
+        return count;
+    }
+
+    /**
+     * Uncompress Simple16 words from an input array into an output array.
+     *
+     * @param in
+     *            input array (in compressed form)
+     * @param tmpinpos
+     *            index of the first compressed word to read
+     * @param inlength
+     *            how many 32-bit words to read
+     * @param out
+     *            output array (in decompressed form)
+     * @param currentPos
+     *            index in the output array of the first value to write
+     * @param outlength
+     *            how many values may still be written to the output array
+     */
+    public static void uncompress(final int[] in, int tmpinpos, final int inlength, final int[] out, int currentPos,
+            int outlength) {
+        final int end = tmpinpos + inlength;
+        for (int word = tmpinpos; word < end; ++word) {
+            // decompressblock never decodes more than the remaining 'outlength' values
+            final int decoded = decompressblock(out, currentPos, in, word, outlength);
+            currentPos += decoded;
+            outlength -= decoded;
+        }
+    }
+    // (end of uncompress)
+
+    private static int[][] shiftme(int[][] x) {
+        int[][] answer = new int[x.length][];
+        for (int k = 0; k < x.length; ++k) {
+            answer[k] = new int[x[k].length];
+            for (int z = 0; z < answer[k].length; ++z)
+                answer[k][z] = 1 << x[k][z];
+        }
+        return answer;
+    }
+
+    private static final int S16_NUMSIZE = 16; // number of layouts, i.e. selector values 0..15
+    private static final int S16_BITSSIZE = 28; // the selector is stored shifted left by this many bits
+    // S16_NUM[k]: how many integers layout k packs into one word
+    private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
+    // S16_BITS[k][j]: bit width of the j-th integer in layout k; row k has
+    // S16_NUM[k] entries and its widths add up to 28
+    private static final int[][] S16_BITS = {
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
+            { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
+            { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
+            { 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
+    private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS); // [k][j] == 1 << S16_BITS[k][j], the exclusive upper bound for that slot
 
 }
diff --git a/src/main/java/me/lemire/integercompression/S9.java b/src/main/java/me/lemire/integercompression/S9.java
index 2180e5a..7e03e42 100644
--- a/src/main/java/me/lemire/integercompression/S9.java
+++ b/src/main/java/me/lemire/integercompression/S9.java
@@ -17,187 +17,187 @@
 public final class S9 {
 
 
-	/**
-	 * Estimate size of the compressed output.
-	 * 
-	 * @param in
-	 *            array to compress
-	 * @param currentPos
-	 *            where to start reading
-	 * @param inlength
-	 *            how many integers to read
-	 * @return estimated size of the output (in 32-bit integers)
-	 */
-	public static int estimatecompress(int[] in, int currentPos, int inlength) {
-		int tmpoutpos = 0;
-		int finalpos = currentPos + inlength;
-		outer: while (currentPos < finalpos) {
-			mainloop: for (int selector = 0; selector < 8; selector++) {
+    /**
+     * Estimate size of the compressed output by running the Simple9 selector
+     * search over the input without writing anything.
+     *
+     * @param in array to compress
+     * @param currentPos where to start reading
+     * @param inlength how many integers to read
+     * @return estimated size of the output (in 32-bit integers)
+     * @throws RuntimeException if a value is rejected by every selector,
+     *             i.e. it does not fit in 28 bits
+     */
+    public static int estimatecompress(int[] in, int currentPos, int inlength) {
+        int tmpoutpos = 0;
+        int finalpos = currentPos + inlength;
+        outer: while (currentPos < finalpos) {
+            mainloop: for (int selector = 0; selector < 8; selector++) {
 
-				int compressedNum = codeNum[selector];
-				if (finalpos <= currentPos + compressedNum - 1)
-					compressedNum = finalpos - currentPos;
-				int b = bitLength[selector];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++)
-					if (Util.smallerorequalthan(max , in[currentPos + i]))
-						continue mainloop;
-				currentPos += compressedNum;
-				++tmpoutpos;
-				continue outer;
-			}
-			final int selector = 8;
-			if (in[currentPos] >= 1 << bitLength[selector])
-				throw new RuntimeException("Too big a number");
-			tmpoutpos++;
-			currentPos++;
+                int compressedNum = codeNum[selector];
+                if (finalpos <= currentPos + compressedNum - 1) // fewer values left than the selector holds
+                    compressedNum = finalpos - currentPos;
+                int b = bitLength[selector];
+                int max = 1 << b;
+                int i = 0;
+                for (; i < compressedNum; i++)
+                    if (Util.smallerorequalthan(max , in[currentPos + i]))
+                        continue mainloop; // too wide for this selector
+                currentPos += compressedNum;
+                ++tmpoutpos;
+                continue outer;
+            }
+            final int selector = 8;
+            if (Util.smallerorequalthan(1 << bitLength[selector], in[currentPos])) // same test as selectors 0-7
+                throw new RuntimeException("Too big a number");
+            tmpoutpos++;
+            currentPos++;
 
-		}
-		return tmpoutpos;
-	}
+        }
+        return tmpoutpos;
+    }
 
-	/**
-	 * Compress an integer array using Simple9
-	 *
-	 * 
-	 * @param in
-	 *            array to compress
-	 * @param currentPos
-	 *            where to start reading
-	 * @param inlength
-	 *            how many integers to read
-	 * @param out
-	 *            output array
-	 * @param tmpoutpos
-	 *            location in the output array
-	 * @return the number of 32-bit words written (in compressed form)
-	 */
-	public static int compress(int[] in, int currentPos, int inlength, int out[], int tmpoutpos) {
-		int origtmpoutpos = tmpoutpos;
-		int finalpos = currentPos + inlength;
-		outer: while (currentPos < finalpos) {
-			mainloop: for (int selector = 0; selector < 8; selector++) {
-				int res = 0;
-				int compressedNum = codeNum[selector];
-				if (finalpos <= currentPos + compressedNum - 1)
-					compressedNum = finalpos - currentPos;
-				int b = bitLength[selector];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (Util.smallerorequalthan(max, in[currentPos + i]))
-						continue mainloop;
-					res = (res << b) + in[currentPos + i];
-				}
-				if (compressedNum != codeNum[selector])
-					res <<= (codeNum[selector] - compressedNum) * b;
-				res |= selector << 28;
-				out[tmpoutpos++] = res;
-				currentPos += compressedNum;
-				continue outer;
-			}
-			final int selector = 8;
-			if (in[currentPos] >= 1 << bitLength[selector])
-				throw new RuntimeException("Too big a number");
-			out[tmpoutpos++] = in[currentPos++] | (selector << 28);
-		}
-		return tmpoutpos - origtmpoutpos;
-	}
+    /**
+     * Compress an integer array using Simple9.
+     *
+     * Every output word keeps its selector in the top 4 bits and packed
+     * values below; selectors 0-7 are tried in order before falling back to
+     * selector 8, which stores a single value.
+     *
+     * @param in array to compress
+     * @param currentPos where to start reading
+     * @param inlength how many integers to read
+     * @param out output array
+     * @param tmpoutpos location in the output array
+     * @return the number of 32-bit words written (in compressed form)
+     * @throws RuntimeException if a value is rejected by every selector,
+     *             i.e. it does not fit in 28 bits
+     */
+    public static int compress(int[] in, int currentPos, int inlength, int out[], int tmpoutpos) {
+        int origtmpoutpos = tmpoutpos;
+        int finalpos = currentPos + inlength;
+        outer: while (currentPos < finalpos) {
+            mainloop: for (int selector = 0; selector < 8; selector++) {
+                int res = 0;
+                int compressedNum = codeNum[selector];
+                if (finalpos <= currentPos + compressedNum - 1) // fewer values left than the selector holds
+                    compressedNum = finalpos - currentPos;
+                int b = bitLength[selector];
+                int max = 1 << b;
+                for (int i = 0; i < compressedNum; i++) {
+                    if (Util.smallerorequalthan(max, in[currentPos + i]))
+                        continue mainloop; // too wide for this selector
+                    res = (res << b) + in[currentPos + i];
+                }
+                if (compressedNum != codeNum[selector]) // short final group: left-align it
+                    res <<= (codeNum[selector] - compressedNum) * b;
+                res |= selector << 28;
+                out[tmpoutpos++] = res;
+                currentPos += compressedNum;
+                continue outer;
+            }
+            final int selector = 8;
+            // Same test as selectors 0-7; a signed ">=" here let negative values through.
+            if (Util.smallerorequalthan(1 << bitLength[selector], in[currentPos]))
+                throw new RuntimeException("Too big a number");
+            out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+        }
+        return tmpoutpos - origtmpoutpos;
+    }
 
-	/**
-	 * Uncompressed data from an input array into an output array
-	 * 
-	 * @param in
-	 *            input array (in compressed form)
-	 * @param tmpinpos
-	 *            starting location in the compressed input array
-	 * @param inlength
-	 *            how much data we wish the read (in 32-bit words)
-	 * @param out
-	 *            output array (in decompressed form)
-	 * @param currentPos
-	 *            current position in the output array
-	 * @param outlength
-	 *            available data in the output array
-	 */
-	public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
-		int finallength = currentPos + outlength;
+    /**
+     * Uncompress data from an input array into an output array.
+     *
+     * The top 4 bits of each word select how its remaining 28 bits are split.
+     * Decoding stops once outlength integers have been written; a final
+     * word that holds more values than remain is only partly decoded.
+     *
+     * @param in input array (in compressed form)
+     * @param tmpinpos starting location in the compressed input array
+     * @param inlength how much data we wish to read (in 32-bit words); not
+     *            read by this method, the loop is bounded by outlength
+     * @param out output array (in decompressed form)
+     * @param currentPos current position in the output array
+     * @param outlength number of integers to decode into the output array
+     * @throws RuntimeException if a word carries a selector above 8
+     */
+    public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
+        int finallength = currentPos + outlength;
 
-		while (currentPos < finallength) {
-			int val = in[tmpinpos++];
-			int header = val >>> 28;
-			switch (header) {
-			case 0: { // number : 28, bitwidth : 1
-				final int howmany = finallength - currentPos < 28 ? finallength - currentPos : 28;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (k + 4)) >>> 31;
-				}
-				break;
-			}
-			case 1: { // number : 14, bitwidth : 2
-				final int howmany = finallength - currentPos < 14 ? finallength - currentPos : 14;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (2 * k + 4)) >>> 30;
-				}
-				break;
-			}
-			case 2: { // number : 9, bitwidth : 3
-				final int howmany = finallength - currentPos < 9 ? finallength - currentPos : 9;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (3 * k + 5)) >>> 29;
-				}
-				break;
-			}
-			case 3: { // number : 7, bitwidth : 4
-				final int howmany = finallength - currentPos < 7 ? finallength - currentPos : 7;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (4 * k + 4)) >>> 28;
-				}
-				break;
-			}
-			case 4: { // number : 5, bitwidth : 5
-				final int howmany = finallength - currentPos < 5 ? finallength - currentPos : 5;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (5 * k + 7)) >>> 27;
-				}
-				break;
-			}
-			case 5: { // number : 4, bitwidth : 7
-				final int howmany = finallength - currentPos < 4 ? finallength - currentPos : 4;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (7 * k + 4)) >>> 25;
-				}
-				break;
-			}
-			case 6: { // number : 3, bitwidth : 9
-				final int howmany = finallength - currentPos < 3 ? finallength - currentPos : 3;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (9 * k + 5)) >>> 23;
-				}
-				break;
-			}
-			case 7: { // number : 2, bitwidth : 14
-				final int howmany = finallength - currentPos < 2 ? finallength - currentPos : 2;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (14 * k + 4)) >>> 18;
-				}
-				break;
-			}
-			case 8: { // number : 1, bitwidth : 28
-				out[currentPos++] = (val << 4) >>> 4;
-				break;
-			}
-			default: {
-				throw new RuntimeException("shouldn't happen");
-			}
-			}
-		}
+        while (currentPos < finallength) {
+            int val = in[tmpinpos++];
+            int header = val >>> 28; // selector: the top 4 bits of the word
+            switch (header) {
+            case 0: { // number : 28, bitwidth : 1
+                final int howmany = finallength - currentPos < 28 ? finallength - currentPos : 28; // never write past finallength
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (k + 4)) >>> 31;
+                }
+                break;
+            }
+            case 1: { // number : 14, bitwidth : 2
+                final int howmany = finallength - currentPos < 14 ? finallength - currentPos : 14;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+                }
+                break;
+            }
+            case 2: { // number : 9, bitwidth : 3
+                final int howmany = finallength - currentPos < 9 ? finallength - currentPos : 9;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (3 * k + 5)) >>> 29; // 9 * 3 = 27 bits, so one extra bit is skipped
+                }
+                break;
+            }
+            case 3: { // number : 7, bitwidth : 4
+                final int howmany = finallength - currentPos < 7 ? finallength - currentPos : 7;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+                }
+                break;
+            }
+            case 4: { // number : 5, bitwidth : 5
+                final int howmany = finallength - currentPos < 5 ? finallength - currentPos : 5;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (5 * k + 7)) >>> 27; // 5 * 5 = 25 bits, so three extra bits are skipped
+                }
+                break;
+            }
+            case 5: { // number : 4, bitwidth : 7
+                final int howmany = finallength - currentPos < 4 ? finallength - currentPos : 4;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+                }
+                break;
+            }
+            case 6: { // number : 3, bitwidth : 9
+                final int howmany = finallength - currentPos < 3 ? finallength - currentPos : 3;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (9 * k + 5)) >>> 23; // 3 * 9 = 27 bits, so one extra bit is skipped
+                }
+                break;
+            }
+            case 7: { // number : 2, bitwidth : 14
+                final int howmany = finallength - currentPos < 2 ? finallength - currentPos : 2;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+                }
+                break;
+            }
+            case 8: { // number : 1, bitwidth : 28
+                out[currentPos++] = (val << 4) >>> 4;
+                break;
+            }
+            default: {
+                throw new RuntimeException("shouldn't happen");
+            }
+            }
+        }
 
-	}
+    }
 
-	private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+    private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; // bits per value, indexed by selector
 
-	private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+    private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 }; // values per word, indexed by selector
 
 }
diff --git a/src/main/java/me/lemire/integercompression/Simple16.java b/src/main/java/me/lemire/integercompression/Simple16.java
index e0f9d5a..2b7f27f 100644
--- a/src/main/java/me/lemire/integercompression/Simple16.java
+++ b/src/main/java/me/lemire/integercompression/Simple16.java
@@ -13,173 +13,179 @@
  */
 public final class Simple16 implements IntegerCODEC, SkippableIntegerCODEC {
 
-	public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
-		int i_inpos = inpos.get();
-		int i_outpos = outpos.get();
-		final int finalin = i_inpos + inlength;
-		while (i_inpos < finalin) {
-			int inoffset = compressblock(out, i_outpos++, in, i_inpos, inlength);
-			if (inoffset == -1)
-				throw new RuntimeException("Too big a number");
-			i_inpos += inoffset;
-			inlength -= inoffset;
-		}
-		inpos.set(i_inpos);
-		outpos.set(i_outpos);
-	}
-
-	/**
-	 * Compress an integer array using Simple16
-	 * 
-	 * @param out
-	 *            the compressed output
-	 * @param outOffset
-	 *            the offset of the output in the number of integers
-	 * @param in
-	 *            the integer input array
-	 * @param inOffset
-	 *            the offset of the input in the number of integers
-	 * @param n
-	 *            the number of elements to be compressed
-	 * @return the number of compressed integers
-	 */
-	public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
-		int numIdx, j, num, bits;
-		for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
-			out[outOffset] = numIdx << S16_BITSSIZE;
-			num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
-
-			for (j = 0, bits = 0; (j < num) && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
-				out[outOffset] |= (in[inOffset + j] << bits);
-				bits += S16_BITS[numIdx][j];
-				j++;
-			}
-
-			if (j == num) {
-				return num;
-			}
-		}
-
-		return -1;
-	}
-
-	/**
-	 * Decompress an integer array using Simple16
-	 * 
-	 * @param out
-	 *            the decompressed output
-	 * @param outOffset
-	 *            the offset of the output in the number of integers
-	 * @param in
-	 *            the compressed input array
-	 * @param inOffset
-	 *            the offset of the input in the number of integers
-	 * @param n
-	 *            the number of elements to be compressed
-	 * @return the number of processed integers
-	 */
-	public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
-		int numIdx, j = 0, bits = 0;
-		numIdx = in[inOffset] >>> S16_BITSSIZE;
-		int num = S16_NUM[numIdx] < n ? S16_NUM[numIdx] : n;
-		for (j = 0, bits = 0; j < num; j++) {
-			out[outOffset + j] = (in[inOffset] >>> bits) & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
-			bits += S16_BITS[numIdx][j];
-		}
-		return num;
-	}
-
-	@Override
-	public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
-		int i_inpos = inpos.get();
-		int i_outpos = outpos.get();
-		while (num > 0) {
-			final int howmany = decompressblock(out, i_outpos, in, i_inpos, num);
-			num -= howmany;
-			i_outpos += howmany;
-			i_inpos++;
-		}
-		inpos.set(i_inpos);
-		outpos.set(i_outpos);
-	}
-
-	/**
-	 * Uncompress data from an array to another array.
-	 * 
-	 * Both inpos and outpos parameters are modified to indicate new positions
-	 * after read/write.
-	 * 
-	 * @param in
-	 *            array containing data in compressed form
-	 * @param tmpinpos
-	 *            where to start reading in the array
-	 * @param inlength
-	 *            length of the compressed data (ignored by some schemes)
-	 * @param out
-	 *            array where to write the compressed output
-	 * @param currentPos
-	 *            where to write the compressed output in out
-	 * @param outlength
-	 *            number of integers we want to decode
-	 */
-	public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
-		final int finalpos = tmpinpos + inlength;
-		while (tmpinpos < finalpos) {
-			final int howmany = decompressblock(out, currentPos, in, tmpinpos, outlength);
-			outlength -= howmany;
-			currentPos += howmany;
-			tmpinpos += 1;
-		}
-
-	}
-
-	private static int[][] shiftme(int[][] x) {
-		int[][] answer = new int[x.length][];
-		for (int k = 0; k < x.length; ++k) {
-			answer[k] = new int[x[k].length];
-			for (int z = 0; z < answer[k].length; ++z)
-				answer[k][z] = 1 << x[k][z];
-		}
-		return answer;
-	}
-
-	@Override
-	public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		out[outpos.get()] = inlength;
-		outpos.increment();
-		headlessCompress(in, inpos, inlength, out, outpos);
-	}
-
-	@Override
-	public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		final int outlength = in[inpos.get()];
-		inpos.increment();
-		headlessUncompress(in, inpos, inlength, out, outpos, outlength);
-
-	}
-
-	@Override
-	public String toString() {
-		return this.getClass().getSimpleName();
-	}
-
-	private static final int S16_NUMSIZE = 16;
-	private static final int S16_BITSSIZE = 28;
-	// the possible number of bits used to represent one integer
-	private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
-	// the corresponding number of elements for each value of the number of bits
-	private static final int[][] S16_BITS = {
-			{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-			{ 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-			{ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
-			{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
-			{ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
-			{ 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
-			{ 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
-	private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS);
-
-}
\ No newline at end of file
+    public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
+        // Each compressblock call fills exactly one output word.
+        int readAt = inpos.get();
+        int writeAt = outpos.get();
+        for (int left = inlength, stop = readAt + inlength; readAt < stop; ++writeAt) {
+            final int packed = compressblock(out, writeAt, in, readAt, left);
+            if (packed == -1)
+                throw new RuntimeException("Too big a number");
+            readAt += packed;
+            left -= packed;
+        }
+        inpos.set(readAt);
+        outpos.set(writeAt);
+    }
+
+    /**
+     * Compress an integer array using Simple16: pack as many leading input
+     * values as one layout allows into the single word out[outOffset].
+     *
+     * @param out the compressed output
+     * @param outOffset the offset of the output in the number of integers
+     * @param in the integer input array
+     * @param inOffset the offset of the input in the number of integers
+     * @param n the number of elements to be compressed
+     * @return the number of compressed integers, or -1 when no layout can
+     *         hold the next value (it is negative or needs more than 28 bits)
+     */
+    public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
+        int numIdx, j, num, bits;
+        // Layouts are tried in table order; the first one that fits is used.
+        for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
+            out[outOffset] = numIdx << S16_BITSSIZE; // layout index above the payload
+            num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
+            for (j = 0, bits = 0; j < num; j++) {
+                final int value = in[inOffset + j];
+                // A signed "<" alone would accept negative values, whose set
+                // high bits would then overwrite the layout index.
+                if (value < 0 || value >= SHIFTED_S16_BITS[numIdx][j])
+                    break;
+                out[outOffset] |= value << bits;
+                bits += S16_BITS[numIdx][j];
+            }
+            if (j == num) {
+                return num;
+            }
+        }
+
+        return -1;
+    }
+
+    /**
+     * Decompress one Simple16 word: unpack the integers held in in[inOffset]
+     * into out, starting at out[outOffset].
+     *
+     * @param out the decompressed output
+     * @param outOffset the offset of the output in the number of integers
+     * @param in the compressed input array
+     * @param inOffset the offset of the input in the number of integers
+     * @param n the maximum number of integers to unpack from the word
+     * @return the number of processed integers
+     */
+    public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
+        final int word = in[inOffset];
+        final int layout = word >>> S16_BITSSIZE; // layout index sits above the payload bits
+        final int[] widths = S16_BITS[layout];
+        final int count = Math.min(S16_NUM[layout], n);
+        int shift = 0;
+        for (int slot = 0; slot < count; ++slot) {
+            final int width = widths[slot];
+            // keep only the low 'width' bits of the shifted word
+            out[outOffset + slot] = (word >>> shift) & (-1 >>> (32 - width));
+            shift += width;
+        }
+        return count;
+    }
+
+    @Override
+    public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
+        // One input word is decoded per pass until num integers are written; inlength is not consulted.
+        int readAt = inpos.get();
+        int writeAt = outpos.get();
+        for (int left = num; left > 0; ++readAt) {
+            final int decoded = decompressblock(out, writeAt, in, readAt, left);
+            writeAt += decoded;
+            left -= decoded;
+        }
+        inpos.set(readAt);
+        outpos.set(writeAt);
+    }
+
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        compressedPositions.add(inlength); // NOTE(review): presumably advances the wrapped position by inlength - confirm IntWrapper.add
+        return inlength; // bound of one output word per input integer (the smallest S16_NUM entry is 1)
+    }
+
+    /**
+     * Uncompress data from an array to another array.
+     * 
+     * Both inpos and outpos parameters are modified to indicate new positions
+     * after read/write.
+     * 
+     * @param in
+     *            array containing data in compressed form
+     * @param tmpinpos
+     *            where to start reading in the array
+     * @param inlength
+     *            length of the compressed data (ignored by some schemes)
+     * @param out
+     *            array where to write the compressed output
+     * @param currentPos
+     *            where to write the compressed output in out
+     * @param outlength
+     *            number of integers we want to decode
+     */
+    public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
+        final int finalpos = tmpinpos + inlength;
+        while (tmpinpos < finalpos) {
+            final int howmany = decompressblock(out, currentPos, in, tmpinpos, outlength);
+            outlength -= howmany;
+            currentPos += howmany;
+            tmpinpos += 1;
+        }
+
+    }
+
+    private static int[][] shiftme(int[][] x) {
+        int[][] answer = new int[x.length][];
+        for (int k = 0; k < x.length; ++k) {
+            answer[k] = new int[x[k].length];
+            for (int z = 0; z < answer[k].length; ++z)
+                answer[k][z] = 1 << x[k][z];
+        }
+        return answer;
+    }
+
+    @Override
+    public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        if (inlength == 0) return; // nothing to encode, not even a header word
+        final int headerAt = outpos.get();
+        out[headerAt] = inlength; // header word: how many integers follow
+        outpos.increment();
+        headlessCompress(in, inpos, inlength, out, outpos);
+    }
+
+    @Override
+    public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        if (inlength == 0) return; // empty input carries no header word
+        // The first word is the header: the number of integers to decode.
+        final int count = in[inpos.get()];
+        inpos.increment();
+        // inlength is forwarded exactly as received
+        headlessUncompress(in, inpos, inlength, out, outpos, count);
+    }
+
+    @Override
+    public String toString() {
+        return getClass().getSimpleName(); // simple name of the runtime class
+    }
+
+    private static final int S16_NUMSIZE = 16; // number of layouts tried by compressblock
+    private static final int S16_BITSSIZE = 28; // payload bits per word; the layout index is shifted above them
+    // S16_NUM[k]: how many integers layout k packs into one word (equals S16_BITS[k].length)
+    private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
+    // S16_BITS[k][j]: bits given to the j-th integer under layout k; each row sums to S16_BITSSIZE
+    private static final int[][] S16_BITS = {
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
+            { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
+            { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
+            { 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
+    private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS); // exclusive upper bound (1 << width) per slot
+
+}
diff --git a/src/main/java/me/lemire/integercompression/Simple9.java b/src/main/java/me/lemire/integercompression/Simple9.java
index 032489d..fd5194d 100644
--- a/src/main/java/me/lemire/integercompression/Simple9.java
+++ b/src/main/java/me/lemire/integercompression/Simple9.java
@@ -20,280 +20,286 @@
 public final class Simple9 implements IntegerCODEC, SkippableIntegerCODEC {
 
 
-	@Override
-	public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
-		int tmpoutpos = outpos.get();
-		int currentPos = inpos.get();
-		final int finalin = currentPos + inlength;
-		outer: while (currentPos < finalin - 28) {
-			mainloop: for (int selector = 0; selector < 8; selector++) {
+    @Override
+    public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
+        int tmpoutpos = outpos.get();
+        int currentPos = inpos.get();
+        final int finalin = currentPos + inlength;
+        outer: while (currentPos < finalin - 28) {
+            mainloop: for (int selector = 0; selector < 8; selector++) {
 
-				int res = 0;
-				int compressedNum = codeNum[selector];
-				int b = bitLength[selector];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (max <= in[currentPos + i])
-						continue mainloop;
-					res = (res << b) + in[currentPos + i];
-				}
-				res |= selector << 28;
-				out[tmpoutpos++] = res;
-				currentPos += compressedNum;
-				continue outer;
-			}
-			final int selector = 8;
-			if (in[currentPos] >= 1 << bitLength[selector])
-				throw new RuntimeException("Too big a number");
-			out[tmpoutpos++] = in[currentPos++] | (selector << 28);
-		}
-		outer: while (currentPos < finalin) {
-			mainloop: for (int selector = 0; selector < 8; selector++) {
-				int res = 0;
-				int compressedNum = codeNum[selector];
-				if (finalin <= currentPos + compressedNum - 1)
-					compressedNum = finalin - currentPos;
-				int b = bitLength[selector];
-				int max = 1 << b;
-				int i = 0;
-				for (; i < compressedNum; i++) {
-					if (max <= in[currentPos + i])
-						continue mainloop;
-					res = (res << b) + in[currentPos + i];
-				}
+                int res = 0;
+                int compressedNum = codeNum[selector];
+                int b = bitLength[selector];
+                int max = 1 << b;
+                int i = 0;
+                for (; i < compressedNum; i++) {
+                    if (max <= in[currentPos + i])
+                        continue mainloop;
+                    res = (res << b) + in[currentPos + i];
+                }
+                res |= selector << 28;
+                out[tmpoutpos++] = res;
+                currentPos += compressedNum;
+                continue outer;
+            }
+            final int selector = 8;
+            if (in[currentPos] >= 1 << bitLength[selector])
+                throw new RuntimeException("Too big a number");
+            out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+        }
+        outer: while (currentPos < finalin) {
+            mainloop: for (int selector = 0; selector < 8; selector++) {
+                int res = 0;
+                int compressedNum = codeNum[selector];
+                if (finalin <= currentPos + compressedNum - 1)
+                    compressedNum = finalin - currentPos;
+                int b = bitLength[selector];
+                int max = 1 << b;
+                int i = 0;
+                for (; i < compressedNum; i++) {
+                    if (max <= in[currentPos + i])
+                        continue mainloop;
+                    res = (res << b) + in[currentPos + i];
+                }
 
-				if (compressedNum != codeNum[selector])
-					res <<= (codeNum[selector] - compressedNum) * b;
-				res |= selector << 28;
-				out[tmpoutpos++] = res;
-				currentPos += compressedNum;
-				continue outer;
-			}
-			final int selector = 8;
-			if (in[currentPos] >= 1 << bitLength[selector])
-				throw new RuntimeException("Too big a number");
-			out[tmpoutpos++] = in[currentPos++] | (selector << 28);
-		}
-		inpos.set(currentPos);
-		outpos.set(tmpoutpos);
-	}
+                if (compressedNum != codeNum[selector])
+                    res <<= (codeNum[selector] - compressedNum) * b;
+                res |= selector << 28;
+                out[tmpoutpos++] = res;
+                currentPos += compressedNum;
+                continue outer;
+            }
+            final int selector = 8;
+            if (in[currentPos] >= 1 << bitLength[selector])
+                throw new RuntimeException("Too big a number");
+            out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+        }
+        inpos.set(currentPos);
+        outpos.set(tmpoutpos);
+    }
 
-	@Override
-	public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos,
-			int outlength) {
-		int currentPos = outpos.get();
-		int tmpinpos = inpos.get();
-		final int finalout = currentPos + outlength;
-		while (currentPos < finalout - 28) {
-			int val = in[tmpinpos++];
-			int header = val >>> 28;
-			switch (header) {
-			case 0: { // number : 28, bitwidth : 1
-				out[currentPos++] = (val << 4) >>> 31;
-				out[currentPos++] = (val << 5) >>> 31;
-				out[currentPos++] = (val << 6) >>> 31;
-				out[currentPos++] = (val << 7) >>> 31;
-				out[currentPos++] = (val << 8) >>> 31;
-				out[currentPos++] = (val << 9) >>> 31;
-				out[currentPos++] = (val << 10) >>> 31;
-				out[currentPos++] = (val << 11) >>> 31;
-				out[currentPos++] = (val << 12) >>> 31;
-				out[currentPos++] = (val << 13) >>> 31; // 10
-				out[currentPos++] = (val << 14) >>> 31;
-				out[currentPos++] = (val << 15) >>> 31;
-				out[currentPos++] = (val << 16) >>> 31;
-				out[currentPos++] = (val << 17) >>> 31;
-				out[currentPos++] = (val << 18) >>> 31;
-				out[currentPos++] = (val << 19) >>> 31;
-				out[currentPos++] = (val << 20) >>> 31;
-				out[currentPos++] = (val << 21) >>> 31;
-				out[currentPos++] = (val << 22) >>> 31;
-				out[currentPos++] = (val << 23) >>> 31; // 20
-				out[currentPos++] = (val << 24) >>> 31;
-				out[currentPos++] = (val << 25) >>> 31;
-				out[currentPos++] = (val << 26) >>> 31;
-				out[currentPos++] = (val << 27) >>> 31;
-				out[currentPos++] = (val << 28) >>> 31;
-				out[currentPos++] = (val << 29) >>> 31;
-				out[currentPos++] = (val << 30) >>> 31;
-				out[currentPos++] = (val << 31) >>> 31;
-				break;
-			}
-			case 1: { // number : 14, bitwidth : 2
-				out[currentPos++] = (val << 4) >>> 30;
-				out[currentPos++] = (val << 6) >>> 30;
-				out[currentPos++] = (val << 8) >>> 30;
-				out[currentPos++] = (val << 10) >>> 30;
-				out[currentPos++] = (val << 12) >>> 30;
-				out[currentPos++] = (val << 14) >>> 30;
-				out[currentPos++] = (val << 16) >>> 30;
-				out[currentPos++] = (val << 18) >>> 30;
-				out[currentPos++] = (val << 20) >>> 30;
-				out[currentPos++] = (val << 22) >>> 30; // 10
-				out[currentPos++] = (val << 24) >>> 30;
-				out[currentPos++] = (val << 26) >>> 30;
-				out[currentPos++] = (val << 28) >>> 30;
-				out[currentPos++] = (val << 30) >>> 30;
-				break;
-			}
-			case 2: { // number : 9, bitwidth : 3
-				out[currentPos++] = (val << 5) >>> 29;
-				out[currentPos++] = (val << 8) >>> 29;
-				out[currentPos++] = (val << 11) >>> 29;
-				out[currentPos++] = (val << 14) >>> 29;
-				out[currentPos++] = (val << 17) >>> 29;
-				out[currentPos++] = (val << 20) >>> 29;
-				out[currentPos++] = (val << 23) >>> 29;
-				out[currentPos++] = (val << 26) >>> 29;
-				out[currentPos++] = (val << 29) >>> 29;
-				break;
-			}
-			case 3: { // number : 7, bitwidth : 4
-				out[currentPos++] = (val << 4) >>> 28;
-				out[currentPos++] = (val << 8) >>> 28;
-				out[currentPos++] = (val << 12) >>> 28;
-				out[currentPos++] = (val << 16) >>> 28;
-				out[currentPos++] = (val << 20) >>> 28;
-				out[currentPos++] = (val << 24) >>> 28;
-				out[currentPos++] = (val << 28) >>> 28;
-				break;
-			}
-			case 4: { // number : 5, bitwidth : 5
-				out[currentPos++] = (val << 7) >>> 27;
-				out[currentPos++] = (val << 12) >>> 27;
-				out[currentPos++] = (val << 17) >>> 27;
-				out[currentPos++] = (val << 22) >>> 27;
-				out[currentPos++] = (val << 27) >>> 27;
-				break;
-			}
-			case 5: { // number : 4, bitwidth : 7
-				out[currentPos++] = (val << 4) >>> 25;
-				out[currentPos++] = (val << 11) >>> 25;
-				out[currentPos++] = (val << 18) >>> 25;
-				out[currentPos++] = (val << 25) >>> 25;
-				break;
-			}
-			case 6: { // number : 3, bitwidth : 9
-				out[currentPos++] = (val << 5) >>> 23;
-				out[currentPos++] = (val << 14) >>> 23;
-				out[currentPos++] = (val << 23) >>> 23;
-				break;
-			}
-			case 7: { // number : 2, bitwidth : 14
-				out[currentPos++] = (val << 4) >>> 18;
-				out[currentPos++] = (val << 18) >>> 18;
-				break;
-			}
-			case 8: { // number : 1, bitwidth : 28
-				out[currentPos++] = (val << 4) >>> 4;
-				break;
-			}
-			default: {
-				throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
-			}
-			}
-		}
-		while (currentPos < finalout) {
-			int val = in[tmpinpos++];
-			int header = val >>> 28;
-			switch (header) {
-			case 0: { // number : 28, bitwidth : 1
-				final int howmany = finalout - currentPos;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (k + 4)) >>> 31;
-				}
-				break;
-			}
-			case 1: { // number : 14, bitwidth : 2
-				final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (2 * k + 4)) >>> 30;
-				}
-				break;
-			}
-			case 2: { // number : 9, bitwidth : 3
-				final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (3 * k + 5)) >>> 29;
-				}
-				break;
-			}
-			case 3: { // number : 7, bitwidth : 4
-				final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (4 * k + 4)) >>> 28;
-				}
-				break;
-			}
-			case 4: { // number : 5, bitwidth : 5
-				final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (5 * k + 7)) >>> 27;
-				}
-				break;
-			}
-			case 5: { // number : 4, bitwidth : 7
-				final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (7 * k + 4)) >>> 25;
-				}
-				break;
-			}
-			case 6: { // number : 3, bitwidth : 9
-				final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (9 * k + 5)) >>> 23;
-				}
-				break;
-			}
-			case 7: { // number : 2, bitwidth : 14
-				final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
-				for (int k = 0; k < howmany; ++k) {
-					out[currentPos++] = (val << (14 * k + 4)) >>> 18;
-				}
-				break;
-			}
-			case 8: { // number : 1, bitwidth : 28
-				out[currentPos++] = (val << 4) >>> 4;
-				break;
-			}
-			default: {
-				throw new RuntimeException("shouldn't happen");
-			}
-			}
-		}
-		outpos.set(currentPos);
-		inpos.set(tmpinpos);
+    @Override
+    public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos,
+            int outlength) {
+        int currentPos = outpos.get();
+        int tmpinpos = inpos.get();
+        final int finalout = currentPos + outlength;
+        while (currentPos < finalout - 28) {
+            int val = in[tmpinpos++];
+            int header = val >>> 28;
+            switch (header) {
+            case 0: { // number : 28, bitwidth : 1
+                out[currentPos++] = (val << 4) >>> 31;
+                out[currentPos++] = (val << 5) >>> 31;
+                out[currentPos++] = (val << 6) >>> 31;
+                out[currentPos++] = (val << 7) >>> 31;
+                out[currentPos++] = (val << 8) >>> 31;
+                out[currentPos++] = (val << 9) >>> 31;
+                out[currentPos++] = (val << 10) >>> 31;
+                out[currentPos++] = (val << 11) >>> 31;
+                out[currentPos++] = (val << 12) >>> 31;
+                out[currentPos++] = (val << 13) >>> 31; // 10
+                out[currentPos++] = (val << 14) >>> 31;
+                out[currentPos++] = (val << 15) >>> 31;
+                out[currentPos++] = (val << 16) >>> 31;
+                out[currentPos++] = (val << 17) >>> 31;
+                out[currentPos++] = (val << 18) >>> 31;
+                out[currentPos++] = (val << 19) >>> 31;
+                out[currentPos++] = (val << 20) >>> 31;
+                out[currentPos++] = (val << 21) >>> 31;
+                out[currentPos++] = (val << 22) >>> 31;
+                out[currentPos++] = (val << 23) >>> 31; // 20
+                out[currentPos++] = (val << 24) >>> 31;
+                out[currentPos++] = (val << 25) >>> 31;
+                out[currentPos++] = (val << 26) >>> 31;
+                out[currentPos++] = (val << 27) >>> 31;
+                out[currentPos++] = (val << 28) >>> 31;
+                out[currentPos++] = (val << 29) >>> 31;
+                out[currentPos++] = (val << 30) >>> 31;
+                out[currentPos++] = (val << 31) >>> 31;
+                break;
+            }
+            case 1: { // number : 14, bitwidth : 2
+                out[currentPos++] = (val << 4) >>> 30;
+                out[currentPos++] = (val << 6) >>> 30;
+                out[currentPos++] = (val << 8) >>> 30;
+                out[currentPos++] = (val << 10) >>> 30;
+                out[currentPos++] = (val << 12) >>> 30;
+                out[currentPos++] = (val << 14) >>> 30;
+                out[currentPos++] = (val << 16) >>> 30;
+                out[currentPos++] = (val << 18) >>> 30;
+                out[currentPos++] = (val << 20) >>> 30;
+                out[currentPos++] = (val << 22) >>> 30; // 10
+                out[currentPos++] = (val << 24) >>> 30;
+                out[currentPos++] = (val << 26) >>> 30;
+                out[currentPos++] = (val << 28) >>> 30;
+                out[currentPos++] = (val << 30) >>> 30;
+                break;
+            }
+            case 2: { // number : 9, bitwidth : 3
+                out[currentPos++] = (val << 5) >>> 29;
+                out[currentPos++] = (val << 8) >>> 29;
+                out[currentPos++] = (val << 11) >>> 29;
+                out[currentPos++] = (val << 14) >>> 29;
+                out[currentPos++] = (val << 17) >>> 29;
+                out[currentPos++] = (val << 20) >>> 29;
+                out[currentPos++] = (val << 23) >>> 29;
+                out[currentPos++] = (val << 26) >>> 29;
+                out[currentPos++] = (val << 29) >>> 29;
+                break;
+            }
+            case 3: { // number : 7, bitwidth : 4
+                out[currentPos++] = (val << 4) >>> 28;
+                out[currentPos++] = (val << 8) >>> 28;
+                out[currentPos++] = (val << 12) >>> 28;
+                out[currentPos++] = (val << 16) >>> 28;
+                out[currentPos++] = (val << 20) >>> 28;
+                out[currentPos++] = (val << 24) >>> 28;
+                out[currentPos++] = (val << 28) >>> 28;
+                break;
+            }
+            case 4: { // number : 5, bitwidth : 5
+                out[currentPos++] = (val << 7) >>> 27;
+                out[currentPos++] = (val << 12) >>> 27;
+                out[currentPos++] = (val << 17) >>> 27;
+                out[currentPos++] = (val << 22) >>> 27;
+                out[currentPos++] = (val << 27) >>> 27;
+                break;
+            }
+            case 5: { // number : 4, bitwidth : 7
+                out[currentPos++] = (val << 4) >>> 25;
+                out[currentPos++] = (val << 11) >>> 25;
+                out[currentPos++] = (val << 18) >>> 25;
+                out[currentPos++] = (val << 25) >>> 25;
+                break;
+            }
+            case 6: { // number : 3, bitwidth : 9
+                out[currentPos++] = (val << 5) >>> 23;
+                out[currentPos++] = (val << 14) >>> 23;
+                out[currentPos++] = (val << 23) >>> 23;
+                break;
+            }
+            case 7: { // number : 2, bitwidth : 14
+                out[currentPos++] = (val << 4) >>> 18;
+                out[currentPos++] = (val << 18) >>> 18;
+                break;
+            }
+            case 8: { // number : 1, bitwidth : 28
+                out[currentPos++] = (val << 4) >>> 4;
+                break;
+            }
+            default: {
+                throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
+            }
+            }
+        }
+        while (currentPos < finalout) {
+            int val = in[tmpinpos++];
+            int header = val >>> 28;
+            switch (header) {
+            case 0: { // number : 28, bitwidth : 1
+                final int howmany = finalout - currentPos;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (k + 4)) >>> 31;
+                }
+                break;
+            }
+            case 1: { // number : 14, bitwidth : 2
+                final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+                }
+                break;
+            }
+            case 2: { // number : 9, bitwidth : 3
+                final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+                }
+                break;
+            }
+            case 3: { // number : 7, bitwidth : 4
+                final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+                }
+                break;
+            }
+            case 4: { // number : 5, bitwidth : 5
+                final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+                }
+                break;
+            }
+            case 5: { // number : 4, bitwidth : 7
+                final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+                }
+                break;
+            }
+            case 6: { // number : 3, bitwidth : 9
+                final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+                }
+                break;
+            }
+            case 7: { // number : 2, bitwidth : 14
+                final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
+                for (int k = 0; k < howmany; ++k) {
+                    out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+                }
+                break;
+            }
+            case 8: { // number : 1, bitwidth : 28
+                out[currentPos++] = (val << 4) >>> 4;
+                break;
+            }
+            default: {
+                throw new RuntimeException("shouldn't happen");
+            }
+            }
+        }
+        outpos.set(currentPos);
+        inpos.set(tmpinpos);
 
-	}
+    }
 
-	@Override
-	public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		out[outpos.get()] = inlength;
-		outpos.increment();
-		headlessCompress(in, inpos, inlength, out, outpos);
-	}
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        compressedPositions.add(inlength);
+        return inlength;
+    }
 
-	@Override
-	public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
-		if (inlength == 0)
-			return;
-		final int outlength = in[inpos.get()];
-		inpos.increment();
-		headlessUncompress(in, inpos, inlength, out, outpos, outlength);
+    @Override
+    public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        if (inlength == 0)
+            return;
+        out[outpos.get()] = inlength;
+        outpos.increment();
+        headlessCompress(in, inpos, inlength, out, outpos);
+    }
 
-	}
+    @Override
+    public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+        if (inlength == 0)
+            return;
+        final int outlength = in[inpos.get()];
+        inpos.increment();
+        headlessUncompress(in, inpos, inlength, out, outpos, outlength);
 
-	private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+    }
 
-	private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+    private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
 
-	@Override
-	public String toString() {
-		return this.getClass().getSimpleName();
-	}
+    private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+
+    @Override
+    public String toString() {
+        return this.getClass().getSimpleName();
+    }
 
 }
diff --git a/src/main/java/me/lemire/integercompression/SkippableComposition.java b/src/main/java/me/lemire/integercompression/SkippableComposition.java
index a235c47..fc3c18e 100644
--- a/src/main/java/me/lemire/integercompression/SkippableComposition.java
+++ b/src/main/java/me/lemire/integercompression/SkippableComposition.java
@@ -52,15 +52,27 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
     public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
             IntWrapper outpos, int num) {
         int init = inpos.get();
+        int outposInit = outpos.get();
+
         F1.headlessUncompress(in, inpos, inlength, out, outpos, num);
         if (inpos.get() == init) {
-        	  inpos.increment();
+              inpos.increment();
         }
         inlength -= inpos.get() - init;
-        num -= outpos.get();
+        num -= outpos.get() - outposInit;
         F2.headlessUncompress(in, inpos, inlength, out, outpos, num);
     }
 
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        int init = compressedPositions.get();
+        int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+        maxLength += 1; // Add +1 for the potential F2 header. Question: is this header actually needed in the headless version?
+        inlength -= compressedPositions.get() - init;
+        maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength);
+        return maxLength;
+    }
+
     @Override
     public String toString() {
         return F1.toString() + "+" + F2.toString();
diff --git a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
index c10d2f0..b9bdc04 100644
--- a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
@@ -10,10 +10,11 @@
 
 /**
  * Interface describing a standard CODEC to compress integers. This is a
- * variation on the IntegerCODEC interface meant to be used for random access.
+ * variation on the IntegerCODEC interface meant to be used for random access
+ * (i.e., given a large array, you can segment it and decode just the subarray you need).
  * 
- * The main difference is that we must specify the number of integers we wish to
- * decode. This information should be stored elsewhere.
+ * The main difference is that you must specify the number of integers you wish to
+ * uncompress. This information should be stored elsewhere.
  * 
  * This interface was designed by the Terrier team for their search engine.
  * 
@@ -25,14 +26,17 @@ public interface SkippableIntegerCODEC {
      * Compress data from an array to another array.
      * 
      * Both inpos and outpos are modified to represent how much data was read
-     * and written to if 12 ints (inlength = 12) are compressed to 3 ints, then
-     * inpos will be incremented by 12 while outpos will be incremented by 3 we
+     * and written to. If 12 ints (inlength = 12) are compressed to 3 ints, then
+     * inpos will be incremented by 12 while outpos will be incremented by 3. We
      * use IntWrapper to pass the values by reference.
      * 
+     * Implementation note: contrary to {@link IntegerCODEC#compress},
+     * this may skip writing information about the number of encoded integers.
+     * 
      * @param in
      *            input array
      * @param inpos
-     *            location in the input array
+     *            where to start reading in the array
      * @param inlength
      *            how many integers to compress
      * @param out
@@ -56,13 +60,30 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
      * @param inlength
      *            length of the compressed data (ignored by some schemes)
      * @param out
-     *            array where to write the compressed output
+     *            array where to write the uncompressed output
      * @param outpos
-     *            where to write the compressed output in out
+     *            where to start writing the uncompressed output in out
      * @param num
-     *            number of integers we want to decode, the actual number of integers decoded can be less
+     *            number of integers we want to decode. May be less than the actual number of compressed integers
      */
     public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
             IntWrapper outpos, int num);
 
+    /**
+     * Compute the maximum number of integers that might be required to store
+     * the compressed form of a given input array segment, without headers.
+     * <p>
+     * This is useful to pre-allocate the output buffer before calling
+     * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper)}.
+     * </p>
+     *
+     * @param compressedPositions
+     *        since not all schemes compress every input integer, this wrapper
+     *        is advanced by the number of input integers that will actually be compressed.
+     *        This is useful when composing multiple schemes.
+     * @param inlength
+     *            number of integers to be compressed
+     * @return the maximum number of integers needed in the output array
+     */
+    int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
 }
diff --git a/src/main/java/me/lemire/integercompression/UncompressibleInputException.java b/src/main/java/me/lemire/integercompression/UncompressibleInputException.java
deleted file mode 100644
index c490946..0000000
--- a/src/main/java/me/lemire/integercompression/UncompressibleInputException.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package me.lemire.integercompression;
-
-/**
- * This exception might be thrown if the input is poorly compressible.
- *
- */
-public class UncompressibleInputException extends RuntimeException {
-
-	/**
-	 * Create new exception
-	 * @param string explanation for the exception
-	 */
-	public UncompressibleInputException(String string) {
-		super(string);
-	}
-
-	private static final long serialVersionUID = -798583799846489873L;
-
-}
diff --git a/src/main/java/me/lemire/integercompression/Util.java b/src/main/java/me/lemire/integercompression/Util.java
index 346e3b2..63fc918 100644
--- a/src/main/java/me/lemire/integercompression/Util.java
+++ b/src/main/java/me/lemire/integercompression/Util.java
@@ -15,13 +15,13 @@
 public final class Util {
    
 
-	
-	// check whether x is small than y as unsigned ints (supported by Java 8 natively);
-	protected static final boolean smallerorequalthan(int x, int y) {
-		return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE);
-	}
-	
-	/**
+    
+    // check whether x is smaller than or equal to y when both are treated as unsigned ints (supported by Java 8 natively)
+    protected static final boolean smallerorequalthan(int x, int y) {
+        return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE);
+    }
+    
+    /**
      * Compute the maximum of the integer logarithms (ceil(log(x+1)) of a range
      * of value
      * 
diff --git a/src/main/java/me/lemire/integercompression/VariableByte.java b/src/main/java/me/lemire/integercompression/VariableByte.java
index 5b25c43..c9b04d0 100644
--- a/src/main/java/me/lemire/integercompression/VariableByte.java
+++ b/src/main/java/me/lemire/integercompression/VariableByte.java
@@ -21,6 +21,8 @@
  */
 public class VariableByte implements IntegerCODEC, ByteIntegerCODEC, SkippableIntegerCODEC {
 
+    private static final int MAX_BYTES_PER_INT = 5;
+
     private static byte extract7bits(int i, long val) {
         return (byte) ((val >> (7 * i)) & ((1 << 7) - 1));
     }
@@ -122,8 +124,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
         for (int v = 0, shift = 0; p < finalp;) {
             val = in[p];
             int c = (byte) (val >>> s);
+            // Shift to next byte
             s += 8;
+            // Shift to next integer if s==32
             p += s>>5;
+            // cycle from 31 to 0
             s = s & 31;
             v += ((c & 127) << shift);
             if ((c & 128) == 128) {
@@ -187,8 +192,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
         for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
             val = in[p];
             int c = val >>> s;
+            // Shift to next byte
             s += 8;
+            // Shift to next integer if s==32
             p += s>>5;
+            // cycle from 31 to 0
             s = s & 31;
             v += ((c & 127) << shift);
             if ((c & 128) == 128) {
@@ -202,12 +210,34 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
         inpos.set(p + (s!=0 ? 1 : 0));
     }
 
+    /**
+     * Upper bound on the number of ints needed to hold the headless
+     * compressed form of {@code inlength} integers: each integer takes at most
+     * {@code MAX_BYTES_PER_INT} bytes, and the byte total is rounded up to whole ints.
+     *
+     * @param compressedPositions advanced by {@code inlength}
+     * @param inlength number of integers to be compressed
+     * @return the maximum number of ints needed in the output array
+     * @throws ArithmeticException if the bound does not fit in an int
+     */
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        // Widen to long before multiplying: inlength * MAX_BYTES_PER_INT overflows int for large inlength.
+        long maxLengthInBytes = (long) inlength * MAX_BYTES_PER_INT;
+        long maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES;
+        compressedPositions.add(inlength);
+        return Math.toIntExact(maxLengthInInts);
+    }
+
     /**
      * Creates a new buffer of the requested size.
      *
      * In case you need a different way to allocate buffers, you can override this method
      * with a custom behavior. The default implementation allocates a new Java direct
      * {@link ByteBuffer} on each invocation.
+     *
+     * @param sizeInBytes capacity of the buffer to allocate, in bytes
+     * @return the newly allocated buffer
      */
     protected ByteBuffer makeBuffer(int sizeInBytes) {
         return ByteBuffer.allocateDirect(sizeInBytes);
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
index c5fee69..ef4a386 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
@@ -308,10 +308,10 @@ private static void testByteCodec(PrintWriter csvLog, int sparsity,
         public static void main(String args[]) throws FileNotFoundException  {
                 System.out
                         .println("# benchmark based on the ClusterData model from:");
-                System.out.println("# 	 Vo Ngoc Anh and Alistair Moffat. ");
-                System.out.println("#	 Index compression using 64-bit words.");
+                System.out.println("#      Vo Ngoc Anh and Alistair Moffat. ");
+                System.out.println("#     Index compression using 64-bit words.");
                 System.out
-                        .println("# 	 Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+                        .println("#      Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
                 System.out.println();
 
                 PrintWriter writer = null;
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
index 58bbc4a..b930568 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
@@ -83,7 +83,6 @@ private static int decompressFromSkipTable(Object c, int[] compressed,
             if (num > length - uncomppos.get())
                 num = length - uncomppos.get();
             int location = metadata[metapos++];
-            // System.out.println("location = "+location);
             int initvalue = metadata[metapos++];
             int outputlocation = uncomppos.get();
             if (location != compressedpos.get())
@@ -242,10 +241,10 @@ private static void testCodec(PrintWriter csvLog, int sparsity, Object c,
      */
     public static void main(String args[]) throws FileNotFoundException {
         System.out.println("# benchmark based on the ClusterData model from:");
-        System.out.println("# 	 Vo Ngoc Anh and Alistair Moffat. ");
-        System.out.println("#	 Index compression using 64-bit words.");
+        System.out.println("#      Vo Ngoc Anh and Alistair Moffat. ");
+        System.out.println("#     Index compression using 64-bit words.");
         System.out
-                .println("# 	 Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+                .println("#      Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
         System.out.println();
 
         PrintWriter writer = null;
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
index 7e1c161..f50a367 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
@@ -49,7 +49,8 @@
 public class IntegratedBinaryPacking implements IntegratedIntegerCODEC,
         SkippableIntegratedIntegerCODEC {
 
-    static final int BLOCK_SIZE = 32;
+    public static final int BLOCK_SIZE = 32;
+    private static final int MAX_BIT_WIDTH = Integer.SIZE;
 
     @Override
     public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
@@ -170,4 +171,13 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
         initvalue.set(initoffset);
         inpos.set(tmpinpos);
     }
+
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int fullBlocks = inlength / BLOCK_SIZE; // a trailing partial block is not counted
+        compressedPositions.add(fullBlocks * BLOCK_SIZE); // report only the inputs covered by whole blocks
+        final int payloadInts = fullBlocks * MAX_BIT_WIDTH; // worst case: each value of a block needs all 32 bits
+        final int headerInts = fullBlocks / Integer.BYTES + (fullBlocks % Integer.BYTES);
+        return headerInts + payloadInts;
+    }
 }
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
index 5808bdd..1d935c4 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
@@ -3,7 +3,6 @@
 import java.util.Arrays;
 
 import me.lemire.integercompression.IntWrapper;
-import me.lemire.integercompression.UncompressibleInputException;
 
 /**
  * This is a convenience class that wraps a codec to provide
@@ -36,19 +35,14 @@ public IntegratedIntCompressor() {
      * 
      * @param input array to be compressed
      * @return compressed array
-     * @throws UncompressibleInputException if the data is too poorly compressible
      */
     public  int[] compress(int[] input) {
-        int [] compressed = new int[input.length + input.length / 100 + 1024];
+        int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length);
+        int [] compressed = new int[maxCompressedLength + 1]; // +1 to store the length of the input
         compressed[0] = input.length;
         IntWrapper outpos = new IntWrapper(1);
         IntWrapper initvalue = new IntWrapper(0);
-		try {
-			codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos, initvalue);
-		} catch (IndexOutOfBoundsException ioebe) {
-			throw new UncompressibleInputException(
-					"Your input is too poorly compressible with the current codec : " + codec);
-		}
+        codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos, initvalue);
         compressed = Arrays.copyOf(compressed,outpos.intValue());
         return compressed;
     }
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
index 918a900..a577031 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
@@ -24,6 +24,8 @@
 public class IntegratedVariableByte implements IntegratedIntegerCODEC, IntegratedByteIntegerCODEC,
 SkippableIntegratedIntegerCODEC  {
 
+    private static final int MAX_BYTES_PER_INT = 5;
+
     private static byte extract7bits(int i, long val) {
         return (byte)((val >> (7 * i)) & ((1 << 7) - 1));
     }
@@ -257,6 +259,14 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
         inpos.set(p + (s!=0 ? 1 : 0));        
     }
 
+    @Override // worst-case output size, in ints, when every input needs MAX_BYTES_PER_INT bytes
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        long maxLengthInBytes = (long) inlength * MAX_BYTES_PER_INT; // long: the int product wraps for large inlength
+        int maxLengthInInts = Math.toIntExact((maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES); // round up to whole ints; throws ArithmeticException instead of wrapping
+        compressedPositions.add(inlength); // every input integer is accounted for by this codec
+        return maxLengthInInts;
+    }
+
     /**
      * Creates a new buffer of the requested size.
      *
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
index 09c4dd8..4786ec5 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
@@ -66,14 +66,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
         if (inlength == 0)
             return;
         int init = inpos.get();
+        int outposInit = outpos.get();
+
         F1.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
         if (inpos.get() == init) {
-      	  inpos.increment();
+            inpos.increment();
         }
         inlength -= inpos.get() - init;
 
-        num -= outpos.get();
+        num -= outpos.get() - outposInit;
         F2.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
     }
 
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int startPosition = compressedPositions.get();
+        final int firstStageLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+        // F1 advanced compressedPositions by what it covers; F2 is sized for the remainder only.
+        final int remaining = inlength - (compressedPositions.get() - startPosition);
+        final int secondStageLength = F2.maxHeadlessCompressedLength(compressedPositions, remaining);
+        return firstStageLength + 1 + secondStageLength; // +1 for the potential F2 header. Question: is this header actually needed in the headless version?
+    }
 }
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
index 8b7fd4b..e2df754 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
@@ -71,4 +71,21 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
     public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
             IntWrapper outpos, int num, IntWrapper initvalue);
 
+    /**
+     * Compute an upper bound on the number of integers required to store the
+     * compressed form of a given input array segment, without headers.
+     * <p>
+     * This is useful to pre-allocate the output buffer before calling
+     * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper, IntWrapper)}.
+     * </p>
+     *
+     * @param compressedPositions
+     *            in/out counter: since not all schemes compress every input
+     *            integer, it is advanced by the number of input integers this
+     *            scheme will actually compress (used when composing schemes)
+     * @param inlength
+     *            number of integers to be compressed
+     * @return the maximum number of integers needed in the output array
+     */
+    int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
 }
diff --git a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
index bbd386a..a50497c 100644
--- a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
+++ b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
@@ -42,7 +42,7 @@ int[] generateUniformHash(int N, int Max) {
                 int[] ans = new int[N];
                 HashSet<Integer> s = new HashSet<Integer>();
                 while (s.size() < N)
-                        s.add(new Integer(this.rand.nextInt(Max)));
+                        s.add(this.rand.nextInt(Max));
                 Iterator<Integer> i = s.iterator();
                 for (int k = 0; k < N; ++k)
                         ans[k] = i.next().intValue();
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
new file mode 100644
index 0000000..9b2e1ca
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
@@ -0,0 +1,12790 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * Vectorized bitpacking routines. This class is a version of the
+ * VectorBitPackerTerse class that uses fewer branch instructions.
+ *
+ * The code is machine generated from VectorBitPackerTerse.java using helper
+ * classes.
+ *
+ */
+public class VectorBitPacker {
+  private static final VectorSpecies<Integer> SPECIES_512 =
+      IntVector.SPECIES_512;
+  private static final VectorSpecies<Integer> SPECIES_256 =
+      IntVector.SPECIES_256;
+  private static final int VLEN_512 = 16; // int lanes in a SPECIES_512 vector
+  private static final int VLEN_256 = 8; // int lanes in a SPECIES_256 vector
+  private static final int BLOCK_SIZE = 256; // integers processed per fastpack/fastunpack call
+
+  private static final IntVector MASK_1 = // MASK_b: low b bits set in every lane; odd b on SPECIES_256, even b on SPECIES_512
+      IntVector.broadcast(SPECIES_256, (1 << 1) - 1);
+  private static final IntVector MASK_2 =
+      IntVector.broadcast(SPECIES_512, (1 << 2) - 1);
+  private static final IntVector MASK_3 =
+      IntVector.broadcast(SPECIES_256, (1 << 3) - 1);
+  private static final IntVector MASK_4 =
+      IntVector.broadcast(SPECIES_512, (1 << 4) - 1);
+  private static final IntVector MASK_5 =
+      IntVector.broadcast(SPECIES_256, (1 << 5) - 1);
+  private static final IntVector MASK_6 =
+      IntVector.broadcast(SPECIES_512, (1 << 6) - 1);
+  private static final IntVector MASK_7 =
+      IntVector.broadcast(SPECIES_256, (1 << 7) - 1);
+  private static final IntVector MASK_8 =
+      IntVector.broadcast(SPECIES_512, (1 << 8) - 1);
+  private static final IntVector MASK_9 =
+      IntVector.broadcast(SPECIES_256, (1 << 9) - 1);
+  private static final IntVector MASK_10 =
+      IntVector.broadcast(SPECIES_512, (1 << 10) - 1);
+  private static final IntVector MASK_11 =
+      IntVector.broadcast(SPECIES_256, (1 << 11) - 1);
+  private static final IntVector MASK_12 =
+      IntVector.broadcast(SPECIES_512, (1 << 12) - 1);
+  private static final IntVector MASK_13 =
+      IntVector.broadcast(SPECIES_256, (1 << 13) - 1);
+  private static final IntVector MASK_14 =
+      IntVector.broadcast(SPECIES_512, (1 << 14) - 1);
+  private static final IntVector MASK_15 =
+      IntVector.broadcast(SPECIES_256, (1 << 15) - 1);
+  private static final IntVector MASK_16 =
+      IntVector.broadcast(SPECIES_512, (1 << 16) - 1);
+  private static final IntVector MASK_17 =
+      IntVector.broadcast(SPECIES_256, (1 << 17) - 1);
+  private static final IntVector MASK_18 =
+      IntVector.broadcast(SPECIES_512, (1 << 18) - 1);
+  private static final IntVector MASK_19 =
+      IntVector.broadcast(SPECIES_256, (1 << 19) - 1);
+  private static final IntVector MASK_20 =
+      IntVector.broadcast(SPECIES_512, (1 << 20) - 1);
+  private static final IntVector MASK_21 =
+      IntVector.broadcast(SPECIES_256, (1 << 21) - 1);
+  private static final IntVector MASK_22 =
+      IntVector.broadcast(SPECIES_512, (1 << 22) - 1);
+  private static final IntVector MASK_23 =
+      IntVector.broadcast(SPECIES_256, (1 << 23) - 1);
+  private static final IntVector MASK_24 =
+      IntVector.broadcast(SPECIES_512, (1 << 24) - 1);
+  private static final IntVector MASK_25 =
+      IntVector.broadcast(SPECIES_256, (1 << 25) - 1);
+  private static final IntVector MASK_26 =
+      IntVector.broadcast(SPECIES_512, (1 << 26) - 1);
+  private static final IntVector MASK_27 =
+      IntVector.broadcast(SPECIES_256, (1 << 27) - 1);
+  private static final IntVector MASK_28 =
+      IntVector.broadcast(SPECIES_512, (1 << 28) - 1);
+  private static final IntVector MASK_29 =
+      IntVector.broadcast(SPECIES_256, (1 << 29) - 1);
+  private static final IntVector MASK_30 =
+      IntVector.broadcast(SPECIES_512, (1 << 30) - 1);
+  private static final IntVector MASK_31 =
+      IntVector.broadcast(SPECIES_256, (1 << 31) - 1); // int arithmetic wraps: the value is 0x7FFFFFFF
+
+  /**
+   * Pack one block of 256 integers (BLOCK_SIZE), dispatching on the bit width
+   *
+   * @param in
+   *                source array
+   * @param inpos
+   *                position in source array
+   * @param out
+   *                output array
+   * @param outpos
+   *                position in output array
+   * @param b
+   *                number of bits to use per integer (0 to 32; other values do nothing)
+   */
+  public static void fastpack(final int[] in, int inpos, final int[] out,
+                              int outpos, int b) {
+    switch (b) {
+    case 0:
+      break; // zero-width values: nothing is written
+    case 1:
+      fastpack1(in, inpos, out, outpos);
+      break;
+    case 2:
+      fastpack2(in, inpos, out, outpos);
+      break;
+    case 3:
+      fastpack3(in, inpos, out, outpos);
+      break;
+    case 4:
+      fastpack4(in, inpos, out, outpos);
+      break;
+    case 5:
+      fastpack5(in, inpos, out, outpos);
+      break;
+    case 6:
+      fastpack6(in, inpos, out, outpos);
+      break;
+    case 7:
+      fastpack7(in, inpos, out, outpos);
+      break;
+    case 8:
+      fastpack8(in, inpos, out, outpos);
+      break;
+    case 9:
+      fastpack9(in, inpos, out, outpos);
+      break;
+    case 10:
+      fastpack10(in, inpos, out, outpos);
+      break;
+    case 11:
+      fastpack11(in, inpos, out, outpos);
+      break;
+    case 12:
+      fastpack12(in, inpos, out, outpos);
+      break;
+    case 13:
+      fastpack13(in, inpos, out, outpos);
+      break;
+    case 14:
+      fastpack14(in, inpos, out, outpos);
+      break;
+    case 15:
+      fastpack15(in, inpos, out, outpos);
+      break;
+    case 16:
+      fastpack16(in, inpos, out, outpos);
+      break;
+    case 17:
+      fastpack17(in, inpos, out, outpos);
+      break;
+    case 18:
+      fastpack18(in, inpos, out, outpos);
+      break;
+    case 19:
+      fastpack19(in, inpos, out, outpos);
+      break;
+    case 20:
+      fastpack20(in, inpos, out, outpos);
+      break;
+    case 21:
+      fastpack21(in, inpos, out, outpos);
+      break;
+    case 22:
+      fastpack22(in, inpos, out, outpos);
+      break;
+    case 23:
+      fastpack23(in, inpos, out, outpos);
+      break;
+    case 24:
+      fastpack24(in, inpos, out, outpos);
+      break;
+    case 25:
+      fastpack25(in, inpos, out, outpos);
+      break;
+    case 26:
+      fastpack26(in, inpos, out, outpos);
+      break;
+    case 27:
+      fastpack27(in, inpos, out, outpos);
+      break;
+    case 28:
+      fastpack28(in, inpos, out, outpos);
+      break;
+    case 29:
+      fastpack29(in, inpos, out, outpos);
+      break;
+    case 30:
+      fastpack30(in, inpos, out, outpos);
+      break;
+    case 31:
+      fastpack31(in, inpos, out, outpos);
+      break;
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full width: raw copy of the block
+      break;
+    }
+  }
+
+  static void fastpackNoMask(final int[] in, int inpos, final int[] out,
+                             int outpos, int b) { // same dispatch as fastpack; NOTE(review): helper names suggest inputs are not masked to b bits - confirm
+    switch (b) { // values of b outside 0..32 match no case and do nothing
+    case 0:
+      break; // zero-width values: nothing is written
+    case 1:
+      fastpackNoMask1(in, inpos, out, outpos);
+      break;
+    case 2:
+      fastpackNoMask2(in, inpos, out, outpos);
+      break;
+    case 3:
+      fastpackNoMask3(in, inpos, out, outpos);
+      break;
+    case 4:
+      fastpackNoMask4(in, inpos, out, outpos);
+      break;
+    case 5:
+      fastpackNoMask5(in, inpos, out, outpos);
+      break;
+    case 6:
+      fastpackNoMask6(in, inpos, out, outpos);
+      break;
+    case 7:
+      fastpackNoMask7(in, inpos, out, outpos);
+      break;
+    case 8:
+      fastpackNoMask8(in, inpos, out, outpos);
+      break;
+    case 9:
+      fastpackNoMask9(in, inpos, out, outpos);
+      break;
+    case 10:
+      fastpackNoMask10(in, inpos, out, outpos);
+      break;
+    case 11:
+      fastpackNoMask11(in, inpos, out, outpos);
+      break;
+    case 12:
+      fastpackNoMask12(in, inpos, out, outpos);
+      break;
+    case 13:
+      fastpackNoMask13(in, inpos, out, outpos);
+      break;
+    case 14:
+      fastpackNoMask14(in, inpos, out, outpos);
+      break;
+    case 15:
+      fastpackNoMask15(in, inpos, out, outpos);
+      break;
+    case 16:
+      fastpackNoMask16(in, inpos, out, outpos);
+      break;
+    case 17:
+      fastpackNoMask17(in, inpos, out, outpos);
+      break;
+    case 18:
+      fastpackNoMask18(in, inpos, out, outpos);
+      break;
+    case 19:
+      fastpackNoMask19(in, inpos, out, outpos);
+      break;
+    case 20:
+      fastpackNoMask20(in, inpos, out, outpos);
+      break;
+    case 21:
+      fastpackNoMask21(in, inpos, out, outpos);
+      break;
+    case 22:
+      fastpackNoMask22(in, inpos, out, outpos);
+      break;
+    case 23:
+      fastpackNoMask23(in, inpos, out, outpos);
+      break;
+    case 24:
+      fastpackNoMask24(in, inpos, out, outpos);
+      break;
+    case 25:
+      fastpackNoMask25(in, inpos, out, outpos);
+      break;
+    case 26:
+      fastpackNoMask26(in, inpos, out, outpos);
+      break;
+    case 27:
+      fastpackNoMask27(in, inpos, out, outpos);
+      break;
+    case 28:
+      fastpackNoMask28(in, inpos, out, outpos);
+      break;
+    case 29:
+      fastpackNoMask29(in, inpos, out, outpos);
+      break;
+    case 30:
+      fastpackNoMask30(in, inpos, out, outpos);
+      break;
+    case 31:
+      fastpackNoMask31(in, inpos, out, outpos);
+      break;
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full width: raw copy of the block
+      break;
+    }
+  }
+
+  /**
+   * Unpack one block of 256 integers (BLOCK_SIZE), dispatching on the bit width
+   *
+   * @param in
+   *                source array
+   * @param inpos
+   *                position in source array
+   * @param out
+   *                output array
+   * @param outpos
+   *                position in output array
+   * @param b
+   *                number of bits used per integer (0 to 32; other values do nothing)
+   */
+  public static void fastunpack(final int[] in, int inpos, final int[] out,
+                                int outpos, int b) {
+    switch (b) {
+    case 0:
+      Arrays.fill(out, outpos, outpos + BLOCK_SIZE, 0); // zero-width values decode to a block of zeros
+      break;
+    case 1:
+      fastunpack1(in, inpos, out, outpos);
+      break;
+    case 2:
+      fastunpack2(in, inpos, out, outpos);
+      break;
+    case 3:
+      fastunpack3(in, inpos, out, outpos);
+      break;
+    case 4:
+      fastunpack4(in, inpos, out, outpos);
+      break;
+    case 5:
+      fastunpack5(in, inpos, out, outpos);
+      break;
+    case 6:
+      fastunpack6(in, inpos, out, outpos);
+      break;
+    case 7:
+      fastunpack7(in, inpos, out, outpos);
+      break;
+    case 8:
+      fastunpack8(in, inpos, out, outpos);
+      break;
+    case 9:
+      fastunpack9(in, inpos, out, outpos);
+      break;
+    case 10:
+      fastunpack10(in, inpos, out, outpos);
+      break;
+    case 11:
+      fastunpack11(in, inpos, out, outpos);
+      break;
+    case 12:
+      fastunpack12(in, inpos, out, outpos);
+      break;
+    case 13:
+      fastunpack13(in, inpos, out, outpos);
+      break;
+    case 14:
+      fastunpack14(in, inpos, out, outpos);
+      break;
+    case 15:
+      fastunpack15(in, inpos, out, outpos);
+      break;
+    case 16:
+      fastunpack16(in, inpos, out, outpos);
+      break;
+    case 17:
+      fastunpack17(in, inpos, out, outpos);
+      break;
+    case 18:
+      fastunpack18(in, inpos, out, outpos);
+      break;
+    case 19:
+      fastunpack19(in, inpos, out, outpos);
+      break;
+    case 20:
+      fastunpack20(in, inpos, out, outpos);
+      break;
+    case 21:
+      fastunpack21(in, inpos, out, outpos);
+      break;
+    case 22:
+      fastunpack22(in, inpos, out, outpos);
+      break;
+    case 23:
+      fastunpack23(in, inpos, out, outpos);
+      break;
+    case 24:
+      fastunpack24(in, inpos, out, outpos);
+      break;
+    case 25:
+      fastunpack25(in, inpos, out, outpos);
+      break;
+    case 26:
+      fastunpack26(in, inpos, out, outpos);
+      break;
+    case 27:
+      fastunpack27(in, inpos, out, outpos);
+      break;
+    case 28:
+      fastunpack28(in, inpos, out, outpos);
+      break;
+    case 29:
+      fastunpack29(in, inpos, out, outpos);
+      break;
+    case 30:
+      fastunpack30(in, inpos, out, outpos);
+      break;
+    case 31:
+      fastunpack31(in, inpos, out, outpos);
+      break;
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full width: raw copy of the block
+      break;
+    }
+  }
+
+  public static int slowpack(final int[] in, int inpos, int inlen,
+                             final int[] out, int outpos, int b) { // scalar packer: writes inlen values of b bits each, one value at a time
+    if (inlen == 0)
+      return outpos;
+    if (b == 32) {
+      System.arraycopy(in, inpos, out, outpos, inlen); // full width: raw copy
+      return outpos + inlen; // one past the last word written
+    }
+    int mask = (1 << b) - 1;
+    int c = 0; // bits added to the current output word since it was started
+    int l = 0; // bits of a straddling value that still fit in the current word (0 = values end on the word boundary)
+    int r = 0; // low bits of the current word already taken by the spill-over from the previous word
+    int val = 0;
+    for (int i = 0; i < inlen; i++) {
+      val = in[inpos + i] & mask;
+      out[outpos] |= val << (c + r); // OR-accumulates, so bits already present in out[outpos] are kept
+      c += b;
+      l = (32 - r) % b; // NOTE(review): throws ArithmeticException when b == 0
+      if (c + r >= 32) { // the current word is full
+        if (i < inlen - 1 || l != 0)
+          outpos++;
+        r = l == 0 ? 0 : b - l;
+        if (l != 0)
+          out[outpos] = val >> (b - r); // high bits of the value that crossed the word boundary
+        c = 0;
+      }
+    }
+    return outpos; // NOTE(review): appears to be the index of the last word written, not one past it (unlike the b == 32 path) - confirm with callers
+  }
+
+  public static int slowunpack(final int[] in, int inpos, final int[] out,
+                               int outpos, int outlen, int b) { // scalar unpacker: reads outlen values of b bits each; presumably the counterpart of slowpack
+    if (outlen == 0) {
+      return inpos;
+    }
+    if (b == 32) {
+      System.arraycopy(in, inpos, out, outpos, outlen); // full width: raw copy
+      return inpos + outlen;
+    }
+    int mask = (1 << b) - 1;
+    int limit = outpos + outlen; // one past the last output slot to fill
+    int r = 0; // low bits of the current word that belong to a value begun in the previous word
+    int val = 0; // most recently read input word
+    int i = 0; // input words consumed so far
+    for (; outpos < limit; i++) {
+      if (r > 0) // stitch the straddling value: top (b - r) bits of the old word plus low r bits of the new one
+        out[outpos++] =
+            (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask);
+      val = in[inpos + i];
+      int j = 0;
+      int l = 32 - r; // bits of this word available to values that start in it
+      int ll = l % b == 0 ? l : l - b; // NOTE(review): b == 0 makes this modulo throw ArithmeticException
+      while (j < ll && outpos < limit) { // values lying entirely inside this word
+        out[outpos++] = (val >> (j + r)) & mask;
+        j += b;
+      }
+      r = l % b == 0 ? 0 : b - (l % b);
+    }
+    return inpos + i; // one past the last input word read
+  }
+
+  public static int numCompressedInts(int n, int b) { // output ints reported for n values of b bits each
+    final int lanes = (b & 1) == 0 ? VLEN_512 : VLEN_256; // even widths use the 16-lane vector, odd widths the 8-lane one
+    if (n <= lanes)
+      return n; // inputs no longer than one vector are reported as-is
+    final int valuesPerVector = lanes * (32 / b); // whole b-bit values per 32-bit lane, times the lane count
+    final int vectors = (n + valuesPerVector - 1) / valuesPerVector; // ceiling division
+    return vectors * lanes;
+  }
+
+  private static void fastpack1(final int[] in, int inpos, final int[] out,
+                                int outpos) { // b = 1: reads in[inpos .. inpos + 255], writes 8 ints at out[outpos]
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_1); // group 0 occupies bit 0 of each lane
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); // group k (in[inpos + 8k ..]) lands in bit k
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos); // a single 8-lane vector holds all 256 packed bits
+  }
+
+  private static void fastpack2(final int[] in, int inpos, final int[] out,
+                                int outpos) { // b = 2: reads in[inpos .. inpos + 255], writes 16 ints at out[outpos]
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(MASK_2); // group 0 occupies bits 0-1 of each lane
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); // group k (in[inpos + 16k ..]) lands at bit offset 2k
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos); // a single 16-lane vector holds all 512 packed bits
+  }
+
+  private static void fastpack3(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_3);
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 4-bit values with {@code SPECIES_512} vectors.
+   *
+   * <p>Loads 16 vectors from {@code in} at {@code inpos}, {@code inpos + 16},
+   * ..., {@code inpos + 240}. Each is masked with {@code MASK_4} and OR-ed
+   * into an accumulator at lane bit offsets 0, 4, ..., 28. Eight loads fill
+   * one output vector exactly, so no value straddles two outputs. Two vectors
+   * are written to {@code out}, at {@code outpos} and
+   * {@code outpos + VLEN_512}.
+   *
+   * <p>NOTE(review): {@code SPECIES_512}, {@code MASK_4} and {@code VLEN_512}
+   * are declared outside this method; the stride of 16 suggests 16 int lanes
+   * and the shift step suggests a low-4-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack4(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(MASK_4);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The previous word ended on a lane boundary, so the next accumulator
+    // starts directly from the masked load; no zero vector is needed.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(MASK_4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 5-bit values with {@code SPECIES_256} vectors.
+   *
+   * <p>Loads 32 vectors from {@code in} at {@code inpos}, {@code inpos + 8},
+   * ..., {@code inpos + 248}, masks each with {@code MASK_5} and ORs it into
+   * an accumulator at a lane bit offset that advances by 5 per load. A load
+   * that crosses bit 32 is split: the accumulator is stored, and the bits
+   * that did not fit (obtained by a logical right shift) start the next one.
+   * Five vectors are written to {@code out}.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code MASK_5} and {@code VLEN_256}
+   * are declared outside this method; the stride of 8 suggests 8 int lanes
+   * and the shift step suggests a low-5-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack5(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    // Output vector 0.
+    var src = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_5);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+
+    // Output vector 1: carry the bits of the last load above its low 2.
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos + VLEN_256);
+
+    // Output vector 2: carry the bits of the last load above its low 4.
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos + 2 * VLEN_256);
+
+    // Output vector 3: carry the bits of the last load above its low 1.
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos + 3 * VLEN_256);
+
+    // Output vector 4: carry the bits of the last load above its low 3.
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_5);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos + 4 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 6-bit values with {@code SPECIES_512} vectors.
+   *
+   * <p>Loads 16 vectors from {@code in} at {@code inpos}, {@code inpos + 16},
+   * ..., {@code inpos + 240}, masks each with {@code MASK_6} and ORs it into
+   * an accumulator at a lane bit offset that advances by 6 per load. A load
+   * that crosses bit 32 is split: the accumulator is stored, and the bits
+   * that did not fit (obtained by a logical right shift) start the next one.
+   * Three vectors are written to {@code out}.
+   *
+   * <p>NOTE(review): {@code SPECIES_512}, {@code MASK_6} and {@code VLEN_512}
+   * are declared outside this method; the stride of 16 suggests 16 int lanes
+   * and the shift step suggests a low-6-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack6(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    // Output vector 0.
+    var src = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_6);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+
+    // Output vector 1: carry the bits of the last load above its low 2.
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos + VLEN_512);
+
+    // Output vector 2: carry the bits of the last load above its low 4.
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_6);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos + 2 * VLEN_512);
+  }
+
+  /**
+   * Bit-packs 7-bit values with {@code SPECIES_256} vectors.
+   *
+   * <p>Loads 32 vectors from {@code in} at {@code inpos}, {@code inpos + 8},
+   * ..., {@code inpos + 248}, masks each with {@code MASK_7} and ORs it into
+   * an accumulator at a lane bit offset that advances by 7 per load. A load
+   * that crosses bit 32 is split: the accumulator is stored, and the bits
+   * that did not fit (obtained by a logical right shift) start the next one.
+   * Seven vectors are written to {@code out}.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code MASK_7} and {@code VLEN_256}
+   * are declared outside this method; the stride of 8 suggests 8 int lanes
+   * and the shift step suggests a low-7-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack7(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    // Output vector 0.
+    var src = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_7);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+
+    // Output vector 1: carry the bits of the last load above its low 4.
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos + VLEN_256);
+
+    // Output vector 2: carry the bits of the last load above its low 1.
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos + 2 * VLEN_256);
+
+    // Output vector 3: carry the bits of the last load above its low 5.
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos + 3 * VLEN_256);
+
+    // Output vector 4: carry the bits of the last load above its low 2.
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos + 4 * VLEN_256);
+
+    // Output vector 5: carry the bits of the last load above its low 6.
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos + 5 * VLEN_256);
+
+    // Output vector 6: carry the bits of the last load above its low 3.
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_7);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    acc.intoArray(out, outpos + 6 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 8-bit values with {@code SPECIES_512} vectors.
+   *
+   * <p>Loads 16 vectors from {@code in} at {@code inpos}, {@code inpos + 16},
+   * ..., {@code inpos + 240}. Each is masked with {@code MASK_8} and OR-ed
+   * into an accumulator at lane bit offsets 0, 8, 16 and 24. Four loads fill
+   * one output vector exactly, so no value straddles two outputs. Four
+   * vectors are written to {@code out}, {@code VLEN_512} elements apart
+   * starting at {@code outpos}.
+   *
+   * <p>NOTE(review): {@code SPECIES_512}, {@code MASK_8} and {@code VLEN_512}
+   * are declared outside this method; the stride of 16 suggests 16 int lanes
+   * and the shift step suggests a low-8-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack8(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(MASK_8);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Each word ends on a lane boundary, so every new accumulator starts
+    // directly from the masked load; no zero vector is needed.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(MASK_8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(MASK_8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(MASK_8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 9-bit values with {@code SPECIES_256} vectors.
+   *
+   * <p>Loads 32 vectors from {@code in} at {@code inpos}, {@code inpos + 8},
+   * ..., {@code inpos + 248}, masks each with {@code MASK_9} and ORs it into
+   * an accumulator at a lane bit offset that advances by 9 per load. A load
+   * that crosses bit 32 is split: the accumulator is stored, and the bits
+   * that did not fit (obtained by a logical right shift) start the next one.
+   * Nine vectors are written to {@code out}.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code MASK_9} and {@code VLEN_256}
+   * are declared outside this method; the stride of 8 suggests 8 int lanes
+   * and the shift step suggests a low-9-bit mask -- confirm at the
+   * declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack9(final int[] in, int inpos, final int[] out,
+                                int outpos) {
+    // Output vector 0.
+    var src = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_9);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos);
+
+    // Output vector 1: carry the bits of the last load above its low 5.
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos + VLEN_256);
+
+    // Output vector 2: carry the bits of the last load above its low 1.
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos + 2 * VLEN_256);
+
+    // Output vector 3: carry the bits of the last load above its low 6.
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos + 3 * VLEN_256);
+
+    // Output vector 4: carry the bits of the last load above its low 2.
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    acc.intoArray(out, outpos + 4 * VLEN_256);
+
+    // Output vector 5: carry the bits of the last load above its low 7.
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos + 5 * VLEN_256);
+
+    // Output vector 6: carry the bits of the last load above its low 3.
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos + 6 * VLEN_256);
+
+    // Output vector 7: carry the bits of the last load above its low 8.
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos + 7 * VLEN_256);
+
+    // Output vector 8: carry the bits of the last load above its low 4.
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_9);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    acc.intoArray(out, outpos + 8 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 10-bit values with {@code SPECIES_512} vectors.
+   *
+   * <p>Loads 16 vectors from {@code in} at {@code inpos}, {@code inpos + 16},
+   * ..., {@code inpos + 240}, masks each with {@code MASK_10} and ORs it into
+   * an accumulator at a lane bit offset that advances by 10 per load. A load
+   * that crosses bit 32 is split: the accumulator is stored, and the bits
+   * that did not fit (obtained by a logical right shift) start the next one.
+   * Five vectors are written to {@code out}.
+   *
+   * <p>NOTE(review): {@code SPECIES_512}, {@code MASK_10} and
+   * {@code VLEN_512} are declared outside this method; the stride of 16
+   * suggests 16 int lanes and the shift step suggests a low-10-bit mask --
+   * confirm at the declarations.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index of the first value read from {@code in}
+   * @param out    array receiving the packed words
+   * @param outpos index of the first word written to {@code out}
+   */
+  private static void fastpack10(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // Output vector 0.
+    var src = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_10);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+
+    // Output vector 1: carry the bits of the last load above its low 2.
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos + VLEN_512);
+
+    // Output vector 2: carry the bits of the last load above its low 4.
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos + 2 * VLEN_512);
+
+    // Output vector 3: carry the bits of the last load above its low 6.
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos + 3 * VLEN_512);
+
+    // Output vector 4: carry the bits of the last load above its low 8.
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_10);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    acc.intoArray(out, outpos + 4 * VLEN_512);
+  }
+
+  private static void fastpack11(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_11);
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 16 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 16, ..., inpos + 240} into 6 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_512}.
+   *
+   * <p>Every input is masked with {@code MASK_12} (presumably the low 12 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 12 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack12(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_12);
+    IntVector cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 3: the previous word ended exactly on bit 32, so there is no carry
+    // and the next input becomes the new word as-is.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 4
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 5 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_12);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_12);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 32 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 8, ..., inpos + 248} into 13 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_256}.
+   *
+   * <p>Every input is masked with {@code MASK_13} (presumably the low 13 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 13 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack13(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_13);
+    IntVector cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 13));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 7));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 1));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 3
+    acc = cur.lanewise(VectorOperators.LSHR, 5);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 4
+    acc = cur.lanewise(VectorOperators.LSHR, 11);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 15));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 5
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 9));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 6
+    acc = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 3));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 7
+    acc = cur.lanewise(VectorOperators.LSHR, 3);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 8
+    acc = cur.lanewise(VectorOperators.LSHR, 9);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 17));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 9
+    acc = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 11));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 10
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 5));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 18));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 11
+    acc = cur.lanewise(VectorOperators.LSHR, 1);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 12 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 7);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_13);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_13);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 16 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 16, ..., inpos + 240} into 7 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_512}.
+   *
+   * <p>Every input is masked with {@code MASK_14} (presumably the low 14 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 14 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack14(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_14);
+    IntVector cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 3
+    acc = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 4
+    acc = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 5
+    acc = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 6 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_14);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_14);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 32 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 8, ..., inpos + 248} into 15 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_256}.
+   *
+   * <p>Every input is masked with {@code MASK_15} (presumably the low 15 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 15 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack15(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_15);
+    IntVector cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 15));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 13));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 11));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 3
+    acc = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 9));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 4
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 7));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 5
+    acc = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 5));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 6
+    acc = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 3));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 7
+    acc = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 1));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 8
+    acc = cur.lanewise(VectorOperators.LSHR, 1);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 9
+    acc = cur.lanewise(VectorOperators.LSHR, 3);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 10
+    acc = cur.lanewise(VectorOperators.LSHR, 5);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 11
+    acc = cur.lanewise(VectorOperators.LSHR, 7);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 12
+    acc = cur.lanewise(VectorOperators.LSHR, 9);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 13
+    acc = cur.lanewise(VectorOperators.LSHR, 11);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 14 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 13);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_15);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_15);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 16 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 16, ..., inpos + 240} into 8 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_512}.
+   *
+   * <p>Inputs are consumed in consecutive pairs. Both are masked with
+   * {@code MASK_16} (presumably the low 16 bits; confirm against its
+   * definition); the first fills the low half of the output word and the second
+   * is shifted left by 16 into the high half. Every word is therefore complete
+   * after two inputs and nothing is carried between words.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack16(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector lo = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_16);
+    IntVector hi = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 1
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 2
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 3
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 4
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 5
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 6
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 7 (last)
+    lo = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_16);
+    hi = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_16);
+    lo.or(hi.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 32 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 8, ..., inpos + 248} into 17 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_256}.
+   *
+   * <p>Every input is masked with {@code MASK_17} (presumably the low 17 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 17 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack17(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_17);
+    IntVector cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 15);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 13);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 3
+    acc = cur.lanewise(VectorOperators.LSHR, 11);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 4
+    acc = cur.lanewise(VectorOperators.LSHR, 9);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 5
+    acc = cur.lanewise(VectorOperators.LSHR, 7);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 6
+    acc = cur.lanewise(VectorOperators.LSHR, 5);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 7
+    acc = cur.lanewise(VectorOperators.LSHR, 3);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 8: the 16-bit carry plus the low 16 bits of a single new input
+    acc = cur.lanewise(VectorOperators.LSHR, 1);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 9
+    acc = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 1));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 10
+    acc = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 3));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 11
+    acc = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 5));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 12
+    acc = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 7));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 13
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 9));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 14
+    acc = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 11));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 15
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_17);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 13));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // word 16 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_17);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs the 16 vectors loaded from {@code in} at offsets
+   * {@code inpos, inpos + 16, ..., inpos + 240} into 9 vectors stored to
+   * {@code out}, starting at {@code outpos} and advancing by {@code VLEN_512}.
+   *
+   * <p>Every input is masked with {@code MASK_18} (presumably the low 18 bits;
+   * confirm against its definition) and OR-ed into the current output word at a
+   * bit offset that grows by 18 per input. An input that crosses bit 32 completes
+   * the word; the bits that did not fit are carried into the next word with an
+   * unsigned right shift.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first vector load
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first vector store
+   */
+  private static void fastpack18(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // word 0
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_18);
+    IntVector cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 1: starts with the bits of the straddling input that did not fit
+    acc = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 2
+    acc = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 3
+    acc = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 4: the 16-bit carry plus the low 16 bits of a single new input
+    acc = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 5
+    acc = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 6
+    acc = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 7
+    acc = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_18);
+    acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // word 8 (last)
+    acc = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_18);
+    acc.or(cur.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 input vectors into 19 output vectors, 19 bits per value.
+   *
+   * <p>Input vectors are loaded with {@code SPECIES_256} from {@code in} at
+   * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}. Each one is
+   * ANDed with {@code MASK_19} (presumably the low 19 bits of every lane;
+   * confirm against its definition) and OR-ed into the current output vector
+   * at the next free bit offset. When a value does not fit in the remaining
+   * bits of a 32-bit lane, the part that was shifted out is recovered with an
+   * unsigned right shift and seeds the next output vector. Output vectors are
+   * stored to {@code out} starting at {@code outpos}, {@code VLEN_256} ints
+   * apart.
+   *
+   * @param in     source array
+   * @param inpos  index of the first input int
+   * @param out    destination array
+   * @param outpos index of the first output int
+   */
+  private static void fastpack19(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0
+    var cur = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_19);
+    var word = cur;
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 19));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 1
+    word = cur.lanewise(VectorOperators.LSHR, 13);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 25));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 2
+    word = cur.lanewise(VectorOperators.LSHR, 7);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 12));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 31));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 3
+    word = cur.lanewise(VectorOperators.LSHR, 1);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 18));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 4
+    word = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 5));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 5
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 11));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 30));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 6
+    word = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 17));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 7
+    word = cur.lanewise(VectorOperators.LSHR, 15);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 23));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 8
+    word = cur.lanewise(VectorOperators.LSHR, 9);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 10));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 29));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 9
+    word = cur.lanewise(VectorOperators.LSHR, 3);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 10
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 3));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 22));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 11
+    word = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 9));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 28));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 12
+    word = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 15));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 13
+    word = cur.lanewise(VectorOperators.LSHR, 17);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 21));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 14
+    word = cur.lanewise(VectorOperators.LSHR, 11);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 27));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 15
+    word = cur.lanewise(VectorOperators.LSHR, 5);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 14));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 16
+    word = cur.lanewise(VectorOperators.LSHR, 18);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 1));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 20));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 17
+    word = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 7));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 26));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 18: the last value ends exactly on the lane boundary
+    word = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_19);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 13));
+    word.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 input vectors into 10 output vectors, 20 bits per value.
+   *
+   * <p>Input vectors are loaded with {@code SPECIES_512} from {@code in} at
+   * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240}. Each one is
+   * ANDed with {@code MASK_20} (presumably the low 20 bits of every lane;
+   * confirm against its definition) and OR-ed into the current output vector
+   * at the next free bit offset. When a value does not fit in the remaining
+   * bits of a 32-bit lane, the part that was shifted out is recovered with an
+   * unsigned right shift and seeds the next output vector. Eight values fill
+   * exactly five lanes, so the shift pattern restarts from offset 0 halfway
+   * through. Output vectors are stored to {@code out} starting at
+   * {@code outpos}, {@code VLEN_512} ints apart.
+   *
+   * @param in     source array
+   * @param inpos  index of the first input int
+   * @param out    destination array
+   * @param outpos index of the first output int
+   */
+  private static void fastpack20(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0
+    var cur = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_20);
+    var word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 20));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 1
+    word = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 28));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 2
+    word = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 3
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 4: ends exactly on the lane boundary, nothing carries over
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 12));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 5: starts fresh from the next masked input
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_20);
+    word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 20));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 6
+    word = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 28));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 7
+    word = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 8
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 9
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_20);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 12));
+    word.intoArray(out, outpos);
+  }
+
+  private static void fastpack21(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_21);
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 input vectors into 11 output vectors, 22 bits per value.
+   *
+   * <p>Input vectors are loaded with {@code SPECIES_512} from {@code in} at
+   * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240}. Each one is
+   * ANDed with {@code MASK_22} (presumably the low 22 bits of every lane;
+   * confirm against its definition) and OR-ed into the current output vector
+   * at the next free bit offset. When a value does not fit in the remaining
+   * bits of a 32-bit lane, the part that was shifted out is recovered with an
+   * unsigned right shift and seeds the next output vector. Output vectors are
+   * stored to {@code out} starting at {@code outpos}, {@code VLEN_512} ints
+   * apart.
+   *
+   * @param in     source array
+   * @param inpos  index of the first input int
+   * @param out    destination array
+   * @param outpos index of the first output int
+   */
+  private static void fastpack22(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0
+    var cur = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_22);
+    var word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 22));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 1
+    word = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 12));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 2
+    word = cur.lanewise(VectorOperators.LSHR, 20);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 3
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 14));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 4
+    word = cur.lanewise(VectorOperators.LSHR, 18);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 26));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 5
+    word = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 6
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 28));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 7
+    word = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 18));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 8
+    word = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 30));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 9
+    word = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 20));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 10: the last value ends exactly on the lane boundary
+    word = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_22);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 10));
+    word.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 input vectors into 23 output vectors, 23 bits per value.
+   *
+   * <p>Input vectors are loaded with {@code SPECIES_256} from {@code in} at
+   * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}. Each one is
+   * ANDed with {@code MASK_23} (presumably the low 23 bits of every lane;
+   * confirm against its definition) and OR-ed into the current output vector
+   * at the next free bit offset. When a value does not fit in the remaining
+   * bits of a 32-bit lane, the part that was shifted out is recovered with an
+   * unsigned right shift and seeds the next output vector. Output vectors are
+   * stored to {@code out} starting at {@code outpos}, {@code VLEN_256} ints
+   * apart.
+   *
+   * @param in     source array
+   * @param inpos  index of the first input int
+   * @param out    destination array
+   * @param outpos index of the first output int
+   */
+  private static void fastpack23(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0
+    var cur = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_23);
+    var word = cur;
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 23));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 1
+    word = cur.lanewise(VectorOperators.LSHR, 9);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 14));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 2
+    word = cur.lanewise(VectorOperators.LSHR, 18);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 5));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 28));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 3
+    word = cur.lanewise(VectorOperators.LSHR, 4);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 19));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 4
+    word = cur.lanewise(VectorOperators.LSHR, 13);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 10));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 5
+    word = cur.lanewise(VectorOperators.LSHR, 22);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 1));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 6
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 15));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 7
+    word = cur.lanewise(VectorOperators.LSHR, 17);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 6));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 29));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 8
+    word = cur.lanewise(VectorOperators.LSHR, 3);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 20));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 9
+    word = cur.lanewise(VectorOperators.LSHR, 12);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 11));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 10
+    word = cur.lanewise(VectorOperators.LSHR, 21);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 2));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 25));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 11
+    word = cur.lanewise(VectorOperators.LSHR, 7);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 12
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 7));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 30));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 13
+    word = cur.lanewise(VectorOperators.LSHR, 2);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 21));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 14
+    word = cur.lanewise(VectorOperators.LSHR, 11);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 12));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 15
+    word = cur.lanewise(VectorOperators.LSHR, 20);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 3));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 26));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 16
+    word = cur.lanewise(VectorOperators.LSHR, 6);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 17));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 17
+    word = cur.lanewise(VectorOperators.LSHR, 15);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 31));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 18
+    word = cur.lanewise(VectorOperators.LSHR, 1);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 22));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 19
+    word = cur.lanewise(VectorOperators.LSHR, 10);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 13));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 20
+    word = cur.lanewise(VectorOperators.LSHR, 19);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 4));
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 27));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 21
+    word = cur.lanewise(VectorOperators.LSHR, 5);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 18));
+    word.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 22: the last value ends exactly on the lane boundary
+    word = cur.lanewise(VectorOperators.LSHR, 14);
+    cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_23);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 9));
+    word.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 input vectors into 12 output vectors, 24 bits per value.
+   *
+   * <p>Input vectors are loaded with {@code SPECIES_512} from {@code in} at
+   * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240}. Each one is
+   * ANDed with {@code MASK_24} (presumably the low 24 bits of every lane;
+   * confirm against its definition) and OR-ed into the current output vector
+   * at the next free bit offset. When a value does not fit in the remaining
+   * bits of a 32-bit lane, the part that was shifted out is recovered with an
+   * unsigned right shift and seeds the next output vector. Four values fill
+   * exactly three lanes, so the same three-store pattern repeats four times.
+   * Output vectors are stored to {@code out} starting at {@code outpos},
+   * {@code VLEN_512} ints apart.
+   *
+   * @param in     source array
+   * @param inpos  index of the first input int
+   * @param out    destination array
+   * @param outpos index of the first output int
+   */
+  private static void fastpack24(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0
+    var cur = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_24);
+    var word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 1
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 2: ends exactly on the lane boundary, nothing carries over
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 3: starts fresh from the next masked input
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_24);
+    word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 4
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 5: ends exactly on the lane boundary
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 6: starts fresh from the next masked input
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_24);
+    word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 7
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 8: ends exactly on the lane boundary
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 9: starts fresh from the next masked input
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_24);
+    word = cur;
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 24));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 10
+    word = cur.lanewise(VectorOperators.LSHR, 8);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 16));
+    word.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 11
+    word = cur.lanewise(VectorOperators.LSHR, 16);
+    cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_24);
+    word = word.or(cur.lanewise(VectorOperators.LSHL, 8));
+    word.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 input vectors of 25-bit values into 25 output vectors.
+   *
+   * <p>Input k (k = 0..31) is the vector loaded with {@code SPECIES_256} from
+   * {@code in} at {@code inpos + 8 * k}. Output vectors are stored to
+   * {@code out} starting at {@code outpos}, and {@code outpos} is advanced by
+   * {@code VLEN_256} between consecutive stores.
+   *
+   * <p>Every operation is lane-wise, so lane j of an output vector is built
+   * only from lane j of the inputs. Within a lane the inputs, each ANDed with
+   * {@code MASK_25}, are laid end to end starting at bit 0 of the first output
+   * word. When an input does not fit in the bits left in the current word, an
+   * {@code LSHL} places its low part at the top of that word and, after the
+   * word has been stored, an {@code LSHR} moves the remaining high part to
+   * the bottom of the next word. Input 31 ends exactly at the top of the last
+   * word, so nothing is left over.
+   *
+   * <p>Loads and stores are interleaved rather than batched, so their order
+   * is observable when {@code in} and {@code out} overlap.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code VLEN_256} and {@code MASK_25}
+   * are declared outside this method. The load stride suggests 8 int lanes
+   * per vector and the shift amounts suggest a mask of the low 25 bits;
+   * confirm both against the declarations.
+   *
+   * @param in     array the values are read from
+   * @param inpos  index in {@code in} of the first value to read
+   * @param out    array the packed words are written to
+   * @param outpos index in {@code out} of the first word to write
+   */
+  private static void fastpack25(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0: all of input 0, start of input 1
+    var src = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_25);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 1: rest of input 1, start of input 2
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 2: rest of input 2, start of input 3
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 3: rest of input 3, all of input 4, start of input 5
+    acc = src.lanewise(VectorOperators.LSHR, 21);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 4: rest of input 5, start of input 6
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 5: rest of input 6, start of input 7
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 6: rest of input 7, start of input 8
+    acc = src.lanewise(VectorOperators.LSHR, 17);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 7: rest of input 8, all of input 9, start of input 10
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 8: rest of input 10, start of input 11
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 9: rest of input 11, start of input 12
+    acc = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 10: rest of input 12, all of input 13, start of input 14
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 11: rest of input 14, start of input 15
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 12: rest of input 15, start of input 16
+    acc = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 13: rest of input 16, start of input 17
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 14: rest of input 17, all of input 18, start of input 19
+    acc = src.lanewise(VectorOperators.LSHR, 23);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 15: rest of input 19, start of input 20
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 16: rest of input 20, start of input 21
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 17: rest of input 21, all of input 22, start of input 23
+    acc = src.lanewise(VectorOperators.LSHR, 19);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 18: rest of input 23, start of input 24
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 19: rest of input 24, start of input 25
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 20: rest of input 25, start of input 26
+    acc = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 21: rest of input 26, all of input 27, start of input 28
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 22: rest of input 28, start of input 29
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 23: rest of input 29, start of input 30
+    acc = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 24: rest of input 30, all of input 31
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_25);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    acc.intoArray(out, outpos);
+  }
+
+  private static void fastpack26(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(MASK_26);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  private static void fastpack27(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_27);
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 input vectors of 28-bit values into 14 output vectors.
+   *
+   * <p>Input k (k = 0..15) is loaded with {@code SPECIES_512} from {@code in}
+   * at {@code inpos + 16 * k}. Within each lane the inputs, ANDed with
+   * {@code MASK_28}, are laid end to end from bit 0; an input crossing a word
+   * boundary is split: {@code LSHL} before the store, {@code LSHR} after it.
+   * Inputs 0..7 end on a word boundary, so inputs 8..15 repeat the pattern.
+   *
+   * @param in     array the values are read from
+   * @param inpos  index in {@code in} of the first value to read
+   * @param out    array the packed words are written to
+   * @param outpos index in {@code out} of the first word to write
+   */
+  private static void fastpack28(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0: all of input 0, start of input 1
+    var src = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_28);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 1: rest of input 1, start of input 2
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 2: rest of input 2, start of input 3
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 3: rest of input 3, start of input 4
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 4: rest of input 4, start of input 5
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 5: rest of input 5, start of input 6
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 6: rest of input 6, all of input 7
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 7: nothing carried over; all of input 8, start of input 9
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_28);
+    acc = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 8: rest of input 9, start of input 10
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 9: rest of input 10, start of input 11
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 10: rest of input 11, start of input 12
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 11: rest of input 12, start of input 13
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 12: rest of input 13, start of input 14
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // output vector 13: rest of input 14, all of input 15
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_28);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 input vectors of 29-bit values into 29 output vectors.
+   *
+   * <p>Input k (k = 0..31) is the vector loaded with {@code SPECIES_256} from
+   * {@code in} at {@code inpos + 8 * k}. Output vectors are stored to
+   * {@code out} starting at {@code outpos}, and {@code outpos} is advanced by
+   * {@code VLEN_256} between consecutive stores.
+   *
+   * <p>Every operation is lane-wise, so lane j of an output vector is built
+   * only from lane j of the inputs. Within a lane the inputs, each ANDed with
+   * {@code MASK_29}, are laid end to end starting at bit 0 of the first output
+   * word. When an input does not fit in the bits left in the current word, an
+   * {@code LSHL} places its low part at the top of that word and, after the
+   * word has been stored, an {@code LSHR} moves the remaining high part to
+   * the bottom of the next word. Input 31 ends exactly at the top of the last
+   * word, so nothing is left over.
+   *
+   * <p>Loads and stores are interleaved rather than batched, so their order
+   * is observable when {@code in} and {@code out} overlap.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code VLEN_256} and {@code MASK_29}
+   * are declared outside this method. The load stride suggests 8 int lanes
+   * per vector and the shift amounts suggest a mask of the low 29 bits;
+   * confirm both against the declarations.
+   *
+   * @param in     array the values are read from
+   * @param inpos  index in {@code in} of the first value to read
+   * @param out    array the packed words are written to
+   * @param outpos index in {@code out} of the first word to write
+   */
+  private static void fastpack29(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    // output vector 0: all of input 0, start of input 1
+    var src = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_29);
+    var acc = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 1: rest of input 1, start of input 2
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 2: rest of input 2, start of input 3
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 3: rest of input 3, start of input 4
+    acc = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 4: rest of input 4, start of input 5
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 5: rest of input 5, start of input 6
+    acc = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 6: rest of input 6, start of input 7
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 7: rest of input 7, start of input 8
+    acc = src.lanewise(VectorOperators.LSHR, 21);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 8: rest of input 8, start of input 9
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 9: rest of input 9, all of input 10, start of input 11
+    acc = src.lanewise(VectorOperators.LSHR, 27);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 10: rest of input 11, start of input 12
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 11: rest of input 12, start of input 13
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 12: rest of input 13, start of input 14
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 13: rest of input 14, start of input 15
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 14: rest of input 15, start of input 16
+    acc = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 15: rest of input 16, start of input 17
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 16: rest of input 17, start of input 18
+    acc = src.lanewise(VectorOperators.LSHR, 19);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 17: rest of input 18, start of input 19
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 18: rest of input 19, start of input 20
+    acc = src.lanewise(VectorOperators.LSHR, 25);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 19: rest of input 20, all of input 21, start of input 22
+    acc = src.lanewise(VectorOperators.LSHR, 28);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 20: rest of input 22, start of input 23
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 21: rest of input 23, start of input 24
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 22: rest of input 24, start of input 25
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 23: rest of input 25, start of input 26
+    acc = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 24: rest of input 26, start of input 27
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 25: rest of input 27, start of input 28
+    acc = src.lanewise(VectorOperators.LSHR, 17);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 26: rest of input 28, start of input 29
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 27: rest of input 29, start of input 30
+    acc = src.lanewise(VectorOperators.LSHR, 23);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // output vector 28: rest of input 30, all of input 31
+    acc = src.lanewise(VectorOperators.LSHR, 26);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_29);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    acc.intoArray(out, outpos);
+  }
+
+  private static void fastpack30(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(MASK_30);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  private static void fastpack31(final int[] in, int inpos, final int[] out,
+                                 int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV.and(MASK_31);
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 13);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 15);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 17);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 21);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 25);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 26);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 28);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 30);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 32 input vectors into a single output vector, 1 bit per value.
+   *
+   * <p>Input vector k (loaded at {@code inpos + 8 * k}, k = 0..31) is shifted
+   * left by k and ORed into the accumulator, which is then stored at
+   * {@code outpos}. Inputs are not masked, so any bit above bit 0 of a value
+   * is ORed into the fields of later inputs.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} where the packed vector is written
+   */
+  private static void fastpackNoMask1(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+        .lanewise(VectorOperators.LSHL, 1));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 2));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+        .lanewise(VectorOperators.LSHL, 3));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 4));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+        .lanewise(VectorOperators.LSHL, 5));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 6));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+        .lanewise(VectorOperators.LSHL, 7));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 8));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+        .lanewise(VectorOperators.LSHL, 9));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 10));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+        .lanewise(VectorOperators.LSHL, 11));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 12));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+        .lanewise(VectorOperators.LSHL, 13));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 14));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+        .lanewise(VectorOperators.LSHL, 15));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+        .lanewise(VectorOperators.LSHL, 17));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 18));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+        .lanewise(VectorOperators.LSHL, 19));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 20));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+        .lanewise(VectorOperators.LSHL, 21));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 22));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+        .lanewise(VectorOperators.LSHL, 23));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+        .lanewise(VectorOperators.LSHL, 24));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+        .lanewise(VectorOperators.LSHL, 25));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 26));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+        .lanewise(VectorOperators.LSHL, 27));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 28));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+        .lanewise(VectorOperators.LSHL, 29));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 30));
+    acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+        .lanewise(VectorOperators.LSHL, 31));
+
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 16 input vectors into a single output vector, 2 bits per value.
+   *
+   * <p>Input vector k (loaded at {@code inpos + 16 * k}, k = 0..15) is
+   * shifted left by {@code 2 * k} and ORed into the accumulator, which is
+   * then stored at {@code outpos}. Inputs are not masked, so bits above the
+   * low 2 of a value are ORed into the fields of later inputs.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} where the packed vector is written
+   */
+  private static void fastpackNoMask2(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 2));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 4));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 6));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 8));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 10));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 12));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 14));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 18));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 20));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 22));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+        .lanewise(VectorOperators.LSHL, 24));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 26));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 28));
+    acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 30));
+
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 32 input vectors into 3 output vectors, 3 bits per value.
+   *
+   * <p>Input vectors are loaded at {@code inpos}, {@code inpos + 8}, ...,
+   * {@code inpos + 248} and ORed lane-wise into an accumulator at increasing
+   * bit offsets. When a value straddles a 32-bit boundary, the accumulator is
+   * stored and restarted from that value's logical right shift. Inputs are
+   * not masked, so bits above the low 3 of a value spill into neighbouring
+   * fields.
+   *
+   * <p>NOTE(review): output vectors are written VLEN_256 ints apart;
+   * presumably VLEN_256 is the lane count of SPECIES_256 - confirm.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first word written
+   */
+  private static void fastpackNoMask3(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector src;
+
+    // ---- output vector 0 ----
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 1 (starts with the bits of src that overflowed) ----
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 2 ----
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 16 input vectors into 2 output vectors, 4 bits per value.
+   *
+   * <p>Input vectors are loaded at {@code inpos}, {@code inpos + 16}, ...,
+   * {@code inpos + 240}. Each group of eight consecutive input vectors is
+   * ORed lane-wise at bit offsets 0, 4, ..., 28 into one output vector.
+   * Inputs are not masked, so bits above the low 4 of a value spill into
+   * neighbouring fields.
+   *
+   * <p>NOTE(review): output vectors are written VLEN_512 ints apart;
+   * presumably VLEN_512 is the lane count of SPECIES_512 - confirm.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first word written
+   */
+  private static void fastpackNoMask4(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector src;
+
+    // ---- output vector 0 ----
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // ---- output vector 1 ----
+    // 8 * 4 bits fill a word exactly, so nothing carries over: start the
+    // accumulator directly from the next load instead of OR-ing into zero.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 32 input vectors into 5 output vectors, 5 bits per value.
+   *
+   * <p>Input vectors are loaded at {@code inpos}, {@code inpos + 8}, ...,
+   * {@code inpos + 248} and ORed lane-wise into an accumulator at increasing
+   * bit offsets. When a value straddles a 32-bit boundary, the accumulator is
+   * stored and restarted from that value's logical right shift. Inputs are
+   * not masked, so bits above the low 5 of a value spill into neighbouring
+   * fields.
+   *
+   * <p>NOTE(review): output vectors are written VLEN_256 ints apart;
+   * presumably VLEN_256 is the lane count of SPECIES_256 - confirm.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first word written
+   */
+  private static void fastpackNoMask5(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector src;
+
+    // ---- output vector 0 ----
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 1 ----
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 2 ----
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 3 ----
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 4 ----
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 16 input vectors into 3 output vectors, 6 bits per value.
+   *
+   * <p>Input vectors are loaded at {@code inpos}, {@code inpos + 16}, ...,
+   * {@code inpos + 240} and ORed lane-wise into an accumulator at increasing
+   * bit offsets. When a value straddles a 32-bit boundary, the accumulator is
+   * stored and restarted from that value's logical right shift. Inputs are
+   * not masked, so bits above the low 6 of a value spill into neighbouring
+   * fields.
+   *
+   * <p>NOTE(review): output vectors are written VLEN_512 ints apart;
+   * presumably VLEN_512 is the lane count of SPECIES_512 - confirm.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first word written
+   */
+  private static void fastpackNoMask6(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector src;
+
+    // ---- output vector 0 ----
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // ---- output vector 1 ----
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // ---- output vector 2 ----
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 32 input vectors into 7 output vectors, 7 bits per value.
+   *
+   * <p>Input vectors are loaded at {@code inpos}, {@code inpos + 8}, ...,
+   * {@code inpos + 248} and ORed lane-wise into an accumulator at increasing
+   * bit offsets. When a value straddles a 32-bit boundary, the accumulator is
+   * stored and restarted from that value's logical right shift. Inputs are
+   * not masked, so bits above the low 7 of a value spill into neighbouring
+   * fields.
+   *
+   * <p>NOTE(review): output vectors are written VLEN_256 ints apart;
+   * presumably VLEN_256 is the lane count of SPECIES_256 - confirm.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value read
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first word written
+   */
+  private static void fastpackNoMask7(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    IntVector src;
+
+    // ---- output vector 0 ----
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 1 ----
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 2 ----
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 3 ----
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 4 ----
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 5 ----
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // ---- output vector 6 ----
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 consecutive input vectors at 8 bits per value into 4 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 240}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_512}
+   * between stores. Every lane is packed independently: four values share one
+   * 32-bit word at bit offsets 0, 8, 16 and 24.
+   *
+   * <p>No mask is applied, so any input bits above the low 8 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask8(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    // Each output vector starts directly from a fresh load: 8 divides 32, so
+    // no bits carry over and no zero vector is needed as a seed.
+
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 1.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 2.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 3.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 consecutive input vectors at 9 bits per value into 9 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 8}, ..., {@code inpos + 248}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_256}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word.
+   *
+   * <p>No mask is applied, so any input bits above the low 9 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask9(final int[] in, int inpos,
+                                      final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 8)
+        .lanewise(VectorOperators.LSHL, 9).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 18).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc = carry.lanewise(VectorOperators.LSHL, 27).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 5);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 40)
+        .lanewise(VectorOperators.LSHL, 13).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 22).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc = carry.lanewise(VectorOperators.LSHL, 31).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 2.
+    acc = carry.lanewise(VectorOperators.LSHR, 1);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 72)
+        .lanewise(VectorOperators.LSHL, 17).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc = carry.lanewise(VectorOperators.LSHL, 26).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 3.
+    acc = carry.lanewise(VectorOperators.LSHR, 6);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 88)
+        .lanewise(VectorOperators.LSHL, 3).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 104)
+        .lanewise(VectorOperators.LSHL, 21).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = carry.lanewise(VectorOperators.LSHL, 30).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 4.
+    acc = carry.lanewise(VectorOperators.LSHR, 2);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 120)
+        .lanewise(VectorOperators.LSHL, 7).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = carry.lanewise(VectorOperators.LSHL, 25).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 5.
+    acc = carry.lanewise(VectorOperators.LSHR, 7);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 2).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 152)
+        .lanewise(VectorOperators.LSHL, 11).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 20).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc = carry.lanewise(VectorOperators.LSHL, 29).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 6.
+    acc = carry.lanewise(VectorOperators.LSHR, 3);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 6).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 184)
+        .lanewise(VectorOperators.LSHL, 15).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 7.
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 200)
+        .lanewise(VectorOperators.LSHL, 1).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 10).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 216)
+        .lanewise(VectorOperators.LSHL, 19).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 8 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 232)
+        .lanewise(VectorOperators.LSHL, 5).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 14).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 248)
+        .lanewise(VectorOperators.LSHL, 23).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 consecutive input vectors at 10 bits per value into 5 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 240}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_512}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word.
+   *
+   * <p>No mask is applied, so any input bits above the low 10 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask10(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 10).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 20).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    acc = carry.lanewise(VectorOperators.LSHL, 30).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 2);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 18).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 2.
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 6).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    acc = carry.lanewise(VectorOperators.LSHL, 26).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 3.
+    acc = carry.lanewise(VectorOperators.LSHR, 6);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 14).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 4 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 2).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 22).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 consecutive input vectors at 11 bits per value into 11 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 8}, ..., {@code inpos + 248}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_256}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word.
+   *
+   * <p>No mask is applied, so any input bits above the low 11 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask11(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 8)
+        .lanewise(VectorOperators.LSHL, 11).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc = carry.lanewise(VectorOperators.LSHL, 22).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 10);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 24)
+        .lanewise(VectorOperators.LSHL, 1).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 32)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc = carry.lanewise(VectorOperators.LSHL, 23).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 2.
+    acc = carry.lanewise(VectorOperators.LSHR, 9);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 2).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 56)
+        .lanewise(VectorOperators.LSHL, 13).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 3.
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 72)
+        .lanewise(VectorOperators.LSHL, 3).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 14).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc = carry.lanewise(VectorOperators.LSHL, 25).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 4.
+    acc = carry.lanewise(VectorOperators.LSHR, 7);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 104)
+        .lanewise(VectorOperators.LSHL, 15).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = carry.lanewise(VectorOperators.LSHL, 26).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 5.
+    acc = carry.lanewise(VectorOperators.LSHR, 6);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 120)
+        .lanewise(VectorOperators.LSHL, 5).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = carry.lanewise(VectorOperators.LSHL, 27).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 6.
+    acc = carry.lanewise(VectorOperators.LSHR, 5);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 6).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 152)
+        .lanewise(VectorOperators.LSHL, 17).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 7.
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 168)
+        .lanewise(VectorOperators.LSHL, 7).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 18).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc = carry.lanewise(VectorOperators.LSHL, 29).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 8.
+    acc = carry.lanewise(VectorOperators.LSHR, 3);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 192)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 200)
+        .lanewise(VectorOperators.LSHL, 19).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc = carry.lanewise(VectorOperators.LSHL, 30).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 9.
+    acc = carry.lanewise(VectorOperators.LSHR, 2);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 216)
+        .lanewise(VectorOperators.LSHL, 9).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 20).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc = carry.lanewise(VectorOperators.LSHL, 31).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 10 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 1);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 10).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 248)
+        .lanewise(VectorOperators.LSHL, 21).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 consecutive input vectors at 12 bits per value into 6 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 240}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_512}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word. The shift pattern repeats after three output vectors.
+   *
+   * <p>No mask is applied, so any input bits above the low 12 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask12(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 2 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 96)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 20).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 3. Nothing carries over, so start directly from the next
+    // load instead of OR-ing it into a zero vector.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 4.
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 192)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 5 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 20).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 consecutive input vectors at 13 bits per value into 13 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 8}, ..., {@code inpos + 248}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_256}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word.
+   *
+   * <p>No mask is applied, so any input bits above the low 13 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask13(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 8)
+        .lanewise(VectorOperators.LSHL, 13).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc = carry.lanewise(VectorOperators.LSHL, 26).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 6);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 24)
+        .lanewise(VectorOperators.LSHL, 7).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc = carry.lanewise(VectorOperators.LSHL, 20).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 2.
+    acc = carry.lanewise(VectorOperators.LSHR, 12);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 40)
+        .lanewise(VectorOperators.LSHL, 1).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 14).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc = carry.lanewise(VectorOperators.LSHL, 27).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 3.
+    acc = carry.lanewise(VectorOperators.LSHR, 5);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 64)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc = carry.lanewise(VectorOperators.LSHL, 21).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 4.
+    acc = carry.lanewise(VectorOperators.LSHR, 11);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 2).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 88)
+        .lanewise(VectorOperators.LSHL, 15).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 5.
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 104)
+        .lanewise(VectorOperators.LSHL, 9).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc = carry.lanewise(VectorOperators.LSHL, 22).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 6.
+    acc = carry.lanewise(VectorOperators.LSHR, 10);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 120)
+        .lanewise(VectorOperators.LSHL, 3).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc = carry.lanewise(VectorOperators.LSHL, 29).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 7.
+    acc = carry.lanewise(VectorOperators.LSHR, 3);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 10).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc = carry.lanewise(VectorOperators.LSHL, 23).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 8.
+    acc = carry.lanewise(VectorOperators.LSHR, 9);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 168)
+        .lanewise(VectorOperators.LSHL, 17).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc = carry.lanewise(VectorOperators.LSHL, 30).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 9.
+    acc = carry.lanewise(VectorOperators.LSHR, 2);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 184)
+        .lanewise(VectorOperators.LSHL, 11).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 10.
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 200)
+        .lanewise(VectorOperators.LSHL, 5).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 18).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc = carry.lanewise(VectorOperators.LSHL, 31).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 11.
+    acc = carry.lanewise(VectorOperators.LSHR, 1);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    carry = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc = carry.lanewise(VectorOperators.LSHL, 25).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Output vector 12 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 7);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 6).or(acc);
+    acc = IntVector.fromArray(SPECIES_256, in, inpos + 248)
+        .lanewise(VectorOperators.LSHL, 19).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 consecutive input vectors at 14 bits per value into 7 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 240}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_512}
+   * between stores. Every lane is packed independently: each loaded value is
+   * shifted to the next free bit position of the lane's current output word and
+   * OR-ed in, and the bits of a value that overflow the 32-bit word open the
+   * next output word.
+   *
+   * <p>No mask is applied, so any input bits above the low 14 would be OR-ed
+   * into neighbouring fields; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask14(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 14).or(acc);
+    IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc = carry.lanewise(VectorOperators.LSHL, 28).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 1. Drop the bits of the last value that were already
+    // stored; what remains opens the next word (same pattern below).
+    acc = carry.lanewise(VectorOperators.LSHR, 4);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 10).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc = carry.lanewise(VectorOperators.LSHL, 24).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 2.
+    acc = carry.lanewise(VectorOperators.LSHR, 8);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 6).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc = carry.lanewise(VectorOperators.LSHL, 20).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 3.
+    acc = carry.lanewise(VectorOperators.LSHR, 12);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 2).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    acc = carry.lanewise(VectorOperators.LSHL, 30).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 4.
+    acc = carry.lanewise(VectorOperators.LSHR, 2);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 160)
+        .lanewise(VectorOperators.LSHL, 12).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    acc = carry.lanewise(VectorOperators.LSHL, 26).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 5.
+    acc = carry.lanewise(VectorOperators.LSHR, 6);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 192)
+        .lanewise(VectorOperators.LSHL, 8).or(acc);
+    carry = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    acc = carry.lanewise(VectorOperators.LSHL, 22).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 6 (ends exactly on a word boundary).
+    acc = carry.lanewise(VectorOperators.LSHR, 10);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 224)
+        .lanewise(VectorOperators.LSHL, 4).or(acc);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 18).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  private static void fastpackNoMask15(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV;
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 16 consecutive input vectors at 16 bits per value into 8 output
+   * vectors, without masking the inputs first.
+   *
+   * <p>Input vectors are loaded from {@code in} at {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 240}; output vectors are stored to
+   * {@code out} starting at {@code outpos}, advancing by {@code VLEN_512}
+   * between stores. Every lane is packed independently: two values share one
+   * 32-bit word, the first in the low half and the second shifted left by 16.
+   *
+   * <p>No mask is applied, so any input bits above the low 16 would be OR-ed
+   * into the neighbouring field; callers presumably guarantee the values fit.
+   *
+   * @param in     source array
+   * @param inpos  index in {@code in} of the first value to pack
+   * @param out    destination array
+   * @param outpos index in {@code out} of the first packed word
+   */
+  private static void fastpackNoMask16(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Each output vector starts directly from a fresh load: 16 divides 32, so
+    // no bits carry over and no zero vector is needed as a seed.
+
+    // Output vector 0.
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 16)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 1.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 48)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 2.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 80)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 3.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 112)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 4.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 144)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 5.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 176)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 6.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 208)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Output vector 7.
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    acc = IntVector.fromArray(SPECIES_512, in, inpos + 240)
+        .lanewise(VectorOperators.LSHL, 16).or(acc);
+    acc.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 32 SPECIES_256 vectors of 17-bit values into 17 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 8 * i} for
+   * {@code i = 0..31}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_256}. Within each lane the values are laid end
+   * to end, 17 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 17 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask17(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 17
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 17));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 15 | v2 << 2 | v3 << 19
+    packed = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 19));
+    packed.intoArray(out, outpos + VLEN_256);
+
+    // word 2 = v3 >>> 13 | v4 << 4 | v5 << 21
+    packed = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 21));
+    packed.intoArray(out, outpos + 2 * VLEN_256);
+
+    // word 3 = v5 >>> 11 | v6 << 6 | v7 << 23
+    packed = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 23));
+    packed.intoArray(out, outpos + 3 * VLEN_256);
+
+    // word 4 = v7 >>> 9 | v8 << 8 | v9 << 25
+    packed = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 25));
+    packed.intoArray(out, outpos + 4 * VLEN_256);
+
+    // word 5 = v9 >>> 7 | v10 << 10 | v11 << 27
+    packed = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 27));
+    packed.intoArray(out, outpos + 5 * VLEN_256);
+
+    // word 6 = v11 >>> 5 | v12 << 12 | v13 << 29
+    packed = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 29));
+    packed.intoArray(out, outpos + 6 * VLEN_256);
+
+    // word 7 = v13 >>> 3 | v14 << 14 | v15 << 31
+    packed = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 31));
+    packed.intoArray(out, outpos + 7 * VLEN_256);
+
+    // word 8 = v15 >>> 1 | v16 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 8 * VLEN_256);
+
+    // word 9 = v16 >>> 16 | v17 << 1 | v18 << 18
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos + 9 * VLEN_256);
+
+    // word 10 = v18 >>> 14 | v19 << 3 | v20 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 10 * VLEN_256);
+
+    // word 11 = v20 >>> 12 | v21 << 5 | v22 << 22
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos + 11 * VLEN_256);
+
+    // word 12 = v22 >>> 10 | v23 << 7 | v24 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 12 * VLEN_256);
+
+    // word 13 = v24 >>> 8 | v25 << 9 | v26 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 13 * VLEN_256);
+
+    // word 14 = v26 >>> 6 | v27 << 11 | v28 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 14 * VLEN_256);
+
+    // word 15 = v28 >>> 4 | v29 << 13 | v30 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 13));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 15 * VLEN_256);
+
+    // word 16 = v30 >>> 2 | v31 << 15
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 15));
+    packed.intoArray(out, outpos + 16 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 16 SPECIES_512 vectors of 18-bit values into 9 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 16 * i} for
+   * {@code i = 0..15}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_512}. Within each lane the values are laid end
+   * to end, 18 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 18 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask18(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 18
+    IntVector src = IntVector.fromArray(SPECIES_512, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 14 | v2 << 4 | v3 << 22
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos + VLEN_512);
+
+    // word 2 = v3 >>> 10 | v4 << 8 | v5 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 2 * VLEN_512);
+
+    // word 3 = v5 >>> 6 | v6 << 12 | v7 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 3 * VLEN_512);
+
+    // word 4 = v7 >>> 2 | v8 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 4 * VLEN_512);
+
+    // word 5 = v8 >>> 16 | v9 << 2 | v10 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 5 * VLEN_512);
+
+    // word 6 = v10 >>> 12 | v11 << 6 | v12 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 6 * VLEN_512);
+
+    // word 7 = v12 >>> 8 | v13 << 10 | v14 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 7 * VLEN_512);
+
+    // word 8 = v14 >>> 4 | v15 << 14
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    packed.intoArray(out, outpos + 8 * VLEN_512);
+  }
+
+  /**
+   * Bit-packs 32 SPECIES_256 vectors of 19-bit values into 19 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 8 * i} for
+   * {@code i = 0..31}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_256}. Within each lane the values are laid end
+   * to end, 19 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 19 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask19(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 19
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 19));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 13 | v2 << 6 | v3 << 25
+    packed = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 25));
+    packed.intoArray(out, outpos + VLEN_256);
+
+    // word 2 = v3 >>> 7 | v4 << 12 | v5 << 31
+    packed = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 31));
+    packed.intoArray(out, outpos + 2 * VLEN_256);
+
+    // word 3 = v5 >>> 1 | v6 << 18
+    packed = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos + 3 * VLEN_256);
+
+    // word 4 = v6 >>> 14 | v7 << 5 | v8 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 4 * VLEN_256);
+
+    // word 5 = v8 >>> 8 | v9 << 11 | v10 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 11));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 5 * VLEN_256);
+
+    // word 6 = v10 >>> 2 | v11 << 17
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 17));
+    packed.intoArray(out, outpos + 6 * VLEN_256);
+
+    // word 7 = v11 >>> 15 | v12 << 4 | v13 << 23
+    packed = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 23));
+    packed.intoArray(out, outpos + 7 * VLEN_256);
+
+    // word 8 = v13 >>> 9 | v14 << 10 | v15 << 29
+    packed = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 29));
+    packed.intoArray(out, outpos + 8 * VLEN_256);
+
+    // word 9 = v15 >>> 3 | v16 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 9 * VLEN_256);
+
+    // word 10 = v16 >>> 16 | v17 << 3 | v18 << 22
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos + 10 * VLEN_256);
+
+    // word 11 = v18 >>> 10 | v19 << 9 | v20 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 11 * VLEN_256);
+
+    // word 12 = v20 >>> 4 | v21 << 15
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 15));
+    packed.intoArray(out, outpos + 12 * VLEN_256);
+
+    // word 13 = v21 >>> 17 | v22 << 2 | v23 << 21
+    packed = src.lanewise(VectorOperators.LSHR, 17);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 21));
+    packed.intoArray(out, outpos + 13 * VLEN_256);
+
+    // word 14 = v23 >>> 11 | v24 << 8 | v25 << 27
+    packed = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 27));
+    packed.intoArray(out, outpos + 14 * VLEN_256);
+
+    // word 15 = v25 >>> 5 | v26 << 14
+    packed = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    packed.intoArray(out, outpos + 15 * VLEN_256);
+
+    // word 16 = v26 >>> 18 | v27 << 1 | v28 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 16 * VLEN_256);
+
+    // word 17 = v28 >>> 12 | v29 << 7 | v30 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 17 * VLEN_256);
+
+    // word 18 = v30 >>> 6 | v31 << 13
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 13));
+    packed.intoArray(out, outpos + 18 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 16 SPECIES_512 vectors of 20-bit values into 10 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 16 * i} for
+   * {@code i = 0..15}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_512}. Within each lane the values are laid end
+   * to end, 20 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector. The layout realigns to
+   * bit 0 after eight inputs, so the second half repeats the first half's
+   * shift pattern.
+   *
+   * <p>Inputs are not masked: bits above the low 20 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask20(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 20
+    IntVector src = IntVector.fromArray(SPECIES_512, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 12 | v2 << 8 | v3 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + VLEN_512);
+
+    // word 2 = v3 >>> 4 | v4 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 2 * VLEN_512);
+
+    // word 3 = v4 >>> 16 | v5 << 4 | v6 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 3 * VLEN_512);
+
+    // word 4 = v6 >>> 8 | v7 << 12
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    packed.intoArray(out, outpos + 4 * VLEN_512);
+
+    // word 5 = v8 | v9 << 20 (v7 ended exactly on bit 32, so nothing is
+    // carried over and the accumulator simply restarts from v8)
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    packed = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 5 * VLEN_512);
+
+    // word 6 = v9 >>> 12 | v10 << 8 | v11 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 6 * VLEN_512);
+
+    // word 7 = v11 >>> 4 | v12 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 7 * VLEN_512);
+
+    // word 8 = v12 >>> 16 | v13 << 4 | v14 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 8 * VLEN_512);
+
+    // word 9 = v14 >>> 8 | v15 << 12
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    packed.intoArray(out, outpos + 9 * VLEN_512);
+  }
+
+  /**
+   * Bit-packs 32 SPECIES_256 vectors of 21-bit values into 21 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 8 * i} for
+   * {@code i = 0..31}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_256}. Within each lane the values are laid end
+   * to end, 21 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 21 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask21(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 21
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 21));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 11 | v2 << 10 | v3 << 31
+    packed = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 31));
+    packed.intoArray(out, outpos + VLEN_256);
+
+    // word 2 = v3 >>> 1 | v4 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 2 * VLEN_256);
+
+    // word 3 = v4 >>> 12 | v5 << 9 | v6 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 9));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 3 * VLEN_256);
+
+    // word 4 = v6 >>> 2 | v7 << 19
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 19));
+    packed.intoArray(out, outpos + 4 * VLEN_256);
+
+    // word 5 = v7 >>> 13 | v8 << 8 | v9 << 29
+    packed = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 29));
+    packed.intoArray(out, outpos + 5 * VLEN_256);
+
+    // word 6 = v9 >>> 3 | v10 << 18
+    packed = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos + 6 * VLEN_256);
+
+    // word 7 = v10 >>> 14 | v11 << 7 | v12 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 7 * VLEN_256);
+
+    // word 8 = v12 >>> 4 | v13 << 17
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 17));
+    packed.intoArray(out, outpos + 8 * VLEN_256);
+
+    // word 9 = v13 >>> 15 | v14 << 6 | v15 << 27
+    packed = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 27));
+    packed.intoArray(out, outpos + 9 * VLEN_256);
+
+    // word 10 = v15 >>> 5 | v16 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 10 * VLEN_256);
+
+    // word 11 = v16 >>> 16 | v17 << 5 | v18 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 11 * VLEN_256);
+
+    // word 12 = v18 >>> 6 | v19 << 15
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 15));
+    packed.intoArray(out, outpos + 12 * VLEN_256);
+
+    // word 13 = v19 >>> 17 | v20 << 4 | v21 << 25
+    packed = src.lanewise(VectorOperators.LSHR, 17);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 25));
+    packed.intoArray(out, outpos + 13 * VLEN_256);
+
+    // word 14 = v21 >>> 7 | v22 << 14
+    packed = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    packed.intoArray(out, outpos + 14 * VLEN_256);
+
+    // word 15 = v22 >>> 18 | v23 << 3 | v24 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 15 * VLEN_256);
+
+    // word 16 = v24 >>> 8 | v25 << 13
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 13));
+    packed.intoArray(out, outpos + 16 * VLEN_256);
+
+    // word 17 = v25 >>> 19 | v26 << 2 | v27 << 23
+    packed = src.lanewise(VectorOperators.LSHR, 19);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 23));
+    packed.intoArray(out, outpos + 17 * VLEN_256);
+
+    // word 18 = v27 >>> 9 | v28 << 12
+    packed = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    packed.intoArray(out, outpos + 18 * VLEN_256);
+
+    // word 19 = v28 >>> 20 | v29 << 1 | v30 << 22
+    packed = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos + 19 * VLEN_256);
+
+    // word 20 = v30 >>> 10 | v31 << 11
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 11));
+    packed.intoArray(out, outpos + 20 * VLEN_256);
+  }
+
+  /**
+   * Bit-packs 16 SPECIES_512 vectors of 22-bit values into 11 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 16 * i} for
+   * {@code i = 0..15}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_512}. Within each lane the values are laid end
+   * to end, 22 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 22 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask22(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 22
+    IntVector src = IntVector.fromArray(SPECIES_512, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 10 | v2 << 12
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    packed.intoArray(out, outpos + VLEN_512);
+
+    // word 2 = v2 >>> 20 | v3 << 2 | v4 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 2 * VLEN_512);
+
+    // word 3 = v4 >>> 8 | v5 << 14
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    packed.intoArray(out, outpos + 3 * VLEN_512);
+
+    // word 4 = v5 >>> 18 | v6 << 4 | v7 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 4 * VLEN_512);
+
+    // word 5 = v7 >>> 6 | v8 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 5 * VLEN_512);
+
+    // word 6 = v8 >>> 16 | v9 << 6 | v10 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 6 * VLEN_512);
+
+    // word 7 = v10 >>> 4 | v11 << 18
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos + 7 * VLEN_512);
+
+    // word 8 = v11 >>> 14 | v12 << 8 | v13 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 8 * VLEN_512);
+
+    // word 9 = v13 >>> 2 | v14 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 9 * VLEN_512);
+
+    // word 10 = v14 >>> 12 | v15 << 10
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    packed.intoArray(out, outpos + 10 * VLEN_512);
+  }
+
+  /**
+   * Bit-packs 32 SPECIES_256 vectors of 23-bit values into 23 output vectors.
+   *
+   * <p>Input vectors are read from {@code in} at {@code inpos + 8 * i} for
+   * {@code i = 0..31}; output vector {@code k} is written to {@code out} at
+   * {@code outpos + k * VLEN_256}. Within each lane the values are laid end
+   * to end, 23 bits apiece; a value that straddles a 32-bit boundary has its
+   * upper part carried into the next output vector.
+   *
+   * <p>Inputs are not masked: bits above the low 23 of an input are not
+   * cleared and can spill into adjacent fields.
+   *
+   * @param in     array holding the values to pack
+   * @param inpos  index in {@code in} of the first input vector
+   * @param out    array receiving the packed words
+   * @param outpos index in {@code out} of the first output vector
+   */
+  private static void fastpackNoMask23(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // word 0 = v0 | v1 << 23
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src;
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 23));
+    packed.intoArray(out, outpos);
+
+    // word 1 = v1 >>> 9 | v2 << 14
+    packed = src.lanewise(VectorOperators.LSHR, 9);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 14));
+    packed.intoArray(out, outpos + VLEN_256);
+
+    // word 2 = v2 >>> 18 | v3 << 5 | v4 << 28
+    packed = src.lanewise(VectorOperators.LSHR, 18);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 28));
+    packed.intoArray(out, outpos + 2 * VLEN_256);
+
+    // word 3 = v4 >>> 4 | v5 << 19
+    packed = src.lanewise(VectorOperators.LSHR, 4);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 19));
+    packed.intoArray(out, outpos + 3 * VLEN_256);
+
+    // word 4 = v5 >>> 13 | v6 << 10
+    packed = src.lanewise(VectorOperators.LSHR, 13);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 10));
+    packed.intoArray(out, outpos + 4 * VLEN_256);
+
+    // word 5 = v6 >>> 22 | v7 << 1 | v8 << 24
+    packed = src.lanewise(VectorOperators.LSHR, 22);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 24));
+    packed.intoArray(out, outpos + 5 * VLEN_256);
+
+    // word 6 = v8 >>> 8 | v9 << 15
+    packed = src.lanewise(VectorOperators.LSHR, 8);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 15));
+    packed.intoArray(out, outpos + 6 * VLEN_256);
+
+    // word 7 = v9 >>> 17 | v10 << 6 | v11 << 29
+    packed = src.lanewise(VectorOperators.LSHR, 17);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 29));
+    packed.intoArray(out, outpos + 7 * VLEN_256);
+
+    // word 8 = v11 >>> 3 | v12 << 20
+    packed = src.lanewise(VectorOperators.LSHR, 3);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 20));
+    packed.intoArray(out, outpos + 8 * VLEN_256);
+
+    // word 9 = v12 >>> 12 | v13 << 11
+    packed = src.lanewise(VectorOperators.LSHR, 12);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 11));
+    packed.intoArray(out, outpos + 9 * VLEN_256);
+
+    // word 10 = v13 >>> 21 | v14 << 2 | v15 << 25
+    packed = src.lanewise(VectorOperators.LSHR, 21);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 25));
+    packed.intoArray(out, outpos + 10 * VLEN_256);
+
+    // word 11 = v15 >>> 7 | v16 << 16
+    packed = src.lanewise(VectorOperators.LSHR, 7);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 16));
+    packed.intoArray(out, outpos + 11 * VLEN_256);
+
+    // word 12 = v16 >>> 16 | v17 << 7 | v18 << 30
+    packed = src.lanewise(VectorOperators.LSHR, 16);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 7));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 30));
+    packed.intoArray(out, outpos + 12 * VLEN_256);
+
+    // word 13 = v18 >>> 2 | v19 << 21
+    packed = src.lanewise(VectorOperators.LSHR, 2);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 21));
+    packed.intoArray(out, outpos + 13 * VLEN_256);
+
+    // word 14 = v19 >>> 11 | v20 << 12
+    packed = src.lanewise(VectorOperators.LSHR, 11);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 12));
+    packed.intoArray(out, outpos + 14 * VLEN_256);
+
+    // word 15 = v20 >>> 20 | v21 << 3 | v22 << 26
+    packed = src.lanewise(VectorOperators.LSHR, 20);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 26));
+    packed.intoArray(out, outpos + 15 * VLEN_256);
+
+    // word 16 = v22 >>> 6 | v23 << 17
+    packed = src.lanewise(VectorOperators.LSHR, 6);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 17));
+    packed.intoArray(out, outpos + 16 * VLEN_256);
+
+    // word 17 = v23 >>> 15 | v24 << 8 | v25 << 31
+    packed = src.lanewise(VectorOperators.LSHR, 15);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 8));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 31));
+    packed.intoArray(out, outpos + 17 * VLEN_256);
+
+    // word 18 = v25 >>> 1 | v26 << 22
+    packed = src.lanewise(VectorOperators.LSHR, 1);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 22));
+    packed.intoArray(out, outpos + 18 * VLEN_256);
+
+    // word 19 = v26 >>> 10 | v27 << 13
+    packed = src.lanewise(VectorOperators.LSHR, 10);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 13));
+    packed.intoArray(out, outpos + 19 * VLEN_256);
+
+    // word 20 = v27 >>> 19 | v28 << 4 | v29 << 27
+    packed = src.lanewise(VectorOperators.LSHR, 19);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 27));
+    packed.intoArray(out, outpos + 20 * VLEN_256);
+
+    // word 21 = v29 >>> 5 | v30 << 18
+    packed = src.lanewise(VectorOperators.LSHR, 5);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 18));
+    packed.intoArray(out, outpos + 21 * VLEN_256);
+
+    // word 22 = v30 >>> 14 | v31 << 9
+    packed = src.lanewise(VectorOperators.LSHR, 14);
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    packed = packed.or(src.lanewise(VectorOperators.LSHL, 9));
+    packed.intoArray(out, outpos + 22 * VLEN_256);
+  }
+
+  /**
+   * Packs 256 ints from {@code in} at 24 bits per value using 512-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Every lane is packed independently: the vector loaded at
+   * {@code inpos + 16*i} supplies the i-th 24-bit field of each lane's bit
+   * stream. Four 24-bit fields fill exactly three 32-bit words, so the same
+   * three-store pattern is repeated four times (12 stores in total) and each
+   * repetition starts on a fresh output word with nothing carried over.
+   *
+   * <p>Because no mask is applied, bits set above the low 24 of an input are
+   * OR-ed into the neighbouring field.
+   *
+   * @param in     source array; 16 vectors are loaded starting at
+   *               {@code inpos}, 16 ints apart
+   * @param inpos  index of the first input value
+   * @param out    destination array; 12 vectors are stored starting at
+   *               {@code outpos}, {@code VLEN_512} ints apart
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask24(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // Group 1: fields 0..3 -> output words 0..2.
+    var oV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Carry the 16 bits of the field that did not fit in the stored word.
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 2: fields 4..7 -> output words 3..5. The previous word ended
+    // exactly on a field boundary, so start directly from the next load.
+    oV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 3: fields 8..11 -> output words 6..8.
+    oV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 4: fields 12..15 -> output words 9..11.
+    oV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 256 ints from {@code in} at 25 bits apiece using 256-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Lanes are packed independently: the vector loaded at
+   * {@code inpos + 8*i} contributes the i-th 25-bit field of every lane's
+   * bit stream. A field is OR-ed into the pending word at its bit offset;
+   * once the word is full it is stored, and the part of the field that did
+   * not fit is carried into the next word with an unsigned right shift.
+   * 32 vectors are loaded and 25 are stored.
+   *
+   * @param in     source array; vectors are loaded from {@code inpos} to
+   *               {@code inpos + 248} in steps of 8
+   * @param inpos  index of the first input value
+   * @param out    destination array; stores begin at {@code outpos} and
+   *               advance by {@code VLEN_256} each time
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask25(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc.or(src.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc.or(src.lanewise(VectorOperators.LSHL, 11)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 21);
+
+    // This field fits entirely inside the pending word: accumulate, no store.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc.or(src.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc.or(src.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 17);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc.or(src.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 13);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc.or(src.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 9);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc.or(src.lanewise(VectorOperators.LSHL, 9)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 23);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc.or(src.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc.or(src.lanewise(VectorOperators.LSHL, 13)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 19);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc.or(src.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc.or(src.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 15);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc.or(src.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 11);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+
+    // The last field ends exactly on a word boundary; nothing is carried.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc.or(src.lanewise(VectorOperators.LSHL, 7)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 256 ints from {@code in} at 26 bits apiece using 512-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Lanes are packed independently: the vector loaded at
+   * {@code inpos + 16*i} contributes the i-th 26-bit field of every lane's
+   * bit stream. A field is OR-ed into the pending word at its bit offset;
+   * once the word is full it is stored, and the part of the field that did
+   * not fit is carried into the next word with an unsigned right shift.
+   * 16 vectors are loaded and 13 are stored.
+   *
+   * @param in     source array; vectors are loaded from {@code inpos} to
+   *               {@code inpos + 240} in steps of 16
+   * @param inpos  index of the first input value
+   * @param out    destination array; stores begin at {@code outpos} and
+   *               advance by {@code VLEN_512} each time
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask26(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+    IntVector src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+
+    // This field fits entirely inside the pending word: accumulate, no store.
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos);
+    outpos += VLEN_512;
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+
+    // The last field ends exactly on a word boundary; nothing is carried.
+    src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 256 ints from {@code in} at 27 bits apiece using 256-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Lanes are packed independently: the vector loaded at
+   * {@code inpos + 8*i} contributes the i-th 27-bit field of every lane's
+   * bit stream. A field is OR-ed into the pending word at its bit offset;
+   * once the word is full it is stored, and the part of the field that did
+   * not fit is carried into the next word with an unsigned right shift.
+   * 32 vectors are loaded and 27 are stored.
+   *
+   * @param in     source array; vectors are loaded from {@code inpos} to
+   *               {@code inpos + 248} in steps of 8
+   * @param inpos  index of the first input value
+   * @param out    destination array; stores begin at {@code outpos} and
+   *               advance by {@code VLEN_256} each time
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask27(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc.or(src.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc.or(src.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 15);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc.or(src.lanewise(VectorOperators.LSHL, 7)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 25);
+
+    // This field fits entirely inside the pending word: accumulate, no store.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc.or(src.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc.or(src.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 13);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc.or(src.lanewise(VectorOperators.LSHL, 9)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 23);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc.or(src.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc.or(src.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 11);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc.or(src.lanewise(VectorOperators.LSHL, 11)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 21);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 26);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc.or(src.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 9);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc.or(src.lanewise(VectorOperators.LSHL, 13)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 19);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc.or(src.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc.or(src.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 17);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+
+    // The last field ends exactly on a word boundary; nothing is carried.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc.or(src.lanewise(VectorOperators.LSHL, 5)).intoArray(out, outpos);
+  }
+
+  /**
+   * Packs 256 ints from {@code in} at 28 bits per value using 512-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Every lane is packed independently: the vector loaded at
+   * {@code inpos + 16*i} supplies the i-th 28-bit field of each lane's bit
+   * stream. Eight 28-bit fields fill exactly seven 32-bit words, so the same
+   * seven-store pattern runs twice (14 stores in total) and the second run
+   * starts on a fresh output word with nothing carried over.
+   *
+   * <p>Because no mask is applied, bits set above the low 28 of an input are
+   * OR-ed into the neighbouring field.
+   *
+   * @param in     source array; 16 vectors are loaded starting at
+   *               {@code inpos}, 16 ints apart
+   * @param inpos  index of the first input value
+   * @param out    destination array; 14 vectors are stored starting at
+   *               {@code outpos}, {@code VLEN_512} ints apart
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask28(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    // First half: fields 0..7 -> output words 0..6.
+    var oV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Carry the 24 bits of the field that did not fit in the stored word.
+    oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Second half: fields 8..15 -> output words 7..13. The previous word
+    // ended exactly on a field boundary, so start directly from the next load.
+    oV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+  }
+
+  /**
+   * Bit-packs 256 ints from {@code in} at 29 bits apiece using 256-bit
+   * vectors, without masking the inputs first.
+   *
+   * <p>Lanes are packed independently: the vector loaded at
+   * {@code inpos + 8*i} contributes the i-th 29-bit field of every lane's
+   * bit stream. A field is OR-ed into the pending word at its bit offset;
+   * once the word is full it is stored, and the part of the field that did
+   * not fit is carried into the next word with an unsigned right shift.
+   * 32 vectors are loaded and 29 are stored.
+   *
+   * @param in     source array; vectors are loaded from {@code inpos} to
+   *               {@code inpos + 248} in steps of 8
+   * @param inpos  index of the first input value
+   * @param out    destination array; stores begin at {@code outpos} and
+   *               advance by {@code VLEN_256} each time
+   * @param outpos index at which the first packed vector is stored
+   */
+  private static void fastpackNoMask29(final int[] in, int inpos,
+                                       final int[] out, int outpos) {
+    IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    acc.or(src.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 3);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 6);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    acc.or(src.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 9);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 12);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    acc.or(src.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 15);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 18);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    acc.or(src.lanewise(VectorOperators.LSHL, 11)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 21);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 24);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    acc.or(src.lanewise(VectorOperators.LSHL, 5)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 27);
+
+    // This field fits entirely inside the pending word: accumulate, no store.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    acc.or(src.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 1);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 4);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    acc.or(src.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 7);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 10);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    acc.or(src.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 13);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 16);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    acc.or(src.lanewise(VectorOperators.LSHL, 13)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 19);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 22);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    acc.or(src.lanewise(VectorOperators.LSHL, 7)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 25);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    acc.or(src.lanewise(VectorOperators.LSHL, 4)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 28);
+
+    // Whole field fits in the pending word.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 2);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    acc.or(src.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 5);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 8);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    acc.or(src.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 11);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 14);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    acc.or(src.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 17);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 20);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    acc.or(src.lanewise(VectorOperators.LSHL, 9)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 23);
+
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos);
+    outpos += VLEN_256;
+    acc = src.lanewise(VectorOperators.LSHR, 26);
+
+    // The last field ends exactly on a word boundary; nothing is carried.
+    src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    acc.or(src.lanewise(VectorOperators.LSHL, 3)).intoArray(out, outpos);
+  }
+
+  private static void fastpackNoMask30(final int[] in, int inpos,
+                                       final int[] out, int outpos) { // 30 bits/value: 16 loads -> 15 stores
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV;
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+    // Word 0 = first vector as loaded (no mask applied) | second vector << 30.
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+    oV = iV.lanewise(VectorOperators.LSHR, 2);
+    // The >>> 2 above discards the 2 bits already stored; the rest opens the next word.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+    // Same pattern to the end: left shift falls by 2 per step, carry shift grows by 2.
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+    // NOTE(review): nothing here masks the inputs; presumably callers pass values < 2^30.
+    oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+    oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+    // 15th and last store; outpos is not advanced after it.
+    oV.intoArray(out, outpos);
+  }
+
+  private static void fastpackNoMask31(final int[] in, int inpos,
+                                       final int[] out, int outpos) { // 31 bits/value: 32 loads -> 31 stores
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    var oV = iV;
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+    // Word 0 = first vector as loaded (no mask applied) | second vector << 31.
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+    // NOTE(review): nothing here masks the inputs; presumably callers pass values < 2^31.
+    oV = iV.lanewise(VectorOperators.LSHR, 1);
+    // The >>> 1 above discards the 1 bit already stored; the rest opens the next word.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+    // Same pattern to the end: left shift falls by 1 per step, carry shift grows by 1.
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+    oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+    // 31st and last store; outpos is not advanced after it.
+    oV.intoArray(out, outpos);
+  }
+
+  private static void fastunpack1(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 1 bit/value: 1 load -> 32 stores
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+    // Output k (k = 1..31): shift every lane right by k, then apply MASK_1 (defined elsewhere).
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos);
+    outpos += VLEN_256; // outpos is a local parameter and is not read again
+  }
+
+  private static void fastunpack2(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 2 bits/value: 1 load -> 16 stores
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+    // Each later output shifts the same loaded vector right by 2 more bits, then applies MASK_2.
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos);
+    outpos += VLEN_512; // outpos is a local parameter and is not read again
+  }
+
+  private static void fastunpack3(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 3 bits/value: 3 loads -> 32 stores
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+    // Fields lying wholly inside a lane: shift right to bit 0, then apply MASK_3.
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 15).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 21).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 27).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_3);
+    // 11th output straddles two words: 2 bits from above, bit 0 of the next word at bit 2.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 19).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 25).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 28).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_3);
+    // 22nd output: 1 bit from above, low 2 bits of the next word placed at bits 1-2.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 17).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 23).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 26).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 29).and(MASK_3).intoArray(out, outpos);
+    outpos += VLEN_256; // outpos is a local parameter and is not read again
+  }
+
+  // Unpacks 4-bit fields: two SPECIES_512 vectors (at inpos and inpos + 16) are
+  // expanded into 16 output vectors, 8 fields per 32-bit lane, stored VLEN_512
+  // ints apart starting at outpos.
+  private static void fastunpack4(final int[] in, int inpos, final int[] out,
+                                  int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // 8 * 4 == 32: no field straddles two words, so the 9th output is just the
+    // low nibble of the second vector; there are no carried bits to OR in.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    iV.and(0xf).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  private static void fastunpack5(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 5 bits/value: 5 loads -> 32 stores
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+    // Fields lying wholly inside a lane: shift right to bit 0, then apply MASK_5.
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5);
+    // 7th output straddles words: 2 bits from above + low 3 bits of the next word at bits 2-4.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5);
+    // 13th output: 4 bits from above + bit 0 of the next word placed at bit 4.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5);
+    // 20th output: 1 bit from above + low 4 bits of the next word placed at bits 1-4.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5);
+    // 26th output: 3 bits from above + low 2 bits of the next word placed at bits 3-4.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos);
+    outpos += VLEN_256; // outpos is a local parameter and is not read again
+  }
+
+  private static void fastunpack6(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 6 bits/value: 3 loads -> 16 stores
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+    // Fields lying wholly inside a lane: shift right to bit 0, then apply MASK_6.
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6);
+    // 6th output straddles words: 2 bits from above + low 4 bits of the next word at bits 2-5.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6);
+    // 11th output: 4 bits from above + low 2 bits of the next word placed at bits 4-5.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos);
+    outpos += VLEN_512; // outpos is a local parameter and is not read again
+  }
+
+  private static void fastunpack7(final int[] in, int inpos, final int[] out,
+                                  int outpos) { // 7 bits/value: 7 loads -> 32 stores
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+    // Fields lying wholly inside a lane: shift right to bit 0, then apply MASK_7.
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7);
+    // 5th output straddles words: 4 bits from above + low 3 bits of the next word at bits 4-6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7);
+    // 10th output: 1 bit from above + low 6 bits of the next word placed at bits 1-6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7);
+    // 14th output: 5 bits from above + low 2 bits of the next word placed at bits 5-6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7);
+    // 19th output: 2 bits from above + low 5 bits of the next word placed at bits 2-6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7);
+    // 23rd output: 6 bits from above + bit 0 of the next word placed at bit 6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_7);
+    // 28th output: 3 bits from above + low 4 bits of the next word placed at bits 3-6.
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos);
+    outpos += VLEN_256; // outpos is a local parameter and is not read again
+  }
+
+  /**
+   * Lane-wise unpack of 8-bit fields: each of the 4 vectors loaded from
+   * {@code in} is stored 4 times, shifted right by 0, 8, 16 and 24 bits.
+   *
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack8(final int[] in, int inpos, final int[] out,
+                                  int outpos) {
+    // Load 0. 8 divides 32, so no field straddles two loads: no carry needed.
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 1.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    iV.and(0xff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 2.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    iV.and(0xff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 3.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    iV.and(0xff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  /**
+   * Lane-wise unpack of 9-bit fields: 9 vector loads feed 32 vector stores.
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack9(final int[] in, int inpos, final int[] out,
+                                  int outpos) {
+    int dst = outpos;
+    var word = IntVector.fromArray(SPECIES_256, in, inpos);
+    word.and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 5 bits from this load, high 4 bits from the next load.
+    var carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    carry = word.and(0xf).lanewise(VectorOperators.LSHL, 5).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 1 bit from this load, high 8 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    carry = word.and(0xff).lanewise(VectorOperators.LSHL, 1).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 6 bits from this load, high 3 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    carry = word.and(7).lanewise(VectorOperators.LSHL, 6).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 2 bits from this load, high 7 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    carry = word.and(0x7f).lanewise(VectorOperators.LSHL, 2).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 7 bits from this load, high 2 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    carry = word.and(3).lanewise(VectorOperators.LSHL, 7).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 3 bits from this load, high 6 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    carry = word.and(0x3f).lanewise(VectorOperators.LSHL, 3).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 8 bits from this load, high 1 bit from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    carry = word.and(1).lanewise(VectorOperators.LSHL, 8).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 4 bits from this load, high 5 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_9);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    carry = word.and(0x1f).lanewise(VectorOperators.LSHL, 4).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, dst);
+    dst += VLEN_256;
+  }
+
+  /**
+   * Lane-wise unpack of 10-bit fields: 5 vector loads feed 16 vector stores.
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack10(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    int dst = outpos;
+    var word = IntVector.fromArray(SPECIES_512, in, inpos);
+    word.and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    var carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_10);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    carry = word.and(0xff).lanewise(VectorOperators.LSHL, 2).or(carry);
+    carry.intoArray(out, dst); // low 2 bits: old load, high 8 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_10);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    carry = word.and(0x3f).lanewise(VectorOperators.LSHL, 4).or(carry);
+    carry.intoArray(out, dst); // low 4 bits: old load, high 6 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_10);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    carry = word.and(0xf).lanewise(VectorOperators.LSHL, 6).or(carry);
+    carry.intoArray(out, dst); // low 6 bits: old load, high 4 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_10);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    carry = word.and(3).lanewise(VectorOperators.LSHL, 8).or(carry);
+    carry.intoArray(out, dst); // low 8 bits: old load, high 2 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, dst);
+    dst += VLEN_512;
+  }
+
+  private static void fastunpack11(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Lane-wise unpack of 12-bit fields: 6 vector loads feed 16 vector stores.
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack12(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Split field: low 8 bits from this load, high 4 bits from the next load.
+    var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV);
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Split field: low 4 bits from this load, high 8 bits from the next load.
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV);
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The previous load was consumed exactly, so nothing is carried over.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    iV.and(0xfff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Split field: low 8 bits from this load, high 4 bits from the next load.
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV);
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Split field: low 4 bits from this load, high 8 bits from the next load.
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV);
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  /**
+   * Lane-wise unpack of 13-bit fields: 13 vector loads from {@code in} feed
+   * 32 vector stores to {@code out}, advancing VLEN_256 ints per store. A
+   * field straddling two loads is the top bits left in the old load OR-ed
+   * with the bottom bits of the new load shifted left past them.
+   *
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack13(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    int dst = outpos;
+    var word = IntVector.fromArray(SPECIES_256, in, inpos);
+    word.and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 6 bits from this load, high 7 bits from the next load.
+    var carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    carry = word.and(0x7f).lanewise(VectorOperators.LSHL, 6).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 12 bits from this load, high 1 bit from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    carry = word.and(1).lanewise(VectorOperators.LSHL, 12).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 5 bits from this load, high 8 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    carry = word.and(0xff).lanewise(VectorOperators.LSHL, 5).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 11 bits from this load, high 2 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    carry = word.and(3).lanewise(VectorOperators.LSHL, 11).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 4 bits from this load, high 9 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    carry = word.and(0x1ff).lanewise(VectorOperators.LSHL, 4).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 10 bits from this load, high 3 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    carry = word.and(7).lanewise(VectorOperators.LSHL, 10).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 3 bits from this load, high 10 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    carry = word.and(0x3ff).lanewise(VectorOperators.LSHL, 3).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 9 bits from this load, high 4 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    carry = word.and(0xf).lanewise(VectorOperators.LSHL, 9).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 2 bits from this load, high 11 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    carry = word.and(0x7ff).lanewise(VectorOperators.LSHL, 2).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 8 bits from this load, high 5 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    carry = word.and(0x1f).lanewise(VectorOperators.LSHL, 8).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 1 bit from this load, high 12 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    carry = word.and(0xfff).lanewise(VectorOperators.LSHL, 1).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 7 bits from this load, high 6 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_13);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    carry = word.and(0x3f).lanewise(VectorOperators.LSHL, 7).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, dst);
+    dst += VLEN_256;
+  }
+
+  /**
+   * Lane-wise unpack of 14-bit fields: 7 vector loads from {@code in} feed
+   * 16 vector stores to {@code out}, advancing VLEN_512 ints per store. A
+   * field straddling two loads is the top bits left in the old load OR-ed
+   * with the bottom bits of the new load shifted left past them.
+   *
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack14(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    int dst = outpos;
+    var word = IntVector.fromArray(SPECIES_512, in, inpos);
+    word.and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    var carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    carry = word.and(0x3ff).lanewise(VectorOperators.LSHL, 4).or(carry);
+    carry.intoArray(out, dst); // low 4 bits: old load, high 10 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    carry = word.and(0x3f).lanewise(VectorOperators.LSHL, 8).or(carry);
+    carry.intoArray(out, dst); // low 8 bits: old load, high 6 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    carry = word.and(3).lanewise(VectorOperators.LSHL, 12).or(carry);
+    carry.intoArray(out, dst); // low 12 bits: old load, high 2 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    carry = word.and(0xfff).lanewise(VectorOperators.LSHL, 2).or(carry);
+    carry.intoArray(out, dst); // low 2 bits: old load, high 12 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    carry = word.and(0xff).lanewise(VectorOperators.LSHL, 6).or(carry);
+    carry.intoArray(out, dst); // low 6 bits: old load, high 8 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_14);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    carry = word.and(0xf).lanewise(VectorOperators.LSHL, 10).or(carry);
+    carry.intoArray(out, dst); // low 10 bits: old load, high 4 bits: new load
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+
+    word.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, dst);
+    dst += VLEN_512;
+  }
+
+  /**
+   * Lane-wise unpack of 15-bit fields: 15 vector loads from {@code in} feed
+   * 32 vector stores to {@code out}, advancing VLEN_256 ints per store.
+   * Fields are consumed from the least significant bits of each lane
+   * upwards. A field straddling two loads is the top bits left in the old
+   * load OR-ed with the bottom bits of the new load shifted left past
+   * them.
+   *
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack15(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    int dst = outpos;
+    var word = IntVector.fromArray(SPECIES_256, in, inpos);
+    word.and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 2 bits from this load, high 13 bits from the next load.
+    var carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    carry = word.and(0x1fff).lanewise(VectorOperators.LSHL, 2).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 4 bits from this load, high 11 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    carry = word.and(0x7ff).lanewise(VectorOperators.LSHL, 4).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 6 bits from this load, high 9 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    carry = word.and(0x1ff).lanewise(VectorOperators.LSHL, 6).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 8 bits from this load, high 7 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    carry = word.and(0x7f).lanewise(VectorOperators.LSHL, 8).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 10 bits from this load, high 5 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    carry = word.and(0x1f).lanewise(VectorOperators.LSHL, 10).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 12 bits from this load, high 3 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    carry = word.and(7).lanewise(VectorOperators.LSHL, 12).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 14 bits from this load, high 1 bit from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    carry = word.and(1).lanewise(VectorOperators.LSHL, 14).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 1 bit from this load, high 14 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    carry = word.and(0x3fff).lanewise(VectorOperators.LSHL, 1).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 3 bits from this load, high 12 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    carry = word.and(0xfff).lanewise(VectorOperators.LSHL, 3).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 5 bits from this load, high 10 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    carry = word.and(0x3ff).lanewise(VectorOperators.LSHL, 5).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 7 bits from this load, high 8 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    carry = word.and(0xff).lanewise(VectorOperators.LSHL, 7).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 9 bits from this load, high 6 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    carry = word.and(0x3f).lanewise(VectorOperators.LSHL, 9).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 11 bits from this load, high 4 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    carry = word.and(0xf).lanewise(VectorOperators.LSHL, 11).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    // Split field: low 13 bits from this load, high 2 bits from the next load.
+    carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_15);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    carry = word.and(3).lanewise(VectorOperators.LSHL, 13).or(carry);
+    carry.intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+
+    word.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, dst);
+    dst += VLEN_256;
+  }
+
+  /**
+   * Lane-wise unpack of 16-bit fields: 8 vector loads from {@code in} feed
+   * 16 vector stores to {@code out}, advancing VLEN_512 ints per store.
+   * Every load is stored twice: once masked to its low part and once shifted
+   * right by 16 bits. Because 16 divides 32 no field straddles two loads, so
+   * each load is handled on its own without carrying bits from the previous
+   * one.
+   *
+   * @param in     packed source array
+   * @param inpos  offset in {@code in} of the first packed vector
+   * @param out    destination array for the unpacked vectors
+   * @param outpos offset in {@code out} of the first stored vector
+   */
+  private static void fastunpack16(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // Load 0: masked low part first, then the upper 16 bits of every lane.
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+
+    iV.and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 1.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 2.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 3.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 4.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 5.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 6.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Load 7.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+
+    iV.and(0xffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  /**
+   * Unpacks 32 vectors of 17-bit values from 17 packed input vectors.
+   *
+   * <p>In the comments, "word i" is the {@code SPECIES_256} vector loaded at
+   * {@code inpos + 8 * i}. Per lane, words 0..16 are read as one bit stream,
+   * lowest bit first, and cut into consecutive 17-bit fields; field j of
+   * every lane is stored as one vector at {@code outpos + j * VLEN_256}.
+   * A field that crosses a word boundary is rebuilt from the top bits of one
+   * word and the low bits of the next. {@code MASK_17} is declared elsewhere
+   * in the class; presumably it keeps the low 17 bits - confirm.
+   *
+   * @param in     packed source words
+   * @param inpos  index in {@code in} of word 0
+   * @param out    destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked vector
+   */
+  private static void fastunpack17(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // value 0: bits 0..16 of word 0
+    var word = IntVector.fromArray(SPECIES_256, in, inpos);
+    word.and(MASK_17).intoArray(out, outpos);
+
+    // value 1: low 15 bits = top of word 0, high 2 bits = bottom of word 1
+    var carry = word.lanewise(VectorOperators.LSHR, 17).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    carry.or(word.and(3).lanewise(VectorOperators.LSHL, 15))
+        .intoArray(out, outpos + VLEN_256);
+
+    // value 2: bits 2..18 of word 1
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_17)
+        .intoArray(out, outpos + 2 * VLEN_256);
+
+    // value 3: low 13 bits = top of word 1, high 4 bits = bottom of word 2
+    carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 13))
+        .intoArray(out, outpos + 3 * VLEN_256);
+
+    // value 4: bits 4..20 of word 2
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_17)
+        .intoArray(out, outpos + 4 * VLEN_256);
+
+    // value 5: low 11 bits = top of word 2, high 6 bits = bottom of word 3
+    carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 11))
+        .intoArray(out, outpos + 5 * VLEN_256);
+
+    // value 6: bits 6..22 of word 3
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_17)
+        .intoArray(out, outpos + 6 * VLEN_256);
+
+    // value 7: low 9 bits = top of word 3, high 8 bits = bottom of word 4
+    carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 9))
+        .intoArray(out, outpos + 7 * VLEN_256);
+
+    // value 8: bits 8..24 of word 4
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_17)
+        .intoArray(out, outpos + 8 * VLEN_256);
+
+    // value 9: low 7 bits = top of word 4, high 10 bits = bottom of word 5
+    carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 7))
+        .intoArray(out, outpos + 9 * VLEN_256);
+
+    // value 10: bits 10..26 of word 5
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_17)
+        .intoArray(out, outpos + 10 * VLEN_256);
+
+    // value 11: low 5 bits = top of word 5, high 12 bits = bottom of word 6
+    carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 5))
+        .intoArray(out, outpos + 11 * VLEN_256);
+
+    // value 12: bits 12..28 of word 6
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_17)
+        .intoArray(out, outpos + 12 * VLEN_256);
+
+    // value 13: low 3 bits = top of word 6, high 14 bits = bottom of word 7
+    carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 3))
+        .intoArray(out, outpos + 13 * VLEN_256);
+
+    // value 14: bits 14..30 of word 7
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_17)
+        .intoArray(out, outpos + 14 * VLEN_256);
+
+    // value 15: low 1 bit = top of word 7, high 16 bits = bottom of word 8
+    // (word 8 holds no complete value; its top 16 bits begin value 16)
+    carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 1))
+        .intoArray(out, outpos + 15 * VLEN_256);
+
+    // value 16: low 16 bits = top of word 8, high 1 bit = bottom of word 9
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    carry.or(word.and(1).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 16 * VLEN_256);
+
+    // value 17: bits 1..17 of word 9
+    word.lanewise(VectorOperators.LSHR, 1).and(MASK_17)
+        .intoArray(out, outpos + 17 * VLEN_256);
+
+    // value 18: low 14 bits = top of word 9, high 3 bits = bottom of word 10
+    carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    carry.or(word.and(7).lanewise(VectorOperators.LSHL, 14))
+        .intoArray(out, outpos + 18 * VLEN_256);
+
+    // value 19: bits 3..19 of word 10
+    word.lanewise(VectorOperators.LSHR, 3).and(MASK_17)
+        .intoArray(out, outpos + 19 * VLEN_256);
+
+    // value 20: low 12 bits = top of word 10, high 5 bits = bottom of word 11
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    carry.or(word.and(0x1f).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + 20 * VLEN_256);
+
+    // value 21: bits 5..21 of word 11
+    word.lanewise(VectorOperators.LSHR, 5).and(MASK_17)
+        .intoArray(out, outpos + 21 * VLEN_256);
+
+    // value 22: low 10 bits = top of word 11, high 7 bits = bottom of word 12
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    carry.or(word.and(0x7f).lanewise(VectorOperators.LSHL, 10))
+        .intoArray(out, outpos + 22 * VLEN_256);
+
+    // value 23: bits 7..23 of word 12
+    word.lanewise(VectorOperators.LSHR, 7).and(MASK_17)
+        .intoArray(out, outpos + 23 * VLEN_256);
+
+    // value 24: low 8 bits = top of word 12, high 9 bits = bottom of word 13
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    carry.or(word.and(0x1ff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 24 * VLEN_256);
+
+    // value 25: bits 9..25 of word 13
+    word.lanewise(VectorOperators.LSHR, 9).and(MASK_17)
+        .intoArray(out, outpos + 25 * VLEN_256);
+
+    // value 26: low 6 bits = top of word 13, high 11 bits = bottom of word 14
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    carry.or(word.and(0x7ff).lanewise(VectorOperators.LSHL, 6))
+        .intoArray(out, outpos + 26 * VLEN_256);
+
+    // value 27: bits 11..27 of word 14
+    word.lanewise(VectorOperators.LSHR, 11).and(MASK_17)
+        .intoArray(out, outpos + 27 * VLEN_256);
+
+    // value 28: low 4 bits = top of word 14, high 13 bits = bottom of word 15
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    carry.or(word.and(0x1fff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 28 * VLEN_256);
+
+    // value 29: bits 13..29 of word 15
+    word.lanewise(VectorOperators.LSHR, 13).and(MASK_17)
+        .intoArray(out, outpos + 29 * VLEN_256);
+
+    // value 30: low 2 bits = top of word 15, high 15 bits = bottom of word 16
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_17);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    carry.or(word.and(0x7fff).lanewise(VectorOperators.LSHL, 2))
+        .intoArray(out, outpos + 30 * VLEN_256);
+
+    // value 31: bits 15..31 of word 16
+    word.lanewise(VectorOperators.LSHR, 15).and(MASK_17)
+        .intoArray(out, outpos + 31 * VLEN_256);
+  }
+
+  /**
+   * Unpacks 16 vectors of 18-bit values from 9 packed input vectors. "Word i"
+   * in the comments is the vector loaded at {@code inpos + 16 * i}.
+   *
+   * @param in     packed source words
+   * @param inpos  index in {@code in} of word 0
+   * @param out    destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked vector
+   */
+  private static void fastunpack18(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // value 0: bits 0..17 of word 0
+    var word = IntVector.fromArray(SPECIES_512, in, inpos);
+    word.and(MASK_18).intoArray(out, outpos);
+
+    // value 1: low 14 bits = top of word 0, high 4 bits = bottom of word 1
+    var carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 14))
+        .intoArray(out, outpos + VLEN_512);
+
+    // value 2: bits 4..21 of word 1
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_18)
+        .intoArray(out, outpos + 2 * VLEN_512);
+
+    // value 3: low 10 bits = top of word 1, high 8 bits = bottom of word 2
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 10))
+        .intoArray(out, outpos + 3 * VLEN_512);
+
+    // value 4: bits 8..25 of word 2
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_18)
+        .intoArray(out, outpos + 4 * VLEN_512);
+
+    // value 5: low 6 bits = top of word 2, high 12 bits = bottom of word 3
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 6))
+        .intoArray(out, outpos + 5 * VLEN_512);
+
+    // value 6: bits 12..29 of word 3
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_18)
+        .intoArray(out, outpos + 6 * VLEN_512);
+
+    // value 7: low 2 bits = top of word 3, high 16 bits = bottom of word 4
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 2))
+        .intoArray(out, outpos + 7 * VLEN_512);
+
+    // value 8: low 16 bits = top of word 4, high 2 bits = bottom of word 5
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    carry.or(word.and(3).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 8 * VLEN_512);
+
+    // value 9: bits 2..19 of word 5
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_18)
+        .intoArray(out, outpos + 9 * VLEN_512);
+
+    // value 10: low 12 bits = top of word 5, high 6 bits = bottom of word 6
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + 10 * VLEN_512);
+
+    // value 11: bits 6..23 of word 6
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_18)
+        .intoArray(out, outpos + 11 * VLEN_512);
+
+    // value 12: low 8 bits = top of word 6, high 10 bits = bottom of word 7
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 12 * VLEN_512);
+
+    // value 13: bits 10..27 of word 7
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_18)
+        .intoArray(out, outpos + 13 * VLEN_512);
+
+    // value 14: low 4 bits = top of word 7, high 14 bits = bottom of word 8
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_18);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 14 * VLEN_512);
+
+    // value 15: bits 14..31 of word 8
+    word.lanewise(VectorOperators.LSHR, 14).and(MASK_18)
+        .intoArray(out, outpos + 15 * VLEN_512);
+  }
+
+  private static void fastunpack19(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Unpacks 16 vectors of 20-bit values from 10 packed input vectors.
+   *
+   * <p>"Word i" in the comments is the vector loaded at
+   * {@code inpos + 16 * i}. The bit layout repeats after five words (160
+   * bits = eight values), so value 8 starts exactly at bit 0 of word 5.
+   *
+   * @param in     packed source words
+   * @param inpos  index in {@code in} of word 0
+   * @param out    destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked vector
+   */
+  private static void fastunpack20(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // value 0: bits 0..19 of word 0
+    var word = IntVector.fromArray(SPECIES_512, in, inpos);
+    word.and(MASK_20).intoArray(out, outpos);
+
+    // value 1: low 12 bits = top of word 0, high 8 bits = bottom of word 1
+    var carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + VLEN_512);
+
+    // value 2: bits 8..27 of word 1
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_20)
+        .intoArray(out, outpos + 2 * VLEN_512);
+
+    // value 3: low 4 bits = top of word 1, high 16 bits = bottom of word 2
+    // (word 2 holds no complete value; its top 16 bits begin value 4)
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 3 * VLEN_512);
+
+    // value 4: low 16 bits = top of word 2, high 4 bits = bottom of word 3
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 4 * VLEN_512);
+
+    // value 5: bits 4..23 of word 3
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_20)
+        .intoArray(out, outpos + 5 * VLEN_512);
+
+    // value 6: low 8 bits = top of word 3, high 12 bits = bottom of word 4
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 6 * VLEN_512);
+
+    // value 7: bits 12..31 of word 4
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_20)
+        .intoArray(out, outpos + 7 * VLEN_512);
+
+    // value 8: bits 0..19 of word 5; nothing is carried over from word 4
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    word.and(0xfffff).intoArray(out, outpos + 8 * VLEN_512);
+
+    // value 9: low 12 bits = top of word 5, high 8 bits = bottom of word 6
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + 9 * VLEN_512);
+
+    // value 10: bits 8..27 of word 6
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_20)
+        .intoArray(out, outpos + 10 * VLEN_512);
+
+    // value 11: low 4 bits = top of word 6, high 16 bits = bottom of word 7
+    // (word 7 holds no complete value; its top 16 bits begin value 12)
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 11 * VLEN_512);
+
+    // value 12: low 16 bits = top of word 7, high 4 bits = bottom of word 8
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 12 * VLEN_512);
+
+    // value 13: bits 4..23 of word 8
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_20)
+        .intoArray(out, outpos + 13 * VLEN_512);
+
+    // value 14: low 8 bits = top of word 8, high 12 bits = bottom of word 9
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 14 * VLEN_512);
+
+    // value 15: bits 12..31 of word 9
+    word.lanewise(VectorOperators.LSHR, 12).and(MASK_20)
+        .intoArray(out, outpos + 15 * VLEN_512);
+  }
+
+  /**
+   * Unpacks 32 vectors of 21-bit values from 21 packed input vectors.
+   *
+   * <p>In the comments, "word i" is the {@code SPECIES_256} vector loaded at
+   * {@code inpos + 8 * i}. Per lane, words 0..20 are read as one bit stream,
+   * lowest bit first, and cut into consecutive 21-bit fields; field j of
+   * every lane is stored as one vector at {@code outpos + j * VLEN_256}.
+   *
+   * <p>A field that crosses a word boundary is rebuilt from the top bits of
+   * one word (logical right shift) and the low bits of the next (mask, then
+   * left shift). Each word is loaded only after every value that ends in an
+   * earlier word has been stored, so memory is accessed in value order.
+   * {@code MASK_21} is declared elsewhere in the class; presumably it keeps
+   * the low 21 bits - confirm.
+   *
+   * @param in     packed source words
+   * @param inpos  index in {@code in} of word 0
+   * @param out    destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked vector
+   */
+  private static void fastunpack21(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // value 0: bits 0..20 of word 0
+    var word = IntVector.fromArray(SPECIES_256, in, inpos);
+    word.and(MASK_21).intoArray(out, outpos);
+
+    // value 1: low 11 bits = top of word 0, high 10 bits = bottom of word 1
+    var carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 11))
+        .intoArray(out, outpos + VLEN_256);
+
+    // value 2: bits 10..30 of word 1
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_21)
+        .intoArray(out, outpos + 2 * VLEN_256);
+
+    // value 3: low 1 bit = top of word 1, high 20 bits = bottom of word 2
+    // (word 2 holds no complete value; its top 12 bits begin value 4)
+    carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    carry.or(word.and(0xfffff).lanewise(VectorOperators.LSHL, 1))
+        .intoArray(out, outpos + 3 * VLEN_256);
+
+    // value 4: low 12 bits = top of word 2, high 9 bits = bottom of word 3
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    carry.or(word.and(0x1ff).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + 4 * VLEN_256);
+
+    // value 5: bits 9..29 of word 3
+    word.lanewise(VectorOperators.LSHR, 9).and(MASK_21)
+        .intoArray(out, outpos + 5 * VLEN_256);
+
+    // value 6: low 2 bits = top of word 3, high 19 bits = bottom of word 4
+    // (word 4 holds no complete value; its top 13 bits begin value 7)
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    carry.or(word.and(0x7ffff).lanewise(VectorOperators.LSHL, 2))
+        .intoArray(out, outpos + 6 * VLEN_256);
+
+    // value 7: low 13 bits = top of word 4, high 8 bits = bottom of word 5
+    carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 13))
+        .intoArray(out, outpos + 7 * VLEN_256);
+
+    // value 8: bits 8..28 of word 5
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_21)
+        .intoArray(out, outpos + 8 * VLEN_256);
+
+    // value 9: low 3 bits = top of word 5, high 18 bits = bottom of word 6
+    // (word 6 holds no complete value; its top 14 bits begin value 10)
+    carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    carry.or(word.and(0x3ffff).lanewise(VectorOperators.LSHL, 3))
+        .intoArray(out, outpos + 9 * VLEN_256);
+
+    // value 10: low 14 bits = top of word 6, high 7 bits = bottom of word 7
+    carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    carry.or(word.and(0x7f).lanewise(VectorOperators.LSHL, 14))
+        .intoArray(out, outpos + 10 * VLEN_256);
+
+    // value 11: bits 7..27 of word 7
+    word.lanewise(VectorOperators.LSHR, 7).and(MASK_21)
+        .intoArray(out, outpos + 11 * VLEN_256);
+
+    // value 12: low 4 bits = top of word 7, high 17 bits = bottom of word 8
+    // (word 8 holds no complete value; its top 15 bits begin value 13)
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    carry.or(word.and(0x1ffff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 12 * VLEN_256);
+
+    // value 13: low 15 bits = top of word 8, high 6 bits = bottom of word 9
+    carry = word.lanewise(VectorOperators.LSHR, 17).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 15))
+        .intoArray(out, outpos + 13 * VLEN_256);
+
+    // value 14: bits 6..26 of word 9
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_21)
+        .intoArray(out, outpos + 14 * VLEN_256);
+
+    // value 15: low 5 bits = top of word 9, high 16 bits = bottom of word 10
+    // (word 10 holds no complete value; its top 16 bits begin value 16)
+    carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 5))
+        .intoArray(out, outpos + 15 * VLEN_256);
+
+    // value 16: low 16 bits = top of word 10, high 5 bits = bottom of word 11
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    carry.or(word.and(0x1f).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 16 * VLEN_256);
+
+    // value 17: bits 5..25 of word 11
+    word.lanewise(VectorOperators.LSHR, 5).and(MASK_21)
+        .intoArray(out, outpos + 17 * VLEN_256);
+
+    // value 18: low 6 bits = top of word 11, high 15 bits = bottom of word 12
+    // (word 12 holds no complete value; its top 17 bits begin value 19)
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    carry.or(word.and(0x7fff).lanewise(VectorOperators.LSHL, 6))
+        .intoArray(out, outpos + 18 * VLEN_256);
+
+    // value 19: low 17 bits = top of word 12, high 4 bits = bottom of word 13
+    carry = word.lanewise(VectorOperators.LSHR, 15).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 17))
+        .intoArray(out, outpos + 19 * VLEN_256);
+
+    // value 20: bits 4..24 of word 13
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_21)
+        .intoArray(out, outpos + 20 * VLEN_256);
+
+    // value 21: low 7 bits = top of word 13, high 14 bits = bottom of word 14
+    // (word 14 holds no complete value; its top 18 bits begin value 22)
+    carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 7))
+        .intoArray(out, outpos + 21 * VLEN_256);
+
+    // value 22: low 18 bits = top of word 14, high 3 bits = bottom of word 15
+    carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    carry.or(word.and(7).lanewise(VectorOperators.LSHL, 18))
+        .intoArray(out, outpos + 22 * VLEN_256);
+
+    // value 23: bits 3..23 of word 15
+    word.lanewise(VectorOperators.LSHR, 3).and(MASK_21)
+        .intoArray(out, outpos + 23 * VLEN_256);
+
+    // value 24: low 8 bits = top of word 15, high 13 bits = bottom of word 16
+    // (word 16 holds no complete value; its top 19 bits begin value 25)
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    carry.or(word.and(0x1fff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 24 * VLEN_256);
+
+    // value 25: low 19 bits = top of word 16, high 2 bits = bottom of word 17
+    carry = word.lanewise(VectorOperators.LSHR, 13).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    carry.or(word.and(3).lanewise(VectorOperators.LSHL, 19))
+        .intoArray(out, outpos + 25 * VLEN_256);
+
+    // value 26: bits 2..22 of word 17
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_21)
+        .intoArray(out, outpos + 26 * VLEN_256);
+
+    // value 27: low 9 bits = top of word 17, high 12 bits = bottom of word 18
+    // (word 18 holds no complete value; its top 20 bits begin value 28)
+    carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 9))
+        .intoArray(out, outpos + 27 * VLEN_256);
+
+    // value 28: low 20 bits = top of word 18, high 1 bit = bottom of word 19
+    carry = word.lanewise(VectorOperators.LSHR, 12).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    carry.or(word.and(1).lanewise(VectorOperators.LSHL, 20))
+        .intoArray(out, outpos + 28 * VLEN_256);
+
+    // value 29: bits 1..21 of word 19
+    word.lanewise(VectorOperators.LSHR, 1).and(MASK_21)
+        .intoArray(out, outpos + 29 * VLEN_256);
+
+    // value 30: low 10 bits = top of word 19, high 11 bits = bottom of word 20
+    carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_21);
+    word = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    carry.or(word.and(0x7ff).lanewise(VectorOperators.LSHL, 10))
+        .intoArray(out, outpos + 30 * VLEN_256);
+
+    // value 31: bits 11..31 of word 20
+    word.lanewise(VectorOperators.LSHR, 11).and(MASK_21)
+        .intoArray(out, outpos + 31 * VLEN_256);
+  }
+
+  /**
+   * Unpacks 16 vectors of 22-bit values from 11 packed input vectors. "Word i"
+   * in the comments is the {@code SPECIES_512} vector loaded at
+   * {@code inpos + 16 * i}; value j is stored at {@code outpos + j * VLEN_512}.
+   *
+   * @param in     packed source words
+   * @param inpos  index in {@code in} of word 0
+   * @param out    destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked vector
+   */
+  private static void fastunpack22(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // value 0: bits 0..21 of word 0
+    var word = IntVector.fromArray(SPECIES_512, in, inpos);
+    word.and(MASK_22).intoArray(out, outpos);
+
+    // value 1: low 10 bits = top of word 0, high 12 bits = bottom of word 1
+    // (word 1 holds no complete value; its top 20 bits begin value 2)
+    var carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 10))
+        .intoArray(out, outpos + VLEN_512);
+
+    // value 2: low 20 bits = top of word 1, high 2 bits = bottom of word 2
+    carry = word.lanewise(VectorOperators.LSHR, 12).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    carry.or(word.and(3).lanewise(VectorOperators.LSHL, 20))
+        .intoArray(out, outpos + 2 * VLEN_512);
+
+    // value 3: bits 2..23 of word 2
+    word.lanewise(VectorOperators.LSHR, 2).and(MASK_22)
+        .intoArray(out, outpos + 3 * VLEN_512);
+
+    // value 4: low 8 bits = top of word 2, high 14 bits = bottom of word 3
+    // (word 3 holds no complete value; its top 18 bits begin value 5)
+    carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 8))
+        .intoArray(out, outpos + 4 * VLEN_512);
+
+    // value 5: low 18 bits = top of word 3, high 4 bits = bottom of word 4
+    carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 18))
+        .intoArray(out, outpos + 5 * VLEN_512);
+
+    // value 6: bits 4..25 of word 4
+    word.lanewise(VectorOperators.LSHR, 4).and(MASK_22)
+        .intoArray(out, outpos + 6 * VLEN_512);
+
+    // value 7: low 6 bits = top of word 4, high 16 bits = bottom of word 5
+    // (word 5 holds no complete value; its top 16 bits begin value 8)
+    carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 6))
+        .intoArray(out, outpos + 7 * VLEN_512);
+
+    // value 8: low 16 bits = top of word 5, high 6 bits = bottom of word 6
+    carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 16))
+        .intoArray(out, outpos + 8 * VLEN_512);
+
+    // value 9: bits 6..27 of word 6
+    word.lanewise(VectorOperators.LSHR, 6).and(MASK_22)
+        .intoArray(out, outpos + 9 * VLEN_512);
+
+    // value 10: low 4 bits = top of word 6, high 18 bits = bottom of word 7
+    // (word 7 holds no complete value; its top 14 bits begin value 11)
+    carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    carry.or(word.and(0x3ffff).lanewise(VectorOperators.LSHL, 4))
+        .intoArray(out, outpos + 10 * VLEN_512);
+
+    // value 11: low 14 bits = top of word 7, high 8 bits = bottom of word 8
+    carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 14))
+        .intoArray(out, outpos + 11 * VLEN_512);
+
+    // value 12: bits 8..29 of word 8
+    word.lanewise(VectorOperators.LSHR, 8).and(MASK_22)
+        .intoArray(out, outpos + 12 * VLEN_512);
+
+    // value 13: low 2 bits = top of word 8, high 20 bits = bottom of word 9
+    // (word 9 holds no complete value; its top 12 bits begin value 14)
+    carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    carry.or(word.and(0xfffff).lanewise(VectorOperators.LSHL, 2))
+        .intoArray(out, outpos + 13 * VLEN_512);
+
+    // value 14: low 12 bits = top of word 9, high 10 bits = bottom of word 10
+    carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_22);
+    word = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 12))
+        .intoArray(out, outpos + 14 * VLEN_512);
+
+    // value 15: bits 10..31 of word 10
+    word.lanewise(VectorOperators.LSHR, 10).and(MASK_22)
+        .intoArray(out, outpos + 15 * VLEN_512);
+  }
+
+  private static void fastunpack23(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Unpacks 24-bit fields from {@code in} into {@code out} using
+   * {@code SPECIES_512} vectors.
+   *
+   * <p>Twelve input vectors are loaded from {@code inpos}, {@code inpos + 16},
+   * ..., {@code inpos + 176}; sixteen result vectors are stored starting at
+   * {@code outpos}, which advances by {@code VLEN_512} after every store.
+   * Within a lane, consecutive input words are consumed low-order bits first:
+   * every three words yield four 24-bit fields, and that pattern repeats four
+   * times.
+   *
+   * <p>NOTE(review): presumably {@code VLEN_512} is 16 (it matches the input
+   * stride) and {@code MASK_24} is {@code 0xffffff}; both are declared outside
+   * this method - confirm.
+   *
+   * @param in     array holding the packed words
+   * @param inpos  index in {@code in} of the first packed word
+   * @param out    array receiving the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack24(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // Group 1: input words 0-2 -> fields 0-3.
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_24).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The top 8 bits of this word are the low bits of the next field.
+    var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 2: input words 3-5 -> fields 4-7. The field starts on a word
+    // boundary, so there are no carried bits to merge in.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    iV.and(0xffffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 3: input words 6-8 -> fields 8-11 (word-aligned start again).
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    iV.and(0xffffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 4: input words 9-11 -> fields 12-15 (word-aligned start again).
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    iV.and(0xffffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  /**
+   * Unpacks 25-bit fields from {@code in} into {@code out} using
+   * {@code SPECIES_256} vectors.
+   *
+   * <p>Twenty-five input vectors are loaded from {@code inpos},
+   * {@code inpos + 8}, ..., {@code inpos + 192}; thirty-two result vectors are
+   * stored starting at {@code outpos}, which advances by {@code VLEN_256} after
+   * every store. Within a lane, consecutive input words are consumed low-order
+   * bits first; a field that straddles two words is assembled from the high
+   * bits of the earlier word and the low bits of the later one.
+   *
+   * <p>NOTE(review): presumably {@code VLEN_256} is 8 (it matches the input
+   * stride) and {@code MASK_25} keeps the low 25 bits; both are declared
+   * outside this method - confirm.
+   *
+   * @param in     array holding the packed words
+   * @param inpos  index in {@code in} of the first packed word
+   * @param out    array receiving the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack25(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    IntVector packed = IntVector.fromArray(SPECIES_256, in, inpos);
+    packed.and(MASK_25).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // "spill" always holds the leftover high bits of the previous word.
+    IntVector spill = packed.lanewise(VectorOperators.LSHR, 25).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    packed.and(0x3ffff).lanewise(VectorOperators.LSHL, 7).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 18).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    packed.and(0x7ff).lanewise(VectorOperators.LSHL, 14).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 11).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    packed.and(0xf).lanewise(VectorOperators.LSHL, 21).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // Field lies entirely inside the current word.
+    packed.lanewise(VectorOperators.LSHR, 4).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 29).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    packed.and(0x3fffff).lanewise(VectorOperators.LSHL, 3).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 22).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    packed.and(0x7fff).lanewise(VectorOperators.LSHL, 10).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 15).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    packed.and(0xff).lanewise(VectorOperators.LSHL, 17).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 8).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    packed.and(1).lanewise(VectorOperators.LSHL, 24).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 1).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 26).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    packed.and(0x7ffff).lanewise(VectorOperators.LSHL, 6).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 19).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    packed.and(0xfff).lanewise(VectorOperators.LSHL, 13).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 12).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    packed.and(0x1f).lanewise(VectorOperators.LSHL, 20).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 5).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 30).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    packed.and(0x7fffff).lanewise(VectorOperators.LSHL, 2).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 23).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    packed.and(0xffff).lanewise(VectorOperators.LSHL, 9).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 16).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    packed.and(0x1ff).lanewise(VectorOperators.LSHL, 16).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 9).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    packed.and(3).lanewise(VectorOperators.LSHL, 23).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 2).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 27).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    packed.and(0xfffff).lanewise(VectorOperators.LSHL, 5).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 20).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    packed.and(0x1fff).lanewise(VectorOperators.LSHL, 12).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 13).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    packed.and(0x3f).lanewise(VectorOperators.LSHL, 19).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 6).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 31).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    packed.and(0xffffff).lanewise(VectorOperators.LSHL, 1).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 24).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    packed.and(0x1ffff).lanewise(VectorOperators.LSHL, 8).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 17).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    packed.and(0x3ff).lanewise(VectorOperators.LSHL, 15).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 10).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    packed.and(7).lanewise(VectorOperators.LSHL, 22).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 3).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 28).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    packed.and(0x1fffff).lanewise(VectorOperators.LSHL, 4).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 21).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    packed.and(0x3fff).lanewise(VectorOperators.LSHL, 11).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 14).and(MASK_25);
+    packed = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    packed.and(0x7f).lanewise(VectorOperators.LSHL, 18).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    packed.lanewise(VectorOperators.LSHR, 7).and(MASK_25)
+        .intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Unpacks 26-bit fields from {@code in} into {@code out} using
+   * {@code SPECIES_512} vectors.
+   *
+   * <p>Thirteen input vectors are loaded from {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 192}; sixteen result vectors are
+   * stored starting at {@code outpos}, which advances by {@code VLEN_512} after
+   * every store. Within a lane, consecutive input words are consumed low-order
+   * bits first; a field that straddles two words is assembled from the high
+   * bits of the earlier word and the low bits of the later one.
+   *
+   * <p>NOTE(review): presumably {@code VLEN_512} is 16 (it matches the input
+   * stride) and {@code MASK_26} keeps the low 26 bits; both are declared
+   * outside this method - confirm.
+   *
+   * @param in     array holding the packed words
+   * @param inpos  index in {@code in} of the first packed word
+   * @param out    array receiving the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack26(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    IntVector packed = IntVector.fromArray(SPECIES_512, in, inpos);
+    packed.and(MASK_26).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // "spill" always holds the leftover high bits of the previous word.
+    IntVector spill = packed.lanewise(VectorOperators.LSHR, 26).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    packed.and(0xfffff).lanewise(VectorOperators.LSHL, 6).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 20).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    packed.and(0x3fff).lanewise(VectorOperators.LSHL, 12).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 14).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    packed.and(0xff).lanewise(VectorOperators.LSHL, 18).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 8).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    packed.and(3).lanewise(VectorOperators.LSHL, 24).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Field lies entirely inside the current word.
+    packed.lanewise(VectorOperators.LSHR, 2).and(MASK_26)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 28).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    packed.and(0x3fffff).lanewise(VectorOperators.LSHL, 4).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 22).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    packed.and(0xffff).lanewise(VectorOperators.LSHL, 10).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 16).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    packed.and(0x3ff).lanewise(VectorOperators.LSHL, 16).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 10).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    packed.and(0xf).lanewise(VectorOperators.LSHL, 22).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    packed.lanewise(VectorOperators.LSHR, 4).and(MASK_26)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 30).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    packed.and(0xffffff).lanewise(VectorOperators.LSHL, 2).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 24).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    packed.and(0x3ffff).lanewise(VectorOperators.LSHL, 8).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 18).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    packed.and(0xfff).lanewise(VectorOperators.LSHL, 14).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    spill = packed.lanewise(VectorOperators.LSHR, 12).and(MASK_26);
+    packed = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    packed.and(0x3f).lanewise(VectorOperators.LSHL, 20).or(spill)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    packed.lanewise(VectorOperators.LSHR, 6).and(MASK_26)
+        .intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  private static void fastunpack27(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Unpacks 28-bit fields from {@code in} into {@code out} using
+   * {@code SPECIES_512} vectors.
+   *
+   * <p>Fourteen input vectors are loaded from {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 208}; sixteen result vectors are
+   * stored starting at {@code outpos}, which advances by {@code VLEN_512} after
+   * every store. Within a lane, consecutive input words are consumed low-order
+   * bits first: every seven words yield eight 28-bit fields, and that pattern
+   * repeats twice.
+   *
+   * <p>NOTE(review): presumably {@code VLEN_512} is 16 (it matches the input
+   * stride) and {@code MASK_28} is {@code 0xfffffff}; both are declared outside
+   * this method - confirm.
+   *
+   * @param in     array holding the packed words
+   * @param inpos  index in {@code in} of the first packed word
+   * @param out    array receiving the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack28(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    // Group 1: input words 0-6 -> fields 0-7.
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_28).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The top 4 bits of this word are the low bits of the next field.
+    var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // Group 2: input words 7-13 -> fields 8-15. The field starts on a word
+    // boundary, so there are no carried bits to merge in.
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    iV.and(0xfffffff).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  private static void fastunpack29(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_29).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+
+  /**
+   * Unpacks 30-bit values from {@code in} into {@code out}; fully unrolled.
+   *
+   * <p>Loads 15 vectors from {@code in} at offsets {@code inpos},
+   * {@code inpos + 16}, ..., {@code inpos + 224} and performs 16 vector stores
+   * into {@code out}, advancing {@code outpos} by {@code VLEN_512} after each
+   * store. Lanes are independent: a value that straddles two consecutive
+   * loaded vectors is rebuilt by OR-ing the high bits left in the previous
+   * vector with the low bits of the next vector shifted into place.
+   *
+   * <p>NOTE(review): {@code SPECIES_512}, {@code VLEN_512} and {@code MASK_30}
+   * are declared outside this block; the hard-coded load stride of 16 assumes
+   * 16 int lanes per vector and {@code MASK_30} is presumably
+   * {@code (1 << 30) - 1} -- confirm against the declarations.
+   *
+   * @param in packed input words
+   * @param inpos index in {@code in} of the first packed word
+   * @param out destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack30(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    iV.and(MASK_30).intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The top 2 bits of the first word are the low bits of the next value;
+    // the remaining 28 bits come from the following word.
+    var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+    oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+    oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+    oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30);
+
+    iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    // The last word still holds one complete value above its 2 carried bits.
+    iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos);
+    outpos += VLEN_512;
+  }
+
+  /**
+   * Unpacks 31-bit values from {@code in} into {@code out}; fully unrolled.
+   *
+   * <p>Loads 31 vectors from {@code in} at offsets {@code inpos},
+   * {@code inpos + 8}, ..., {@code inpos + 240} and performs 32 vector stores
+   * into {@code out}, advancing {@code outpos} by {@code VLEN_256} after each
+   * store. Lanes are independent: at every step the bits left over at the top
+   * of the previously loaded vector are OR-ed with the low bits of the next
+   * vector shifted into place, so the carried bit count grows by one per step.
+   *
+   * <p>NOTE(review): {@code SPECIES_256}, {@code VLEN_256} and {@code MASK_31}
+   * are declared outside this block; the hard-coded load stride of 8 assumes
+   * 8 int lanes per vector and {@code MASK_31} is presumably
+   * {@code (1 << 31) - 1} -- confirm against the declarations.
+   *
+   * @param in packed input words
+   * @param inpos index in {@code in} of the first packed word
+   * @param out destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   */
+  private static void fastunpack31(final int[] in, int inpos, final int[] out,
+                                   int outpos) {
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    iV.and(MASK_31).intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // The top bit of the first word is the lowest bit of the next value; the
+    // remaining 30 bits come from the following word.
+    var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+    oV = iV.and(0x3fffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+    oV = iV.and(0x1fffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+    oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+    oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+    oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+    oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+    oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+    oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+    oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+    oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+    oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+    oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+    oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+    oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+    oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+    oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+    oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+    oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+    oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+    oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+    oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+    oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+    oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+    oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+    oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+    oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+    oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+    oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+    oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31);
+
+    iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+    oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+    oV.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    // The last word still holds one complete value above its single carried
+    // bit.
+    iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos);
+    outpos += VLEN_256;
+  }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
new file mode 100644
index 0000000..62a8cc7
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
@@ -0,0 +1,963 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * This is a readable but less efficient version of the VectorBitPacker class.
+ *
+ */
+public class VectorBitPackerTerse {
+  // Vector shape used by the even-bit-width pack/unpack routines below.
+  static final VectorSpecies<Integer> SPECIES_512 = IntVector.SPECIES_512;
+  // Vector shape used by the odd-bit-width pack/unpack routines below.
+  static final VectorSpecies<Integer> SPECIES_256 = IntVector.SPECIES_256;
+  // Number of int lanes in a 512-bit vector (512 / 32); used as the stride
+  // between consecutive SPECIES_512 loads and stores.
+  static final int VLEN_512 = 16;
+  // Number of int lanes in a 256-bit vector (256 / 32); used as the stride
+  // between consecutive SPECIES_256 loads and stores.
+  static final int VLEN_256 = 8;
+  // NOTE(review): presumably the number of integers handled by one
+  // fastpack/fastunpack call; not referenced in the code visible here --
+  // confirm against the callers.
+  static final int BLOCK_SIZE = 256;
+
+  /**
+   * Packs {@code b}-bit values with 256-bit vectors, masking every input to
+   * its low {@code b} bits first.
+   *
+   * <p>Lanes are packed independently: lane {@code k} of each output vector
+   * is built from lane {@code k} of successive input vectors. The first
+   * output vector takes input vectors {@code 0 .. 31 / b}; each of the
+   * following {@code b - 1} output vectors is seeded with the bits of the
+   * last loaded value shifted right by {@code ho[i]} and then receives
+   * {@code lc[i]} further input vectors.
+   *
+   * @param in values to pack
+   * @param inpos index in {@code in} of the first value
+   * @param out destination for the packed words
+   * @param outpos index in {@code out} of the first packed word
+   * @param b bit width of each value
+   * @param ho per output word (after the first), the right shift applied to
+   *     the previously loaded value to obtain the bits carried into that word
+   * @param lc per output word (after the first), the number of input vectors
+   *     merged into that word
+   */
+  private static void fastpackOddBit(final int[] in, int inpos, final int[] out,
+                                     int outpos, int b, final int[] ho,
+                                     final int[] lc) {
+    final int lowBits = (1 << b) - 1;
+    final int lastInFirstWord = 31 / b;
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src.and(lowBits);
+    int vecIdx = 1;
+    while (vecIdx <= lastInFirstWord) {
+      src = IntVector.fromArray(SPECIES_256, in, inpos + vecIdx * VLEN_256);
+      final IntVector placed =
+          src.and(lowBits).lanewise(VectorOperators.LSHL, b * vecIdx);
+      packed = placed.or(packed);
+      vecIdx++;
+    }
+    packed.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    final int remainingWords = b - 1;
+    for (int w = 0; w < remainingWords; w++) {
+      // Seed the word with the bits of the last value that did not fit in
+      // the previous word.
+      packed = src.and(lowBits).lanewise(VectorOperators.LSHR, ho[w]);
+      for (int k = 0; k < lc[w]; k++, vecIdx++) {
+        src = IntVector.fromArray(SPECIES_256, in, inpos + vecIdx * VLEN_256);
+        final int shift = b * k + (b - ho[w]);
+        final IntVector placed =
+            src.and(lowBits).lanewise(VectorOperators.LSHL, shift);
+        packed = placed.or(packed);
+      }
+      packed.intoArray(out, outpos);
+      outpos += VLEN_256;
+    }
+  }
+
+  /**
+   * Packs {@code b}-bit values with 256-bit vectors without masking the
+   * inputs.
+   *
+   * <p>Follows the same word layout as the masked odd-width packer: the
+   * first output vector takes input vectors {@code 0 .. 31 / b}; each of the
+   * following {@code b - 1} output vectors is seeded with the last loaded
+   * vector shifted right by {@code ho[i]} and then receives {@code lc[i]}
+   * further input vectors.
+   *
+   * <p>NOTE(review): because nothing is masked, any input bits above bit
+   * {@code b - 1} end up in the output; presumably callers guarantee every
+   * value fits in {@code b} bits -- confirm.
+   *
+   * @param in values to pack
+   * @param inpos index in {@code in} of the first value
+   * @param out destination for the packed words
+   * @param outpos index in {@code out} of the first packed word
+   * @param b bit width of each value
+   * @param ho per output word (after the first), the right shift applied to
+   *     the previously loaded value to obtain the bits carried into that word
+   * @param lc per output word (after the first), the number of input vectors
+   *     merged into that word
+   */
+  private static void fastpackOddBitNoMask(final int[] in, int inpos,
+                                           final int[] out, int outpos, int b,
+                                           final int[] ho, final int[] lc) {
+    final int lastInFirstWord = 31 / b;
+    IntVector src = IntVector.fromArray(SPECIES_256, in, inpos);
+    IntVector packed = src;
+    int vecIdx = 1;
+    while (vecIdx <= lastInFirstWord) {
+      src = IntVector.fromArray(SPECIES_256, in, inpos + vecIdx * VLEN_256);
+      final IntVector placed = src.lanewise(VectorOperators.LSHL, b * vecIdx);
+      packed = placed.or(packed);
+      vecIdx++;
+    }
+    packed.intoArray(out, outpos);
+    outpos += VLEN_256;
+
+    final int remainingWords = b - 1;
+    for (int w = 0; w < remainingWords; w++) {
+      // Seed the word with the bits of the last value that did not fit in
+      // the previous word.
+      packed = src.lanewise(VectorOperators.LSHR, ho[w]);
+      for (int k = 0; k < lc[w]; k++, vecIdx++) {
+        src = IntVector.fromArray(SPECIES_256, in, inpos + vecIdx * VLEN_256);
+        final int shift = b * k + (b - ho[w]);
+        final IntVector placed = src.lanewise(VectorOperators.LSHL, shift);
+        packed = placed.or(packed);
+      }
+      packed.intoArray(out, outpos);
+      outpos += VLEN_256;
+    }
+  }
+
+  /**
+   * Unpacks {@code b}-bit values with 256-bit vectors.
+   *
+   * <p>The first packed vector yields {@code 32 / b} whole values. For each
+   * of the following {@code b - 1} packed vectors, the value straddling the
+   * word boundary is completed from the bits selected by {@code masks[i]},
+   * then {@code lc[i]} whole values are extracted starting at bit
+   * {@code lo[i]}. Lanes are processed independently.
+   *
+   * @param in packed input words
+   * @param inpos index in {@code in} of the first packed word
+   * @param out destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   * @param b bit width of each value
+   * @param lo per packed word (after the first), the bit offset of its first
+   *     whole value
+   * @param masks per packed word (after the first), the mask selecting the
+   *     low bits that complete the straddling value
+   * @param lc per packed word (after the first), the number of whole values
+   *     it contains
+   */
+  private static void fastUnpackOddBit(final int[] in, int inpos,
+                                       final int[] out, int outpos, int b,
+                                       final int[] lo, int[] masks, int[] lc) {
+    final int mask = (1 << b) - 1;
+    final int N = 32 / b;
+    var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+    int n = 0;
+    for (; n < N; n++) {
+      iV.lanewise(VectorOperators.LSHR, b * n).and(mask).intoArray(out, outpos);
+      outpos += VLEN_256;
+    }
+    // Bits left above the last whole value; they are the low part of the
+    // value that continues in the next packed word.
+    var oV = iV.lanewise(VectorOperators.LSHR, b * n).and(mask);
+
+    final int L = b - 1;
+    for (int i = 0; i < L; i++) {
+      iV = IntVector.fromArray(SPECIES_256, in, inpos + (i + 1) * VLEN_256);
+      // Complete the straddling value with the low bits of this word.
+      oV = iV.and(masks[i]).lanewise(VectorOperators.LSHL, b - lo[i]).or(oV);
+      oV.intoArray(out, outpos);
+      outpos += VLEN_256;
+      int j = 0;
+      for (; j < lc[i]; j++) {
+        iV.lanewise(VectorOperators.LSHR, b * j + lo[i])
+            .and(mask)
+            .intoArray(out, outpos);
+        outpos += VLEN_256;
+      }
+      // Leftover high bits of this word, carried into the next iteration.
+      oV = iV.lanewise(VectorOperators.LSHR, b * j + lo[i]).and(mask);
+    }
+  }
+
+  /**
+   * Packs {@code b}-bit values with 512-bit vectors, masking every input to
+   * its low {@code b} bits first.
+   *
+   * <p>Lanes are packed independently. The first output vector takes input
+   * vectors {@code 0 .. N}, where {@code N} is {@code 32 / b - 1} when
+   * {@code b} divides 32 and {@code 32 / b} otherwise. Each of the following
+   * {@code b / 2 - 1} output vectors is seeded with the carried bits of the
+   * last loaded value and then receives {@code lc[i]} further input vectors.
+   *
+   * @param in values to pack
+   * @param inpos index in {@code in} of the first value
+   * @param out destination for the packed words
+   * @param outpos index in {@code out} of the first packed word
+   * @param b bit width of each value
+   * @param ho per output word (after the first), the right shift applied to
+   *     the previously loaded value to obtain the bits carried into that
+   *     word; a value equal to {@code b} means nothing is carried
+   * @param lc per output word (after the first), the number of input vectors
+   *     merged into that word
+   */
+  private static void fastpackEvenBit(final int[] in, int inpos,
+                                      final int[] out, int outpos, int b,
+                                      final int[] ho, final int[] lc) {
+    final int mask = (1 << b) - 1;
+    final int N = 32 % b == 0 ? (32 / b) - 1 : 32 / b;
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV.and(mask);
+    int n = 1;
+    for (; n <= N; n++) {
+      iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+      oV = iV.and(mask).lanewise(VectorOperators.LSHL, b * n).or(oV);
+    }
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    final int L = (b >>> 1) - 1;
+    for (int i = 0; i < L; i++) {
+      if (ho[i] != b)
+        oV = iV.and(mask).lanewise(VectorOperators.LSHR, ho[i]);
+      else
+        // The previous value ended exactly on the word boundary: start the
+        // next word empty. zero() is static, so call it on the class rather
+        // than through an instance reference.
+        oV = IntVector.zero(SPECIES_512);
+      for (int j = 0; j < lc[i]; j++) {
+        iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+        oV = iV.and(mask)
+                 .lanewise(VectorOperators.LSHL, b * j + (b - ho[i]))
+                 .or(oV);
+        n++;
+      }
+      oV.intoArray(out, outpos);
+      outpos += VLEN_512;
+    }
+  }
+
+  /**
+   * Packs {@code b}-bit values with 512-bit vectors without masking the
+   * inputs.
+   *
+   * <p>Follows the same word layout as the masked even-width packer: the
+   * first output vector takes input vectors {@code 0 .. N}, where {@code N}
+   * is {@code 32 / b - 1} when {@code b} divides 32 and {@code 32 / b}
+   * otherwise; each of the following {@code b / 2 - 1} output vectors is
+   * seeded with the carried bits of the last loaded vector and then receives
+   * {@code lc[i]} further input vectors.
+   *
+   * <p>NOTE(review): because nothing is masked, any input bits above bit
+   * {@code b - 1} end up in the output; presumably callers guarantee every
+   * value fits in {@code b} bits -- confirm.
+   *
+   * @param in values to pack
+   * @param inpos index in {@code in} of the first value
+   * @param out destination for the packed words
+   * @param outpos index in {@code out} of the first packed word
+   * @param b bit width of each value
+   * @param ho per output word (after the first), the right shift applied to
+   *     the previously loaded value to obtain the bits carried into that
+   *     word; a value equal to {@code b} means nothing is carried
+   * @param lc per output word (after the first), the number of input vectors
+   *     merged into that word
+   */
+  private static void fastpackEvenBitNoMask(final int[] in, int inpos,
+                                            final int[] out, int outpos, int b,
+                                            final int[] ho, final int[] lc) {
+    final int N = 32 % b == 0 ? (32 / b) - 1 : 32 / b;
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    var oV = iV;
+    int n = 1;
+    for (; n <= N; n++) {
+      iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+      oV = iV.lanewise(VectorOperators.LSHL, b * n).or(oV);
+    }
+    oV.intoArray(out, outpos);
+    outpos += VLEN_512;
+
+    final int L = (b >>> 1) - 1;
+    for (int i = 0; i < L; i++) {
+      if (ho[i] != b)
+        oV = iV.lanewise(VectorOperators.LSHR, ho[i]);
+      else
+        // The previous value ended exactly on the word boundary: start the
+        // next word empty. zero() is static, so call it on the class rather
+        // than through an instance reference.
+        oV = IntVector.zero(SPECIES_512);
+      for (int j = 0; j < lc[i]; j++) {
+        iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+        oV = iV.lanewise(VectorOperators.LSHL, b * j + (b - ho[i])).or(oV);
+        n++;
+      }
+      oV.intoArray(out, outpos);
+      outpos += VLEN_512;
+    }
+  }
+
+  /**
+   * Unpacks {@code b}-bit values with 512-bit vectors.
+   *
+   * <p>The first packed vector yields {@code 32 / b} whole values. For each
+   * of the following {@code b / 2 - 1} packed vectors, the value straddling
+   * the word boundary (if any) is completed from the bits selected by
+   * {@code masks[i]}, then {@code lc[i]} whole values are extracted starting
+   * at bit {@code lo[i]}. Lanes are processed independently.
+   *
+   * @param in packed input words
+   * @param inpos index in {@code in} of the first packed word
+   * @param out destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   * @param b bit width of each value
+   * @param lo per packed word (after the first), the bit offset of its first
+   *     whole value
+   * @param masks per packed word (after the first), the mask selecting the
+   *     low bits that complete the straddling value
+   * @param lc per packed word (after the first), the number of whole values
+   *     it contains
+   */
+  private static void fastUnpackEventBit(final int[] in, int inpos,
+                                         final int[] out, int outpos, int b,
+                                         final int[] lo, int[] masks,
+                                         int[] lc) {
+    final int mask = (1 << b) - 1;
+    final int N = 32 / b;
+    var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+    int n = 0;
+    for (; n < N; n++) {
+      iV.lanewise(VectorOperators.LSHR, b * n).and(mask).intoArray(out, outpos);
+      outpos += VLEN_512;
+    }
+    var oV = iV.lanewise(VectorOperators.LSHR, b * n).and(mask);
+    // For a power-of-two width the carry from the first word is forced to
+    // zero instead of using the shifted value computed above.
+    if ((b & (b - 1)) == 0)
+      oV = IntVector.zero(SPECIES_512);
+
+    final int L = (b >>> 1) - 1;
+    for (int i = 0; i < L; i++) {
+      iV = IntVector.fromArray(SPECIES_512, in, inpos + (i + 1) * VLEN_512);
+      // Complete the straddling value with the low bits of this word.
+      oV = iV.and(masks[i]).lanewise(VectorOperators.LSHL, b - lo[i]).or(oV);
+      oV.intoArray(out, outpos);
+      outpos += VLEN_512;
+      int j = 0;
+      for (; j < lc[i]; j++) {
+        iV.lanewise(VectorOperators.LSHR, b * j + lo[i])
+            .and(mask)
+            .intoArray(out, outpos);
+        outpos += VLEN_512;
+      }
+      // Carry the leftover high bits only when the word does not end on a
+      // value boundary.
+      if ((32 - lo[i]) % b != 0)
+        oV = iV.lanewise(VectorOperators.LSHR, b * j + lo[i]).and(mask);
+      else
+        oV = IntVector.zero(SPECIES_512);
+    }
+  }
+
+  /**
+   * Scalar bit packer: writes the low {@code b} bits of {@code inlen} values
+   * back to back into {@code out}, least significant bits first.
+   *
+   * <p>The first word, and every word that starts on a value boundary, is
+   * OR-ed into rather than overwritten, so those words of {@code out} must
+   * be zero on entry.
+   *
+   * @param in values to pack
+   * @param inpos index in {@code in} of the first value
+   * @param inlen number of values to pack
+   * @param out destination for the packed words
+   * @param outpos index in {@code out} of the first packed word
+   * @param b bit width of each value; 32 copies the input verbatim
+   * @return for {@code b == 32}, {@code outpos + inlen}; for
+   *     {@code inlen == 0}, {@code outpos}; otherwise the index of the last
+   *     word written (not one past it)
+   */
+  public static int slowpack(final int[] in, int inpos, int inlen,
+                             final int[] out, int outpos, int b) {
+    if (inlen == 0) {
+      return outpos;
+    }
+    if (b == 32) {
+      System.arraycopy(in, inpos, out, outpos, inlen);
+      return outpos + inlen;
+    }
+    final int lowBits = (1 << b) - 1;
+    final int lastIdx = inlen - 1;
+    int used = 0;    // bits written to the current word by whole values
+    int carried = 0; // bits at the bottom of the current word owned by a
+                     // value that started in the previous word
+    for (int idx = 0; idx <= lastIdx; idx++) {
+      final int value = in[inpos + idx] & lowBits;
+      out[outpos] |= value << (used + carried);
+      used += b;
+      // Bits of a straddling value that still fit in the current word.
+      final int fitted = (32 - carried) % b;
+      if (used + carried < 32) {
+        continue;
+      }
+      final boolean straddles = fitted != 0;
+      if (straddles || idx < lastIdx) {
+        outpos++;
+      }
+      if (straddles) {
+        carried = b - fitted;
+        // Start the next word with the high bits that did not fit.
+        out[outpos] = value >> (b - carried);
+      } else {
+        carried = 0;
+      }
+      used = 0;
+    }
+    return outpos;
+  }
+
+  /**
+   * Scalar bit unpacker: reads {@code outlen} values of {@code b} bits each
+   * from consecutive words of {@code in}, least significant bits first.
+   *
+   * @param in packed input words
+   * @param inpos index in {@code in} of the first packed word
+   * @param out destination for the unpacked values
+   * @param outpos index in {@code out} of the first unpacked value
+   * @param outlen number of values to unpack
+   * @param b bit width of each value; 32 copies the input verbatim
+   * @return {@code inpos} plus the number of input words consumed
+   */
+  public static int slowunpack(final int[] in, int inpos, final int[] out,
+                               int outpos, int outlen, int b) {
+    if (outlen == 0) {
+      return inpos;
+    }
+    if (b == 32) {
+      System.arraycopy(in, inpos, out, outpos, outlen);
+      return inpos + outlen;
+    }
+    final int lowBits = (1 << b) - 1;
+    final int end = outpos + outlen;
+    int carried = 0;  // bits at the bottom of the next word that finish a
+                      // value started in the current word
+    int word = 0;
+    int consumed = 0;
+    while (outpos < end) {
+      final int next = in[inpos + consumed];
+      if (carried > 0) {
+        // Stitch the straddling value: high bits of the previous word
+        // followed by the low bits of this one.
+        final int fromPrev = b - carried;
+        out[outpos++] =
+            (word >>> (32 - fromPrev)) | ((next << fromPrev) & lowBits);
+      }
+      word = next;
+      final int avail = 32 - carried;
+      final int tail = avail % b;
+      final int stop = tail == 0 ? avail : avail - b;
+      for (int shift = 0; shift < stop && outpos < end; shift += b) {
+        out[outpos++] = (word >> (shift + carried)) & lowBits;
+      }
+      carried = tail == 0 ? 0 : b - tail;
+      consumed++;
+    }
+    return inpos + consumed;
+  }
+
+  /**
+   * Computes an output size in ints for {@code n} values of {@code b} bits.
+   *
+   * <p>The lane count is {@code VLEN_512} for even {@code b} and
+   * {@code VLEN_256} for odd {@code b}. When {@code n} does not exceed the
+   * lane count, {@code n} itself is returned; otherwise the result is the
+   * lane count multiplied by {@code ceil(n / ((32 / b) * lanes))}.
+   *
+   * @param n number of values
+   * @param b bit width of each value
+   * @return the computed number of ints
+   */
+  public static int numCompressedInts(int n, int b) {
+    final int lanes = (b % 2 != 0) ? VLEN_256 : VLEN_512;
+    if (n > lanes) {
+      final int valuesPerVector = (32 / b) * lanes;
+      final int vectors = (n + valuesPerVector - 1) / valuesPerVector;
+      return vectors * lanes;
+    }
+    return n;
+  }
+
+  public static void fastpack(final int[] in, int inpos, final int[] out,
+                              int outpos, int b) {
+    switch (b) {
+    case 0:
+      break;
+    case 1:
+      fastpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {});
+      break;
+    case 2:
+      fastpackEvenBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {});
+      break;
+    case 3:
+      fastpackOddBit(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+                     new int[] {0xb, 0xa});
+      break;
+    case 4:
+      fastpackEvenBit(in, inpos, out, outpos, 4, new int[] {0x4},
+                      new int[] {0x8});
+      break;
+    case 5:
+      fastpackOddBit(in, inpos, out, outpos, 5, new int[] {0x2, 0x4, 0x1, 0x3},
+                     new int[] {0x6, 0x7, 0x6, 0x6});
+      break;
+    case 6:
+      fastpackEvenBit(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+                      new int[] {0x5, 0x5});
+      break;
+    case 7:
+      fastpackOddBit(in, inpos, out, outpos, 7,
+                     new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+                     new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+      break;
+    case 8:
+      fastpackEvenBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+                      new int[] {0x4, 0x4, 0x4});
+      break;
+    case 9:
+      fastpackOddBit(in, inpos, out, outpos, 9,
+                     new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+                     new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+      break;
+    case 10:
+      fastpackEvenBit(in, inpos, out, outpos, 10,
+                      new int[] {0x2, 0x4, 0x6, 0x8},
+                      new int[] {0x3, 0x3, 0x3, 0x3});
+      break;
+    case 11:
+      fastpackOddBit(
+          in, inpos, out, outpos, 11,
+          new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+          new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+      break;
+    case 12:
+      fastpackEvenBit(in, inpos, out, outpos, 12,
+                      new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+                      new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+      break;
+    case 13:
+      fastpackOddBit(in, inpos, out, outpos, 13,
+                     new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9, 0x2,
+                                0x8, 0x1, 0x7},
+                     new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2,
+                                0x3, 0x2, 0x2});
+      break;
+    case 14:
+      fastpackEvenBit(in, inpos, out, outpos, 14,
+                      new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+                      new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+      break;
+    case 15:
+      fastpackOddBit(in, inpos, out, outpos, 15,
+                     new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1, 0x3,
+                                0x5, 0x7, 0x9, 0xb, 0xd},
+                     new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2, 0x2,
+                                0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 16:
+      fastpackEvenBit(in, inpos, out, outpos, 16,
+                      new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+                      new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 17:
+      fastpackOddBit(in, inpos, out, outpos, 17,
+                     new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0x10,
+                                0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+                     new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x2,
+                                0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+      break;
+    case 18:
+      fastpackEvenBit(in, inpos, out, outpos, 18,
+                      new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+                      new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+      break;
+    case 19:
+      fastpackOddBit(in, inpos, out, outpos, 19,
+                     new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3,
+                                0x10, 0xa, 0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+                     new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+                                0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+      break;
+    case 20:
+      fastpackEvenBit(
+          in, inpos, out, outpos, 20,
+          new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+          new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+      break;
+    case 21:
+      fastpackOddBit(
+          in, inpos, out, outpos, 21,
+          new int[] {0xb,  0x1, 0xc,  0x2, 0xd,  0x3, 0xe,  0x4, 0xf,  0x5,
+                     0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+          new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+                     0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+      break;
+    case 22:
+      fastpackEvenBit(
+          in, inpos, out, outpos, 22,
+          new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+          new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 23:
+      fastpackOddBit(in, inpos, out, outpos, 23,
+                     new int[] {0x9, 0x12, 0x4, 0xd,  0x16, 0x8, 0x11, 0x3,
+                                0xc, 0x15, 0x7, 0x10, 0x2,  0xb, 0x14, 0x6,
+                                0xf, 0x1,  0xa, 0x13, 0x5,  0xe},
+                     new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1,
+                                0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+                                0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 24:
+      fastpackEvenBit(
+          in, inpos, out, outpos, 24,
+          new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+                     0x10},
+          new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 25:
+      fastpackOddBit(in, inpos, out, outpos, 25,
+                     new int[] {0x7,  0xe,  0x15, 0x3, 0xa,  0x11, 0x18, 0x6,
+                                0xd,  0x14, 0x2,  0x9, 0x10, 0x17, 0x5,  0xc,
+                                0x13, 0x1,  0x8,  0xf, 0x16, 0x4,  0xb,  0x12},
+                     new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+                                0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+      break;
+    case 26:
+      fastpackEvenBit(in, inpos, out, outpos, 26,
+                      new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10, 0x16,
+                                 0x2, 0x8, 0xe, 0x14},
+                      new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+                                 0x1, 0x1, 0x1});
+      break;
+    case 27:
+      fastpackOddBit(in, inpos, out, outpos, 27,
+                     new int[] {0x5,  0xa,  0xf,  0x14, 0x19, 0x3,  0x8,
+                                0xd,  0x12, 0x17, 0x1,  0x6,  0xb,  0x10,
+                                0x15, 0x1a, 0x4,  0x9,  0xe,  0x13, 0x18,
+                                0x2,  0x7,  0xc,  0x11, 0x16},
+                     new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+                                0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 28:
+      fastpackEvenBit(in, inpos, out, outpos, 28,
+                      new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x4,
+                                 0x8, 0xc, 0x10, 0x14, 0x18},
+                      new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 29:
+      fastpackOddBit(
+          in, inpos, out, outpos, 29,
+          new int[] {0x3, 0x6, 0x9, 0xc, 0xf,  0x12, 0x15, 0x18, 0x1b, 0x1,
+                     0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+                     0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 30:
+      fastpackEvenBit(in, inpos, out, outpos, 30,
+                      new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12,
+                                 0x14, 0x16, 0x18, 0x1a, 0x1c},
+                      new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                                 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 31:
+      fastpackOddBit(in, inpos, out, outpos, 31,
+                     new int[] {0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,
+                                0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0x10,
+                                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+                                0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+                     new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                                0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                                0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                                0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, 256);
+      break;
+    }
+  }
+
+  public static void fastpackNoMask(final int[] in, int inpos, final int[] out, // Bit-width dispatcher for the "NoMask" pack helpers.
+                                    int outpos, int b) { // case 32 copies in -> out, so in/inpos is the source and out/outpos the destination
+    switch (b) { // odd b -> fastpackOddBitNoMask, even b -> fastpackEvenBitNoMask, each with two per-width constant tables
+    case 0:
+      break; // b == 0: nothing is written to out
+    case 1:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 1, new int[] {}, // widths 1 and 2 pass empty tables
+                           new int[] {});
+      break;
+    case 2:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 2, new int[] {},
+                            new int[] {});
+      break;
+    case 3:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+                           new int[] {0xb, 0xa}); // NOTE(review): the meaning of the two tables is defined by the helpers, which are not visible here
+      break;
+    case 4:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 4, new int[] {0x4},
+                            new int[] {0x8});
+      break;
+    case 5:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 5,
+                           new int[] {0x2, 0x4, 0x1, 0x3},
+                           new int[] {0x6, 0x7, 0x6, 0x6});
+      break;
+    case 6:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+                            new int[] {0x5, 0x5});
+      break;
+    case 7:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 7,
+                           new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+                           new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+      break;
+    case 8:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 8,
+                            new int[] {0x8, 0x8, 0x8},
+                            new int[] {0x4, 0x4, 0x4});
+      break;
+    case 9:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 9,
+                           new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+                           new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+      break;
+    case 10:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 10,
+                            new int[] {0x2, 0x4, 0x6, 0x8},
+                            new int[] {0x3, 0x3, 0x3, 0x3});
+      break;
+    case 11:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 11,
+          new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+          new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+      break;
+    case 12:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 12,
+                            new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+                            new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+      break;
+    case 13:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 13,
+                           new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9,
+                                      0x2, 0x8, 0x1, 0x7},
+                           new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3,
+                                      0x2, 0x3, 0x2, 0x2});
+      break;
+    case 14:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 14,
+                            new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+                            new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+      break;
+    case 15:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 15,
+                           new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1,
+                                      0x3, 0x5, 0x7, 0x9, 0xb, 0xd},
+                           new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2,
+                                      0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 16:
+      fastpackEvenBitNoMask(
+          in, inpos, out, outpos, 16,
+          new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+          new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 17:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 17,
+                           new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1,
+                                      0x10, 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+                           new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1,
+                                      0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+      break;
+    case 18:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 18,
+                            new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+                            new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+      break;
+    case 19:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 19,
+          new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3, 0x10, 0xa,
+                     0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+          new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+                     0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+      break;
+    case 20:
+      fastpackEvenBitNoMask(
+          in, inpos, out, outpos, 20,
+          new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+          new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+      break;
+    case 21:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 21,
+          new int[] {0xb,  0x1, 0xc,  0x2, 0xd,  0x3, 0xe,  0x4, 0xf,  0x5,
+                     0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+          new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+                     0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+      break;
+    case 22:
+      fastpackEvenBitNoMask(
+          in, inpos, out, outpos, 22,
+          new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+          new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 23:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 23,
+          new int[] {0x9, 0x12, 0x4, 0xd,  0x16, 0x8, 0x11, 0x3,
+                     0xc, 0x15, 0x7, 0x10, 0x2,  0xb, 0x14, 0x6,
+                     0xf, 0x1,  0xa, 0x13, 0x5,  0xe},
+          new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+                     0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 24:
+      fastpackEvenBitNoMask(
+          in, inpos, out, outpos, 24,
+          new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+                     0x10},
+          new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+      break;
+    case 25:
+      fastpackOddBitNoMask(in, inpos, out, outpos, 25,
+                           new int[] {0x7,  0xe,  0x15, 0x3,  0xa,  0x11,
+                                      0x18, 0x6,  0xd,  0x14, 0x2,  0x9,
+                                      0x10, 0x17, 0x5,  0xc,  0x13, 0x1,
+                                      0x8,  0xf,  0x16, 0x4,  0xb,  0x12},
+                           new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+                                      0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                      0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+      break;
+    case 26:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 26,
+                            new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10,
+                                       0x16, 0x2, 0x8, 0xe, 0x14},
+                            new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2,
+                                       0x1, 0x1, 0x1, 0x1});
+      break;
+    case 27:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 27,
+          new int[] {0x5,  0xa,  0xf,  0x14, 0x19, 0x3,  0x8,  0xd, 0x12,
+                     0x17, 0x1,  0x6,  0xb,  0x10, 0x15, 0x1a, 0x4, 0x9,
+                     0xe,  0x13, 0x18, 0x2,  0x7,  0xc,  0x11, 0x16},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+                     0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                     0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 28:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 28,
+                            new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c,
+                                       0x4, 0x8, 0xc, 0x10, 0x14, 0x18},
+                            new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+                                       0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 29:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 29,
+          new int[] {0x3, 0x6, 0x9, 0xc, 0xf,  0x12, 0x15, 0x18, 0x1b, 0x1,
+                     0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+                     0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 30:
+      fastpackEvenBitNoMask(in, inpos, out, outpos, 30,
+                            new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10,
+                                       0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c},
+                            new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                                       0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 31:
+      fastpackOddBitNoMask(
+          in, inpos, out, outpos, 31,
+          new int[] {0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,
+                     0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0x10,
+                     0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+                     0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                     0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, 256); // b == 32: copy 256 ints unchanged
+      break;
+    } // no default branch: any b outside 0..32 is silently ignored and out is left untouched
+  }
+
+  public static void fastunpack(final int[] in, int inpos, final int[] out, // Bit-width dispatcher for the unpack helpers.
+                                int outpos, int b) { // case 32 copies in -> out, so in/inpos is the source and out/outpos the destination
+    switch (b) { // odd b -> fastUnpackOddBit, even b -> fastUnpackEventBit, each with three per-width constant tables
+    case 0:
+      Arrays.fill(out, outpos, outpos + 256, 0); // b == 0: write 256 zeros; in is not read
+      break;
+    case 1:
+      fastUnpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {}, // widths 1 and 2 pass empty tables
+                       new int[] {});
+      break;
+    case 2:
+      fastUnpackEventBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {},
+                         new int[] {});
+      break;
+    case 3:
+      fastUnpackOddBit(in, inpos, out, outpos, 3, new int[] {0x1, 0x2},
+                       new int[] {0x1, 0x3}, new int[] {0xa, 0xa}); // NOTE(review): 2nd-table entries look like (1 << n) - 1 for the matching 1st-table n, i.e. presumably bit masks - confirm in the helper
+      break;
+    case 4:
+      fastUnpackEventBit(in, inpos, out, outpos, 4, new int[] {0x4},
+                         new int[] {0xf}, new int[] {0x7});
+      break;
+    case 5:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 5, new int[] {0x3, 0x1, 0x4, 0x2},
+          new int[] {0x7, 0x1, 0xf, 0x3}, new int[] {0x5, 0x6, 0x5, 0x6});
+      break;
+    case 6:
+      fastUnpackEventBit(in, inpos, out, outpos, 6, new int[] {0x4, 0x2},
+                         new int[] {0xf, 0x3}, new int[] {0x4, 0x5});
+      break;
+    case 7:
+      fastUnpackOddBit(in, inpos, out, outpos, 7,
+                       new int[] {0x3, 0x6, 0x2, 0x5, 0x1, 0x4},
+                       new int[] {0x7, 0x3f, 0x3, 0x1f, 0x1, 0xf},
+                       new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x4});
+      break;
+    case 8:
+      fastUnpackEventBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+                         new int[] {0xff, 0xff, 0xff},
+                         new int[] {0x3, 0x3, 0x3});
+      break;
+    case 9:
+      fastUnpackOddBit(in, inpos, out, outpos, 9,
+                       new int[] {0x4, 0x8, 0x3, 0x7, 0x2, 0x6, 0x1, 0x5},
+                       new int[] {0xf, 0xff, 0x7, 0x7f, 0x3, 0x3f, 0x1, 0x1f},
+                       new int[] {0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x3});
+      break;
+    case 10:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 10, new int[] {0x8, 0x6, 0x4, 0x2},
+          new int[] {0xff, 0x3f, 0xf, 0x3}, new int[] {0x2, 0x2, 0x2, 0x3});
+      break;
+    case 11:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 11,
+          new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa},
+          new int[] {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff},
+          new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 12:
+      fastUnpackEventBit(in, inpos, out, outpos, 12,
+                         new int[] {0x4, 0x8, 0xc, 0x4, 0x8},
+                         new int[] {0xf, 0xff, 0xfff, 0xf, 0xff},
+                         new int[] {0x2, 0x2, 0x1, 0x2, 0x2});
+      break;
+    case 13:
+      fastUnpackOddBit(in, inpos, out, outpos, 13,
+                       new int[] {0x7, 0x1, 0x8, 0x2, 0x9, 0x3, 0xa, 0x4, 0xb,
+                                  0x5, 0xc, 0x6},
+                       new int[] {0x7f, 0x1, 0xff, 0x3, 0x1ff, 0x7, 0x3ff, 0xf,
+                                  0x7ff, 0x1f, 0xfff, 0x3f},
+                       new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+                                  0x2, 0x1, 0x2});
+      break;
+    case 14:
+      fastUnpackEventBit(in, inpos, out, outpos, 14,
+                         new int[] {0xa, 0x6, 0x2, 0xc, 0x8, 0x4},
+                         new int[] {0x3ff, 0x3f, 0x3, 0xfff, 0xff, 0xf},
+                         new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2});
+      break;
+    case 15:
+      fastUnpackOddBit(in, inpos, out, outpos, 15,
+                       new int[] {0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0xe, 0xc,
+                                  0xa, 0x8, 0x6, 0x4, 0x2},
+                       new int[] {0x1fff, 0x7ff, 0x1ff, 0x7f, 0x1f, 0x7, 0x1,
+                                  0x3fff, 0xfff, 0x3ff, 0xff, 0x3f, 0xf, 0x3},
+                       new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                  0x1, 0x1, 0x1, 0x1, 0x2});
+      break;
+    case 16:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 16,
+          new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+          new int[] {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 17:
+      fastUnpackOddBit(in, inpos, out, outpos, 17,
+                       new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x1,
+                                  0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf},
+                       new int[] {0x3, 0xf, 0x3f, 0xff, 0x3ff, 0xfff, 0x3fff,
+                                  0xffff, 0x1, 0x7, 0x1f, 0x7f, 0x1ff, 0x7ff,
+                                  0x1fff, 0x7fff},
+                       new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1,
+                                  0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 18:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 18,
+          new int[] {0x4, 0x8, 0xc, 0x10, 0x2, 0x6, 0xa, 0xe},
+          new int[] {0xf, 0xff, 0xfff, 0xffff, 0x3, 0x3f, 0x3ff, 0x3fff},
+          new int[] {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 19:
+      fastUnpackOddBit(in, inpos, out, outpos, 19,
+                       new int[] {0x6, 0xc, 0x12, 0x5, 0xb, 0x11, 0x4, 0xa,
+                                  0x10, 0x3, 0x9, 0xf, 0x2, 0x8, 0xe, 0x1, 0x7,
+                                  0xd},
+                       new int[] {0x3f, 0xfff, 0x3ffff, 0x1f, 0x7ff, 0x1ffff,
+                                  0xf, 0x3ff, 0xffff, 0x7, 0x1ff, 0x7fff, 0x3,
+                                  0xff, 0x3fff, 0x1, 0x7f, 0x1fff},
+                       new int[] {0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x0,
+                                  0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1});
+      break;
+    case 20:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 20,
+          new int[] {0x8, 0x10, 0x4, 0xc, 0x14, 0x8, 0x10, 0x4, 0xc},
+          new int[] {0xff, 0xffff, 0xf, 0xfff, 0xfffff, 0xff, 0xffff, 0xf,
+                     0xfff},
+          new int[] {0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+      break;
+    case 21:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 21,
+          new int[] {0xa, 0x14, 0x9, 0x13, 0x8, 0x12, 0x7, 0x11, 0x6, 0x10,
+                     0x5, 0xf,  0x4, 0xe,  0x3, 0xd,  0x2, 0xc,  0x1, 0xb},
+          new int[] {0x3ff,   0xfffff, 0x1ff,  0x7ffff, 0xff,   0x3ffff, 0x7f,
+                     0x1ffff, 0x3f,    0xffff, 0x1f,    0x7fff, 0xf,     0x3fff,
+                     0x7,     0x1fff,  0x3,    0xfff,   0x1,    0x7ff},
+          new int[] {0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0,
+                     0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+      break;
+    case 22:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 22,
+          new int[] {0xc, 0x2, 0xe, 0x4, 0x10, 0x6, 0x12, 0x8, 0x14, 0xa},
+          new int[] {0xfff, 0x3, 0x3fff, 0xf, 0xffff, 0x3f, 0x3ffff, 0xff,
+                     0xfffff, 0x3ff},
+          new int[] {0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1});
+      break;
+    case 23:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 23,
+          new int[] {0xe, 0x5,  0x13, 0xa, 0x1,  0xf, 0x6, 0x14,
+                     0xb, 0x2,  0x10, 0x7, 0x15, 0xc, 0x3, 0x11,
+                     0x8, 0x16, 0xd,  0x4, 0x12, 0x9},
+          new int[] {0x3fff,   0x1f,    0x7ffff, 0x3ff,   0x1,    0x7fff,
+                     0x3f,     0xfffff, 0x7ff,   0x3,     0xffff, 0x7f,
+                     0x1fffff, 0xfff,   0x7,     0x1ffff, 0xff,   0x3fffff,
+                     0x1fff,   0xf,     0x3ffff, 0x1ff},
+          new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0,
+                     0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1});
+      break;
+    case 24:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 24,
+          new int[] {0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10,
+                     0x8},
+          new int[] {0xffff, 0xff, 0xffffff, 0xffff, 0xff, 0xffffff, 0xffff,
+                     0xff, 0xffffff, 0xffff, 0xff},
+          new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+      break;
+    case 25:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 25,
+          new int[] {0x12, 0xb,  0x4,  0x16, 0xf, 0x8,  0x1,  0x13,
+                     0xc,  0x5,  0x17, 0x10, 0x9, 0x2,  0x14, 0xd,
+                     0x6,  0x18, 0x11, 0xa,  0x3, 0x15, 0xe,  0x7},
+          new int[] {0x3ffff, 0x7ff,   0xf,     0x3fffff, 0x7fff,   0xff,
+                     0x1,     0x7ffff, 0xfff,   0x1f,     0x7fffff, 0xffff,
+                     0x1ff,   0x3,     0xfffff, 0x1fff,   0x3f,     0xffffff,
+                     0x1ffff, 0x3ff,   0x7,     0x1fffff, 0x3fff,   0x7f},
+          new int[] {0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+      break;
+    case 26:
+      fastUnpackEventBit(in, inpos, out, outpos, 26,
+                         new int[] {0x14, 0xe, 0x8, 0x2, 0x16, 0x10, 0xa, 0x4,
+                                    0x18, 0x12, 0xc, 0x6},
+                         new int[] {0xfffff, 0x3fff, 0xff, 0x3, 0x3fffff,
+                                    0xffff, 0x3ff, 0xf, 0xffffff, 0x3ffff,
+                                    0xfff, 0x3f},
+                         new int[] {0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+                                    0x0, 0x0, 0x1});
+      break;
+    case 27:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 27,
+          new int[] {0x16, 0x11, 0xc,  0x7,  0x2,  0x18, 0x13, 0xe,  0x9,
+                     0x4,  0x1a, 0x15, 0x10, 0xb,  0x6,  0x1,  0x17, 0x12,
+                     0xd,  0x8,  0x3,  0x19, 0x14, 0xf,  0xa,  0x5},
+          new int[] {0x3fffff, 0x1ffff, 0xfff, 0x7f,      0x3,       0xffffff,
+                     0x7ffff,  0x3fff,  0x1ff, 0xf,       0x3ffffff, 0x1fffff,
+                     0xffff,   0x7ff,   0x3f,  0x1,       0x7fffff,  0x3ffff,
+                     0x1fff,   0xff,    0x7,   0x1ffffff, 0xfffff,   0x7fff,
+                     0x3ff,    0x1f},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+                     0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 28:
+      fastUnpackEventBit(in, inpos, out, outpos, 28,
+                         new int[] {0x18, 0x14, 0x10, 0xc, 0x8, 0x4, 0x1c, 0x18,
+                                    0x14, 0x10, 0xc, 0x8, 0x4},
+                         new int[] {0xffffff, 0xfffff, 0xffff, 0xfff, 0xff, 0xf,
+                                    0xfffffff, 0xffffff, 0xfffff, 0xffff, 0xfff,
+                                    0xff, 0xf},
+                         new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0,
+                                    0x0, 0x0, 0x0, 0x1});
+      break;
+    case 29:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 29,
+          new int[] {0x1a, 0x17, 0x14, 0x11, 0xe, 0xb, 0x8, 0x5, 0x2, 0x1c,
+                     0x19, 0x16, 0x13, 0x10, 0xd, 0xa, 0x7, 0x4, 0x1, 0x1b,
+                     0x18, 0x15, 0x12, 0xf,  0xc, 0x9, 0x6, 0x3},
+          new int[] {0x3ffffff, 0x7fffff, 0xfffff, 0x1ffff, 0x3fff,
+                     0x7ff,     0xff,     0x1f,    0x3,     0xfffffff,
+                     0x1ffffff, 0x3fffff, 0x7ffff, 0xffff,  0x1fff,
+                     0x3ff,     0x7f,     0xf,     0x1,     0x7ffffff,
+                     0xffffff,  0x1fffff, 0x3ffff, 0x7fff,  0xfff,
+                     0x1ff,     0x3f,     0x7},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 30:
+      fastUnpackEventBit(in, inpos, out, outpos, 30,
+                         new int[] {0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
+                                    0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+                         new int[] {0xfffffff, 0x3ffffff, 0xffffff, 0x3fffff,
+                                    0xfffff, 0x3ffff, 0xffff, 0x3fff, 0xfff,
+                                    0x3ff, 0xff, 0x3f, 0xf, 0x3},
+                         new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                                    0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 31:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 31,
+          new int[] {0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15,
+                     0x14, 0x13, 0x12, 0x11, 0x10, 0xf,  0xe,  0xd,  0xc,  0xb,
+                     0xa,  0x9,  0x8,  0x7,  0x6,  0x5,  0x4,  0x3,  0x2,  0x1},
+          new int[] {0x3fffffff, 0x1fffffff, 0xfffffff, 0x7ffffff, 0x3ffffff,
+                     0x1ffffff,  0xffffff,   0x7fffff,  0x3fffff,  0x1fffff,
+                     0xfffff,    0x7ffff,    0x3ffff,   0x1ffff,   0xffff,
+                     0x7fff,     0x3fff,     0x1fff,    0xfff,     0x7ff,
+                     0x3ff,      0x1ff,      0xff,      0x7f,      0x3f,
+                     0x1f,       0xf,        0x7,       0x3,       0x1},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+
+    case 32:
+      System.arraycopy(in, inpos, out, outpos, 256); // b == 32: copy 256 ints unchanged
+      break;
+    } // no default branch: any b outside 0..32 is silently ignored and out is left untouched
+  }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
new file mode 100644
index 0000000..7374fa5
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
@@ -0,0 +1,366 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ * (c) Intel Corp. (for Vector implementation)
+ */
+package me.lemire.integercompression.vector;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.SkippableIntegerCODEC;
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * This is a patching scheme designed for speed.
+ *  It encodes integers in blocks of integers within pages of
+ *  up to 65536 integers. Note that it is important, to get good
+ *  compression and good performance, to use sizeable arrays (greater than 1024
+ * integers). For arrays containing a number of integers that is not divisible
+ * by BLOCK_SIZE, you should use it in conjunction with another CODEC:
+ *
+ *  IntegerCODEC ic = new Composition(new VectorFastPFOR(), new VariableByte()).
+ * <p>
+ * For details, please see:
+ * </p><p>
+ * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second
+ * through vectorization Software: Practice &amp; Experience
+ * <a
+ * href="http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract">http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract</a>
+ * <a href="http://arxiv.org/abs/1209.2137">http://arxiv.org/abs/1209.2137</a>
+ * </p>
+ * <p>For sufficiently compressible and long arrays, it is faster and better
+ * than other PFOR schemes.</p>
+ *
+ * Note that this does not use differential coding: if you are working on sorted
+ * lists, you should first compute deltas, @see
+ * me.lemire.integercompression.differential.Delta#delta.
+ *
+ * For multi-threaded applications, each thread should use its own
+ * VectorFastPFOR object.
+ *
+ * @author Daniel Lemire
+ */
+public class VectorFastPFOR implements IntegerCODEC, SkippableIntegerCODEC {
+  private final static int OVERHEAD_OF_EACH_EXCEPT = 8;
+  public final static int DEFAULT_PAGE_SIZE = 64 << 10;
+
+  public final static int BLOCK_SIZE = 256;
+  private final static int INTS_PER_BLOCK = BLOCK_SIZE >>> 5;
+
+  private final int pageSize;
+  private final int[][] dataTobePacked = new int[33][];
+  private int[] exceptData = null;
+
+  // Working area for compress and uncompress.
+  private final int[] dataPointers = new int[33];
+  private final int[] freqs = new int[33];
+  private final byte[] bem;
+  /**
+   * Construct the VectorFastPFOR CODEC and allocate its working arrays.
+   *
+   * @param pagesize
+   *                the desired page size (recommended value is
+   * VectorFastPFOR.DEFAULT_PAGE_SIZE)
+   */
+  private VectorFastPFOR(int pagesize) {
+    this.pageSize = pagesize;
+    // Per-page metadata bytes, then one exception buffer per bit width.
+    this.bem = new byte[3 * pagesize / BLOCK_SIZE + pagesize];
+    for (int width = 1; width < dataTobePacked.length; ++width)
+      dataTobePacked[width] = new int[pagesize / 32 * 4]; // heuristic
+    this.exceptData = new int[pagesize * 4];
+  }
+
+  /**
+   * Construct the VectorFastPFOR CODEC with the default page size.
+   */
+  public VectorFastPFOR() { this(DEFAULT_PAGE_SIZE); }
+
+  /**
+   * Compress data page by page, in whole blocks of BLOCK_SIZE integers (if
+   * fewer than BLOCK_SIZE integers are provided, nothing is done).
+   *
+   * @see IntegerCODEC#compress(int[], IntWrapper, int, int[], IntWrapper)
+   */
+  @Override
+  public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
+                               int[] out, IntWrapper outpos) {
+    // Only whole blocks are encoded; a trailing partial block is left alone.
+    final int usable = inlength - inlength % BLOCK_SIZE;
+    final int end = inpos.get() + usable;
+    // encodePage moves inpos past the integers it consumed.
+    for (int remaining = end - inpos.get(); remaining != 0;
+         remaining = end - inpos.get()) {
+      encodePage(in, inpos, Math.min(pageSize, remaining), out, outpos);
+    }
+  }
+
+  /**
+   * Choose the bit width for the BLOCK_SIZE values starting at pos: stores the
+   * width, exception count and widest bit length in bem[index .. index + 2].
+   */
+  private void getBestBitSize(int[] in, int pos, int index) {
+    Arrays.fill(freqs, 0);
+    for (int offset = 0; offset < BLOCK_SIZE; offset++)
+      freqs[32 - Integer.numberOfLeadingZeros(in[pos + offset])]++;
+    int maxb = 32;
+    while (freqs[maxb] == 0)
+      maxb--;
+    int bestb = maxb, bestexcept = 0, bestcost = maxb * BLOCK_SIZE, cexcept = 0;
+    for (int b = maxb - 1; b >= 0; --b) {
+      cexcept += freqs[b + 1];
+      if (cexcept == BLOCK_SIZE)
+        break;
+      // the extra 8 is the cost of storing maxbits; 1-bit exceptions store no value
+      final int valuebits = (maxb - b == 1) ? 0 : maxb - b;
+      final int thiscost = cexcept * (OVERHEAD_OF_EACH_EXCEPT + valuebits) + b * BLOCK_SIZE + 8;
+      if (thiscost < bestcost) {
+        bestcost = thiscost;
+        bestb = b;
+        bestexcept = cexcept;
+      }
+    }
+    bem[index] = (byte)bestb;
+    bem[index + 1] = (byte)bestexcept;
+    bem[index + 2] = (byte)maxb;
+  }
+
+  private void encodePage(int[] in, IntWrapper inpos, int thissize, int[] out,
+                          IntWrapper outpos) {
+    final int headerpos = outpos.get(); // reserved: offset to the metadata
+    outpos.increment();
+    int tmpoutpos = outpos.get();
+    // Page layout: header, packed blocks, metadata, bitmap, exception values.
+    // Clear working area.
+    Arrays.fill(dataPointers, 0);
+    Arrays.fill(bem, (byte)0);
+
+    int tmpinpos = inpos.get();
+    final int finalinpos = tmpinpos + thissize - BLOCK_SIZE;
+    int bindex = 0;
+    for (; tmpinpos <= finalinpos; tmpinpos += BLOCK_SIZE) {
+      getBestBitSize(in, tmpinpos, bindex);
+      final int tmpexcept = bem[bindex + 1] & 0xFF; // number of exceptions
+      final int tmpbestb = bem[bindex];
+      if (tmpexcept > 0) {
+        final int index = bem[bindex + 2] - tmpbestb; // bits per exception value
+        if (dataPointers[index] + tmpexcept >= dataTobePacked[index].length) {
+          int newsize = 2 * (dataPointers[index] + tmpexcept);
+          int val = newsize + BLOCK_SIZE - 1;
+          newsize = val - val % BLOCK_SIZE;
+          dataTobePacked[index] = Arrays.copyOf(dataTobePacked[index], newsize);
+        }
+        bindex += 3; // width, exception count, widest bit length
+        for (int k = 0; k < BLOCK_SIZE; ++k) {
+          if ((in[k + tmpinpos] >>> tmpbestb) != 0) {
+            // we have an exception: record its position and its high bits
+            bem[bindex++] = (byte)k;
+            dataTobePacked[index][dataPointers[index]++] =
+                in[k + tmpinpos] >>> tmpbestb;
+          }
+        }
+      } else {
+        bindex += 2; // width and a zero exception count only
+      }
+      VectorBitPacker.fastpack(in, tmpinpos, out, tmpoutpos, tmpbestb);
+      tmpoutpos += INTS_PER_BLOCK * tmpbestb;
+    }
+    inpos.set(tmpinpos);
+    out[headerpos] = tmpoutpos - headerpos; // offset from header to metadata
+    // Metadata: its length in bytes, then the bytes packed four per int.
+    int bytesize = bindex;
+    out[tmpoutpos++] = bytesize;
+
+    bytesize = bytesize % 4 == 0 ? bytesize : (bytesize / 4) * 4 + 4; // round up to a multiple of 4
+    for (int i = 0; i <= bytesize - 4; i += 4) {
+      out[tmpoutpos] = bem[i] & 0xFF; // lowest byte first
+      out[tmpoutpos] |= (bem[i + 1] & 0xFF) << 8;
+      out[tmpoutpos] |= (bem[i + 2] & 0xFF) << 16;
+      out[tmpoutpos] |= (bem[i + 3] & 0xFF) << 24;
+      tmpoutpos++;
+    }
+    // Bitmap: bit (k - 1) is set when some exception values need k bits.
+    int bitmap = 0;
+    for (int k = 2; k <= 32; ++k) {
+      if (dataPointers[k] != 0)
+        bitmap |= (1 << (k - 1));
+    }
+    out[tmpoutpos++] = bitmap;
+    // Exception values, grouped by bit width k.
+    for (int k = 2; k <= 32; ++k) {
+      if (dataPointers[k] != 0) {
+        out[tmpoutpos++] = dataPointers[k]; // number of values of width k
+        int j = 0;
+        int n = (dataPointers[k] / BLOCK_SIZE) * BLOCK_SIZE;
+        for (; j < n; j += BLOCK_SIZE) {
+          VectorBitPacker.fastpackNoMask(dataTobePacked[k], j, out, tmpoutpos,
+                                         k);
+          tmpoutpos += INTS_PER_BLOCK * k;
+        }
+        int r = dataPointers[k] % BLOCK_SIZE;
+        if (r != 0) { // values left over after the whole blocks
+          tmpoutpos = VectorBitPacker.slowpack(dataTobePacked[k], j, r, out,
+                                               tmpoutpos, k);
+          tmpoutpos++;
+        }
+      }
+    }
+    outpos.set(tmpoutpos);
+  }
+
+  /**
+   * Uncompress data page by page, in whole blocks of integers. In this
+   * particular case, the inlength parameter is ignored: it is deduced from
+   * the compressed data.
+   *
+   * @see IntegerCODEC#compress(int[], IntWrapper, int, int[], IntWrapper)
+   */
+  @Override
+  public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
+                                 int[] out, IntWrapper outpos, int mynvalue) {
+    // Only whole blocks are decoded; decodePage advances outpos as it goes.
+    final int end = outpos.get() + (mynvalue - mynvalue % BLOCK_SIZE);
+    for (int remaining = end - outpos.get(); remaining != 0;
+         remaining = end - outpos.get()) {
+      decodePage(in, inpos, out, outpos, Math.min(pageSize, remaining));
+    }
+  }
+
+  @Override
+  public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+    throw new UnsupportedOperationException("Calculating the max compressed length is not supported yet.");
+  } // always throws: no bound is computed by this codec yet
+
+  private void loadMetaData(int[] in, int inexcept, int bytesize) {
+    // Unpack ceil(bytesize / 4) ints into bem, least significant byte first.
+    final int words = (bytesize + 3) / 4;
+    for (int w = 0, dst = 0; w < words; w++, dst += 4) {
+      final int packed = in[inexcept + w];
+      bem[dst] = (byte)packed;
+      bem[dst + 1] = (byte)(packed >>> 8);
+      bem[dst + 2] = (byte)(packed >>> 16);
+      bem[dst + 3] = (byte)(packed >>> 24);
+    }
+  }
+
+  private void decodePage(int[] in, IntWrapper inpos, int[] out,
+                          IntWrapper outpos, int thissize) {
+    final int initpos = inpos.get();
+    final int wheremeta = in[inpos.get()]; // offset from initpos to the metadata
+    inpos.increment();
+    int inexcept = initpos + wheremeta;
+    // Load the metadata bytes, then unpack each exception list into dataTobePacked.
+    final int bytesize = in[inexcept++];
+    loadMetaData(in, inexcept, bytesize);
+    inexcept += (bytesize + 3) / 4;
+    final int bitmap = in[inexcept++]; // bit (k - 1) set: values of k bits follow
+    for (int k = 2; k <= 32; ++k) {
+      if ((bitmap & (1 << (k - 1))) != 0) {
+        int size = in[inexcept++]; // number of exception values of k bits
+        int val = size + BLOCK_SIZE - 1;
+        int roundedup = val - val % BLOCK_SIZE;
+        if (dataTobePacked[k].length < roundedup)
+          dataTobePacked[k] = new int[roundedup];
+        if (inexcept + roundedup / 32 * k <= in.length) {
+          int j = 0;
+          int len = (size / BLOCK_SIZE) * BLOCK_SIZE;
+          for (; j < len; j += BLOCK_SIZE) {
+            VectorBitPacker.fastunpack(in, inexcept, dataTobePacked[k], j, k);
+            inexcept += INTS_PER_BLOCK * k;
+          }
+          int r = size % BLOCK_SIZE;
+          inexcept = VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k],
+                                                j, r, k);
+        } else { // in is shorter than that bound: read whole blocks from a zero-padded copy
+          int j = 0;
+          val = roundedup / 32 * k + BLOCK_SIZE - 1;
+          int[] buf = new int[val - val % BLOCK_SIZE];
+          int initinexcept = inexcept;
+          System.arraycopy(in, inexcept, buf, 0, in.length - inexcept);
+          int l = (size / BLOCK_SIZE) * BLOCK_SIZE;
+          for (; j < l; j += BLOCK_SIZE) {
+            VectorBitPacker.fastunpack(buf, inexcept - initinexcept,
+                                       dataTobePacked[k], j, k);
+            inexcept += INTS_PER_BLOCK * k;
+          }
+          int r = size % BLOCK_SIZE;
+          inexcept = VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k],
+                                                j, r, k);
+        }
+      }
+    }
+    Arrays.fill(dataPointers, 0);
+    int tmpoutpos = outpos.get();
+    int tmpinpos = inpos.get();
+    int idx = 0;
+    for (int run = 0, run_end = thissize / BLOCK_SIZE; run < run_end;
+         ++run, tmpoutpos += BLOCK_SIZE) {
+      final int b = bem[idx];                  // bit width of this block
+      final int cexcept = bem[idx + 1] & 0xFF; // number of exceptions
+      VectorBitPacker.fastunpack(in, tmpinpos, out, tmpoutpos, b);
+      tmpinpos += INTS_PER_BLOCK * b;
+      if (cexcept > 0) {
+        final int maxbits = bem[idx + 2]; // widest bit length in the block
+        idx += 3;
+        final int index = maxbits - b;
+        if (index == 1) { // one-bit exceptions carry no stored value
+          for (int k = 0; k < cexcept; ++k) {
+            final int pos = bem[idx++] & 0xFF; // position within the block
+            out[pos + tmpoutpos] |= 1 << b;
+          }
+        } else {
+          for (int k = 0; k < cexcept; ++k) {
+            final int pos = bem[idx++] & 0xFF; // position within the block
+            final int exceptvalue =
+                dataTobePacked[index][dataPointers[index]++];
+            out[pos + tmpoutpos] |= exceptvalue << b;
+          }
+        }
+      } else {
+        idx += 2;
+      }
+    }
+    outpos.set(tmpoutpos);
+    inpos.set(inexcept); // continue after this page's exception data
+  }
+
+  @Override
+  public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
+                       IntWrapper outpos) {
+    final int whole = inlength - inlength % BLOCK_SIZE;
+    if (whole != 0) {
+      out[outpos.get()] = whole; // header: how many integers are encoded
+      outpos.increment();
+      headlessCompress(in, inpos, whole, out, outpos);
+    }
+  }
+
+  @Override
+  public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
+                         IntWrapper outpos) {
+    if (inlength != 0) {
+      final int count = in[inpos.get()]; // header written by compress
+      inpos.increment();
+      headlessUncompress(in, inpos, inlength, out, outpos, count);
+    }
+  }
+  @Override
+  public String toString() {
+    return getClass().getSimpleName(); // simple name of the runtime class
+  }
+
+  /**
+   * Creates a new buffer of the requested size.
+   *
+   * In case you need a different way to allocate buffers, you can override this
+   * method with a custom behavior. The default implementation allocates a new
+   * Java direct {@link ByteBuffer} on each invocation.
+   */
+  protected ByteBuffer makeBuffer(int sizeInBytes) {
+    final ByteBuffer direct = ByteBuffer.allocateDirect(sizeInBytes);
+    return direct;
+  }
+}
diff --git a/src/main/java/me/lemire/longcompression/ByteLongCODEC.java b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java
new file mode 100644
index 0000000..dbc6864
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java
@@ -0,0 +1,62 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Interface describing a CODEC to compress longs to bytes.
+ *
+ * @author Benoit Lacelle
+ *
+ */
+public interface ByteLongCODEC {
+        /**
+         * Compress data from an array to another array.
+         *
+         * Both inpos and outpos are modified to represent how much data was
+         * read and written to. If 12 longs (inlength = 12) are compressed to 3
+         * bytes, then inpos will be incremented by 12 while outpos will be
+         * incremented by 3. We use IntWrapper to pass the values by reference.
+         *
+         * @param in
+         *                input array
+         * @param inpos
+         *                location in the input array
+         * @param inlength
+         *                how many longs to compress
+         * @param out
+         *                output array
+         * @param outpos
+         *                where to write in the output array
+         */
+        public void compress(long[] in, IntWrapper inpos, int inlength,
+                byte[] out, IntWrapper outpos);
+
+        /**
+         * Uncompress data from an array to another array.
+         *
+         * Both inpos and outpos parameters are modified to indicate new
+         * positions after read/write.
+         *
+         * @param in
+         *                array containing data in compressed form
+         * @param inpos
+         *                where to start reading in the array
+         * @param inlength
+         *                length of the compressed data (ignored by some
+         *                schemes)
+         * @param out
+         *                array where to write the uncompressed output
+         * @param outpos
+         *                where to write the uncompressed output in out
+         */
+        public void uncompress(byte[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos);
+
+}
diff --git a/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java b/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java
new file mode 100644
index 0000000..b21ef68
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/IntegratedLongCODEC.java
@@ -0,0 +1,11 @@
+package me.lemire.longcompression;
+
+/**
+ * Marker interface (no methods of its own): like LongCODEC, but signals that
+ * delta coding is "integrated", so no separate delta-coding step is needed.
+ *
+ * @author Benoit Lacelle
+ */
+public interface IntegratedLongCODEC extends LongCODEC {
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java
new file mode 100644
index 0000000..35c1166
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java
@@ -0,0 +1,189 @@
+package me.lemire.longcompression;
+
+import java.util.Arrays;
+
+import me.lemire.integercompression.BinaryPacking;
+import me.lemire.integercompression.Composition;
+import me.lemire.integercompression.IntCompressor;
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.VariableByte;
+
+/**
+ * A {@link LongCODEC} which splits each long into a high part (32 first bits) and a low part (32 last bits).
+ * 
+ * @author Benoit Lacelle
+ *
+ */
+public class LongAs2IntsCodec implements LongCODEC {
+    final IntegerCODEC highPartsCodec;
+    final IntegerCODEC lowPartsCodec;
+
+    public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) {
+        this.highPartsCodec = highPartsCodec; // applied to RoaringIntPacking.high of each long
+        this.lowPartsCodec = lowPartsCodec; // applied to RoaringIntPacking.low of each long
+    }
+
+    /**
+     * By default, we expect longs to be slightly above Integer.MAX_VALUE, hence highParts to be small and positive
+     * integers (VariableByte). For lowParts, we rely on {@link IntCompressor} default IntegerCODEC.
+     */
+    public LongAs2IntsCodec() {
+        this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte()));
+    }
+
+    @Override
+    public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
+        if (inlength == 0) {
+            return;
+        }
+        // Split every input long into the two ints given by RoaringIntPacking.high/low.
+        int[] highParts = new int[inlength];
+        int[] lowParts = new int[inlength];
+
+        for (int i = 0; i < inlength; i++) {
+            int inPosition = inpos.get() + i;
+
+            highParts[i] = RoaringIntPacking.high(in[inPosition]);
+            lowParts[i] = RoaringIntPacking.low(in[inPosition]);
+        }
+
+        // TODO What would be a relevant buffer size?
+        int[] buffer = new int[inlength * 16];
+
+        int outPosition = outpos.get();
+
+        boolean hasLeftover;
+        {
+            // The first integer is reserved to hold the number of compressed ints
+            IntWrapper highPartsOutPosition = new IntWrapper(1);
+
+            highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition);
+
+            // Record the compressedHighparts length
+            buffer[0] = highPartsOutPosition.get() - 1;
+            // Emit every complete pair of ints as one long
+            for (int i = 0; i < highPartsOutPosition.get() / 2; i++) {
+                long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
+                out[outPosition++] = pack;
+            }
+
+            if (1 == highPartsOutPosition.get() % 2) {
+                // Shift the trailing integer as first in the buffer
+                hasLeftover = true;
+                buffer[0] = buffer[highPartsOutPosition.get() - 1];
+            } else {
+                hasLeftover = false;
+            }
+        }
+
+        {
+            // The first integer is reserved to hold the number of compressed ints
+            IntWrapper lowPartsOutPosition = new IntWrapper(1);
+            if (hasLeftover) {
+                // Keep the trailing int from highParts before the reserved int from lowParts compressed length
+                lowPartsOutPosition.set(2);
+            }
+
+            lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition);
+
+            // Record the compressedLowparts length (it sits after the leftover int, if any)
+            buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1);
+            // Emit every complete pair of ints as one long
+            for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) {
+                long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
+                out[outPosition++] = pack;
+            }
+
+            if (1 == lowPartsOutPosition.get() % 2) {
+                // The trailing integer is packed with a 0
+                long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0);
+                out[outPosition++] = pack;
+            }
+        }
+
+        inpos.add(inlength);
+        outpos.set(outPosition);
+    }
+
+    /**
+     * Part lengths are read from the data, but inlength still sizes the scratch buffer and advances inpos.
+     */
+    @Override
+    public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
+        if (inlength == 0) {
+            return;
+        }
+
+        int longIndex = inpos.get();
+
+        int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]);
+        int[] compressedHighParts = new int[nbCompressedHighParts];
+
+        // !highPart as we just read the highPart for nbCompressedHighParts
+        boolean highPart = false;
+        for (int i = 0; i < nbCompressedHighParts; i++) {
+            int nextInt;
+            if (highPart) {
+                nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]);
+            } else {
+                nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]);
+            }
+            compressedHighParts[i] = nextInt;
+
+            highPart = !highPart;
+        }
+
+        // TODO What would be a relevant buffer size?
+        int[] buffer = new int[inlength * 16];
+
+        IntWrapper highPartsOutPosition = new IntWrapper();
+        highPartsCodec.uncompress(compressedHighParts,
+                new IntWrapper(),
+                compressedHighParts.length,
+                buffer,
+                highPartsOutPosition);
+        int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get());
+
+        // +1 as we initially read nbCompressedHighParts
+        int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts;
+        int nbCompressedLowParts;
+        if (highPart) {
+            nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]);
+        } else {
+            nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]);
+        }
+        highPart = !highPart;
+
+        int[] compressedLowParts = new int[nbCompressedLowParts];
+        for (int i = 0; i < nbCompressedLowParts; i++) {
+            int nextInt;
+            if (highPart) {
+                nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
+            } else {
+                nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
+            }
+            compressedLowParts[i] = nextInt;
+
+            highPart = !highPart;
+        }
+
+        IntWrapper lowPartsOutPosition = new IntWrapper();
+        lowPartsCodec.uncompress(compressedLowParts,
+                new IntWrapper(),
+                compressedLowParts.length,
+                buffer,
+                lowPartsOutPosition);
+        int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get());
+        assert highParts.length == lowParts.length;
+
+        int outposition = outpos.get();
+        for (int i = 0; i < highParts.length; i++) {
+            out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]);
+        }
+
+        inpos.add(inlength); // NOTE(review): advances by inlength, not by the longs actually read - confirm
+        outpos.set(outposition);
+    }
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongBinaryPacking.java b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java
new file mode 100644
index 0000000..b6ea58f
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java
@@ -0,0 +1,153 @@
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.BinaryPacking;
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.integercompression.Util;
+
+/**
+ * Scheme  based on a commonly used idea: can be extremely fast.
+ * It encodes integers in blocks of 64 longs. For arrays containing
+ * an arbitrary number of longs, you should use it in conjunction
+ * with another CODEC: 
+ * 
+ *  <pre>LongCODEC ic = 
+ *  new Composition(new LongBinaryPacking(), new LongVariableByte()).</pre>
+ * 
+ * Note that this does not use differential coding: if you are working on sorted
+ * lists, you must compute the deltas separately.
+ *
+ * <p>
+ * For details, please see {@link BinaryPacking}
+ * </p>
+ * 
+ * @author Benoit Lacelle
+ */
+public final class LongBinaryPacking implements LongCODEC, SkippableLongCODEC {
+        public final static int BLOCK_SIZE = 64;
+        private static final int MAX_BIT_WIDTH = Long.SIZE;
+
+        @Override
+        public void compress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            final int whole = Util.greatestMultiple(inlength, BLOCK_SIZE);
+            if (whole != 0) {
+                out[outpos.get()] = whole; // header: how many longs are encoded
+                outpos.increment();
+                headlessCompress(in, inpos, whole, out, outpos);
+            }
+        }
+
+        /**
+         * Pack whole blocks of BLOCK_SIZE longs from in into out, without the
+         * leading length header written by compress. Each block is stored with
+         * the bit width reported by LongUtil.maxbits.
+         *
+         * Output layout: while at least 8 blocks remain, one header long carries
+         * their 8 bit widths (one byte each, first block in the most significant
+         * byte) followed by the 8 packed blocks; every remaining block is
+         * preceded by its own header long holding just its bit width. A block
+         * packed with b bits takes b longs of output.
+         */
+        @Override
+        public void headlessCompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            inlength = Util.greatestMultiple(inlength, BLOCK_SIZE);
+            final int end = inpos.get() + inlength;
+            final int group = BLOCK_SIZE * 8;
+            final int[] widths = new int[8];
+            int tmpoutpos = outpos.get();
+            int s = inpos.get();
+            // Compress by group of 8 * 64 longs as much as possible
+            for (; s + group - 1 < end; s += group) {
+                // maxbits can be anything between 0 and 64 included, so each
+                // width fits in one byte of the header long
+                long header = 0;
+                for (int b = 0; b < 8; b++) {
+                    widths[b] = LongUtil.maxbits(in, s + b * BLOCK_SIZE, BLOCK_SIZE);
+                    header = (header << 8) | widths[b];
+                }
+                out[tmpoutpos++] = header;
+                for (int b = 0; b < 8; b++) {
+                    final int from = s + b * BLOCK_SIZE;
+                    LongBitPacking.fastpackwithoutmask(in, from, out, tmpoutpos, widths[b]);
+                    tmpoutpos += widths[b];
+                }
+            }
+            // Then we compress up to 7 blocks of 64 longs
+            for (; s < end; s += BLOCK_SIZE) {
+                final int mbits = LongUtil.maxbits(in, s, BLOCK_SIZE);
+                out[tmpoutpos++] = mbits;
+                LongBitPacking.fastpackwithoutmask(in, s, out, tmpoutpos, mbits);
+                tmpoutpos += mbits;
+            }
+            inpos.add(inlength);
+            outpos.set(tmpoutpos);
+        }
+
+        @Override
+        public void uncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+                if (inlength != 0) {
+                        final int count = (int) in[inpos.get()]; // header written by compress
+                        inpos.increment();
+                        headlessUncompress(in, inpos, inlength, out, outpos, count);
+                }
+        }
+
+        /**
+         * Unpack the blocks written by headlessCompress; num is first reduced
+         * with Util.greatestMultiple(num, BLOCK_SIZE). inlength is not read.
+         *
+         * While at least 8 blocks remain to be produced, one header long is read
+         * and split into 8 bit widths, most significant byte first, and the 8
+         * blocks that follow it are unpacked. Every remaining block is preceded
+         * by its own header long that holds just its bit width.
+         *
+         * On return outpos has advanced by the number of longs produced and
+         * inpos points just past the last long consumed.
+         */
+        @Override
+        public void headlessUncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos, int num) {
+            final int outlength = Util.greatestMultiple(num, BLOCK_SIZE);
+            final int end = outpos.get() + outlength;
+            final int group = BLOCK_SIZE * 8;
+            int tmpinpos = inpos.get();
+            int s = outpos.get();
+            // Groups of 8 blocks that share a single header long.
+            for (; s + group - 1 < end; s += group) {
+                final long header = in[tmpinpos];
+                ++tmpinpos;
+                for (int b = 0; b < 8; b++) {
+                    // block b keeps its width in byte (7 - b) of the header
+                    final int mbits = (int) ((header >>> (56 - 8 * b)) & 0xFF);
+                    LongBitPacking.fastunpack(in, tmpinpos, out, s + b * BLOCK_SIZE, mbits);
+                    // a block packed with mbits bits occupies mbits longs
+                    tmpinpos += mbits;
+                }
+            }
+            // Leftover blocks, each with a header long of its own.
+            for (; s < end; s += BLOCK_SIZE) {
+                final int mbits = (int) in[tmpinpos];
+                ++tmpinpos;
+                LongBitPacking.fastunpack(in, tmpinpos, out, s, mbits);
+                tmpinpos += mbits;
+            }
+            outpos.add(outlength);
+            inpos.set(tmpinpos);
+        }
+        
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+            final int blockCount = inlength / BLOCK_SIZE;
+            // header longs as computed below, plus at most MAX_BIT_WIDTH longs per block
+            final int headers = blockCount / Long.BYTES + (blockCount % Long.BYTES);
+            compressedPositions.add(blockCount * BLOCK_SIZE);
+            return headers + blockCount * MAX_BIT_WIDTH;
+        }
+
+        @Override
+        public String toString() {
+                return getClass().getSimpleName(); // simple name of the runtime class
+        }
+}
diff --git a/src/main/java/me/lemire/longcompression/LongBitPacking.java b/src/main/java/me/lemire/longcompression/LongBitPacking.java
new file mode 100644
index 0000000..2d282ec
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongBitPacking.java
@@ -0,0 +1,146 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import java.util.Arrays;
+
+/**
+ * Bitpacking routines
+ * 
+ * <p>For details, please see</p>
+ * <p>
+ * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second
+ * through vectorization Software: Practice &amp; Experience
+ * <a href="http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract">http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract</a>
+ * <a href="http://arxiv.org/abs/1209.2137">http://arxiv.org/abs/1209.2137</a>
+ * </p>
+ * 
+ * @author Benoit Lacelle
+ * 
+ */
+public final class LongBitPacking {
+
+        /**
+         * Pack 64 longs, writing "bit" bits for each of them.
+         *
+         * @param in
+         *                source array
+         * @param inpos
+         *                position in source array
+         * @param out
+         *                output array
+         * @param outpos
+         *                position in output array
+         * @param bit
+         *                number of bits to use per long, from 0 to 64
+         * @throws IllegalArgumentException if bit is outside 0..64
+         */
+        public static void fastpackwithoutmask(final long[] in, final int inpos,
+                final long[] out, final int outpos, final int bit) {
+                if (bit < 0 || bit > 64) {
+                    throw new IllegalArgumentException("Unsupported bit width: " + bit);
+                }
+                switch (bit) {
+                case 0: fastpackwithoutmask0(in, inpos, out, outpos); break;
+                case 64: fastpackwithoutmask64(in, inpos, out, outpos); break;
+                default: slowpackwithoutmask(in, inpos, out, outpos, bit); break;
+                }
+        }
+
+        protected static void fastpackwithoutmask0(final long[] in, int inpos,
+                final long[] out, int outpos) {
+                // nothing to write: with zero bits per value the output is left untouched
+        }
+
+        protected static void fastpackwithoutmask64(final long[] in, int inpos,
+                final long[] out, int outpos) {
+                System.arraycopy(in, inpos, out, outpos, 64); // 64 bits per value: plain copy of the 64 longs
+        }
+
+        protected static void slowpackwithoutmask(final long[] in, int inpos,
+                final long[] out, int outpos, final int bit) {
+                int bucket = 0; // index, relative to outpos, of the output long being filled
+                int shift = 0; // bit offset inside that long where the next value starts
+                // Packs 64 longs using "bit" bits each; output longs are zeroed first since values are OR-ed in.
+                out[outpos + bucket] = 0L;
+                for (int i = 0 ; i < 64 ; i++) {
+                    if (shift >= 64) {
+                        bucket++;
+                        out[bucket + outpos] = 0L;
+                        shift -= 64;
+                        // shift is now the number of bits of the previous value that overflowed.
+                        if (shift > 0) {
+                            // The previous value straddles two longs: store its high bits here
+                            out[outpos + bucket] |= in[inpos + i - 1] >> (bit - shift);
+                        }
+                    }
+                    out[outpos + bucket] |= in[inpos + i] << shift;
+                    // No mask is applied: the value is stored as given.
+                    shift += bit;
+                }
+        }
+
+
+        /**
+         * Unpack 64 longs that were packed using "bit" bits each.
+         *
+         * @param in
+         *                source array
+         * @param inpos
+         *                starting point in the source array
+         * @param out
+         *                output array
+         * @param outpos
+         *                starting point in the output array
+         * @param bit
+         *                how many bits were used per long, from 0 to 64
+         * @throws IllegalArgumentException if bit is outside 0..64
+         */
+        public static void fastunpack(final long[] in, final int inpos,
+                final long[] out, final int outpos, final int bit) {
+                if (bit < 0 || bit > 64) {
+                    throw new IllegalArgumentException("Unsupported bit width: " + bit);
+                }
+                switch (bit) {
+                case 0: fastunpack0(in, inpos, out, outpos); break;
+                case 64: fastunpack64(in, inpos, out, outpos); break;
+                default: slowunpack(in, inpos, out, outpos, bit); break;
+                }
+        }
+
+
+        protected static void fastunpack0(final long[] in, int inpos,
+                final long[] out, int outpos) {
+                Arrays.fill(out, outpos, outpos + 64, 0); // zero bits per value: all 64 outputs are 0
+        }
+        
+        protected static void fastunpack64(final long[] in, int inpos,
+                final long[] out, int outpos) {
+                System.arraycopy(in, inpos, out, outpos, 64); // 64 bits per value: plain copy of the 64 longs
+        }
+
+        protected static void slowunpack(final long[] in, int inpos,
+                final long[] out, int outpos, final int bit) {
+                int bucket = 0; // index, relative to inpos, of the input long being read
+                int shift = 0; // bit offset inside that long where the next value starts
+                for (int i = 0 ; i < 64 ; i++) { // one iteration per unpacked value (64 in total)
+                    if (shift >= 64) {
+                        bucket++;
+                        shift -= 64;
+                        // shift is now the number of bits of the previous value held by this long.
+                        if (shift > 0) {
+                            // The previous value straddles two longs: add its high bits from this one
+                            out[outpos + i - 1] |= (in[inpos + bucket] << (bit - shift) & ((1L << bit) - 1));
+                        }
+                    }
+                    out[outpos + i] = ((in[inpos + bucket] >>> shift) & ((1L << bit) - 1));
+                    // Each output keeps only its low "bit" bits; move on to the next value.
+                    shift += bit;
+                }
+        }
+}
diff --git a/src/main/java/me/lemire/longcompression/LongCODEC.java b/src/main/java/me/lemire/longcompression/LongCODEC.java
new file mode 100644
index 0000000..0951ffd
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongCODEC.java
@@ -0,0 +1,62 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Interface describing a standard CODEC to compress longs.
+ * 
+ * @author Benoit Lacelle
+ * 
+ */
+public interface LongCODEC {
+        /**
+         * Compress longs from one array into another array.
+         *
+         * The inpos and outpos wrappers are updated to reflect how much data
+         * was consumed and produced: if 12 longs (inlength = 12) end up stored
+         * in 3 longs, inpos grows by 12 and outpos grows by 3. IntWrapper is
+         * used so that these positions can be passed by reference.
+         *
+         * @param in
+         *                array holding the longs to compress
+         * @param inpos
+         *                index in the input array where reading starts
+         * @param inlength
+         *                number of longs to compress
+         * @param out
+         *                array receiving the compressed data
+         * @param outpos
+         *                index in the output array where writing starts
+         */
+        void compress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos);
+
+        /**
+         * Uncompress data from one array into another array.
+         *
+         * The inpos and outpos wrappers are updated to the new positions
+         * reached after reading and writing.
+         *
+         * @param in
+         *                array holding the compressed data
+         * @param inpos
+         *                index in the input array where reading starts
+         * @param inlength
+         *                length of the compressed data (some schemes ignore
+         *                it)
+         * @param out
+         *                array receiving the uncompressed longs
+         * @param outpos
+         *                index in the output array where writing starts
+         */
+        void uncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos);
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongComposition.java b/src/main/java/me/lemire/longcompression/LongComposition.java
new file mode 100644
index 0000000..5111a51
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongComposition.java
@@ -0,0 +1,71 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Helper class to compose schemes.
+ * 
+ * @author Benoit Lacelle
+ */
+public class LongComposition implements LongCODEC {
+        LongCODEC F1, F2;
+
+        /**
+         * Compose a scheme from a first codec (f1) and a second codec (f2):
+         * f1 runs first, then f2 handles whatever input f1 left unprocessed.
+         *
+         * By convention, f1 should produce no output when the first long it
+         * meets during decoding is zero.
+         *
+         * @param f1
+         *                first codec
+         * @param f2
+         *                second codec
+         */
+        public LongComposition(LongCODEC f1, LongCODEC f2) {
+                this.F1 = f1;
+                this.F2 = f2;
+        }
+
+        @Override
+        public void compress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            if (inlength == 0) {
+                return;
+            }
+            final int startIn = inpos.get();
+            final int startOut = outpos.get();
+            F1.compress(in, inpos, inlength, out, outpos);
+            if (outpos.get() == startOut) {
+                // F1 wrote nothing: emit a single zero long in its place.
+                out[startOut] = 0;
+                outpos.increment();
+            }
+            final int consumed = inpos.get() - startIn;
+            F2.compress(in, inpos, inlength - consumed, out, outpos);
+        }
+
+        @Override
+        public void uncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            if (inlength == 0) {
+                return;
+            }
+            final int startIn = inpos.get();
+            F1.uncompress(in, inpos, inlength, out, outpos);
+            final int consumed = inpos.get() - startIn;
+            F2.uncompress(in, inpos, inlength - consumed, out, outpos);
+        }
+
+        @Override
+        public String toString() {
+            return String.join(" + ", F1.toString(), F2.toString());
+        }
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongCompressor.java b/src/main/java/me/lemire/longcompression/LongCompressor.java
new file mode 100644
index 0000000..246647f
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongCompressor.java
@@ -0,0 +1,68 @@
+package me.lemire.longcompression;
+
+import java.util.Arrays;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * This is a convenience class that wraps a codec to provide
+ * a "friendly" API.
+ *
+ * @author Benoit Lacelle
+ */
+public class LongCompressor {
+
+    SkippableLongCODEC codec;
+
+    /**
+     * Build a compressor on top of the given codec.
+     *
+     * @param c the underlying codec
+     */
+    public LongCompressor(SkippableLongCODEC c) {
+        this.codec = c;
+    }
+
+    /**
+     * Build a compressor on top of binary packing followed by variable byte.
+     */
+    public LongCompressor() {
+        this(new SkippableLongComposition(new LongBinaryPacking(),
+                new LongVariableByte()));
+    }
+
+    /**
+     * Compress an array and return the compressed data as a new array.
+     *
+     * @param input array to be compressed
+     * @return compressed array, whose first long is the length of the input
+     */
+    public long[] compress(long[] input) {
+        // One extra slot up front records the input length for uncompress().
+        final int capacity = 1 + codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length);
+        final long[] buffer = new long[capacity];
+        buffer[0] = input.length;
+        final IntWrapper written = new IntWrapper(1);
+        codec.headlessCompress(input, new IntWrapper(0), input.length, buffer, written);
+        // Trim the worst-case buffer down to what was actually written.
+        return Arrays.copyOf(buffer, written.intValue());
+    }
+
+    /**
+     * Uncompress an array produced by {@link #compress(long[])}.
+     *
+     * @param compressed compressed array
+     * @return uncompressed array
+     */
+    public long[] uncompress(long[] compressed) {
+        // Slot 0 holds the length of the original input (see compress()).
+        final int originalLength = (int) compressed[0];
+        final long[] result = new long[originalLength];
+        final IntWrapper readPos = new IntWrapper(1);
+        final int payloadLength = compressed.length - readPos.intValue();
+        codec.headlessUncompress(compressed, readPos, payloadLength,
+                result, new IntWrapper(0), result.length);
+        return result;
+    }
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongJustCopy.java b/src/main/java/me/lemire/longcompression/LongJustCopy.java
new file mode 100644
index 0000000..95abc1e
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongJustCopy.java
@@ -0,0 +1,58 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * @author Benoit Lacelle
+ * 
+ */
+public final class LongJustCopy implements LongCODEC, SkippableLongCODEC {
+
+        @Override
+        public void compress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            headlessCompress(in, inpos, inlength, out, outpos);
+        }
+
+        @Override
+        public void uncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            headlessUncompress(in, inpos, inlength, out, outpos, inlength);
+        }
+
+        @Override
+        public void headlessCompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos) {
+            System.arraycopy(in, inpos.get(), out, outpos.get(), inlength);
+            inpos.add(inlength);
+            outpos.add(inlength);
+        }
+
+        @Override
+        public void headlessUncompress(long[] in, IntWrapper inpos, int inlength,
+                long[] out, IntWrapper outpos, int num) {
+            // Copies num longs back; inlength is not used.
+            System.arraycopy(in, inpos.get(), out, outpos.get(), num);
+            inpos.add(num);
+            outpos.add(num);
+        }
+
+        @Override
+        public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+            compressedPositions.add(inlength);
+            return inlength;
+        }
+
+        @Override
+        public String toString() {
+            return getClass().getSimpleName();
+        }
+
+}
diff --git a/src/main/java/me/lemire/longcompression/LongUtil.java b/src/main/java/me/lemire/longcompression/LongUtil.java
new file mode 100644
index 0000000..7bdce83
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongUtil.java
@@ -0,0 +1,52 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+/**
+ * These are unofficial helpers related to long compression
+ * 
+ * @author Benoit Lacelle
+ *
+ */
+@Deprecated
+public class LongUtil {
+
+    /**
+     * Compute the number of bits needed to store every value of a range,
+     * i.e. the position of the highest bit set in any of these values
+     * (the result of bits() on the bitwise OR of the values).
+     *
+     * @param i source array
+     * @param pos starting position
+     * @param length number of longs to consider
+     * @return number of bits, from 0 to 64
+     */
+    public static int maxbits(long[] i, int pos, int length) {
+        long accumulated = 0;
+        for (int k = pos, end = pos + length; k < end; ++k)
+            accumulated |= i[k];
+        return bits(accumulated);
+    }
+
+    /**
+     * Compute the number of bits needed to store a value: 64 minus its
+     * number of leading zeros, hence 0 for zero and 64 for negative values.
+     *
+     * @param i source value
+     * @return number of bits, from 0 to 64
+     */
+    public static int bits(long i) {
+        return Long.SIZE - Long.numberOfLeadingZeros(i);
+    }
+
+    /** Format a long as 64 binary digits, padded on the left with zeros. */
+    protected static String longToBinaryWithLeading(long l) {
+        final String padded = String.format("%64s", Long.toBinaryString(l));
+        return padded.replace(' ', '0');
+    }
+}
diff --git a/src/main/java/me/lemire/longcompression/LongVariableByte.java b/src/main/java/me/lemire/longcompression/LongVariableByte.java
new file mode 100644
index 0000000..63c194b
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/LongVariableByte.java
@@ -0,0 +1,348 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+package me.lemire.longcompression;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.LongBuffer;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Implementation of variable-byte. For best performance, use it using the
+ * ByteLongCODEC interface.
+ * 
+ * Note that this does not use differential coding: if you are working on sorted
+ * lists, you must compute the deltas separately.
+ * 
+ * @author Benoit Lacelle
+ */
+public class LongVariableByte implements LongCODEC, ByteLongCODEC, SkippableLongCODEC {
+    private static final int MAX_BYTES_PER_INT = 10;
+
+    private static byte extract7bits(int i, long val) {
+        return (byte) ((val >>> (i * 7)) & 0x7F);
+    }
+
+    private static byte extract7bitsmaskless(int i, long val) {
+        return (byte) (val >>> (i * 7));
+    }
+    @Override
+    public void compress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos) {
+        headlessCompress(in, inpos, inlength, out, outpos); // plain delegation: both entry points produce the same output
+    }
+
+    @Override
+    public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos) {
+        if (inlength == 0)
+            return;
+        // Worst case: a long is encoded on 10 bytes, rounded up here to 2 longs (16 bytes) per input long
+        ByteBuffer buf = makeBuffer(inlength * 16);
+        buf.order(ByteOrder.LITTLE_ENDIAN);
+        for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
+            final long val = in[k];
+            if (val >= 0 && val < (1 << 7)) {
+                buf.put((byte) (val | (1 << 7)));
+            } else if (val >= 0 && val < (1 << 14)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) (extract7bitsmaskless(1, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1 << 21)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) (extract7bitsmaskless(2, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1 << 28)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) (extract7bitsmaskless(3, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1L << 35)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) (extract7bitsmaskless(4, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1L << 42)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) extract7bits(4, val));
+                buf.put((byte) (extract7bitsmaskless(5, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1L << 49)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) extract7bits(4, val));
+                buf.put((byte) extract7bits(5, val));
+                buf.put((byte) (extract7bitsmaskless(6, (val)) | (1 << 7)));
+            } else if (val >= 0 && val < (1L << 56)) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) extract7bits(4, val));
+                buf.put((byte) extract7bits(5, val));
+                buf.put((byte) extract7bits(6, val));
+                buf.put((byte) (extract7bitsmaskless(7, (val)) | (1 << 7)));
+            } else if (val >= 0) {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) extract7bits(4, val));
+                buf.put((byte) extract7bits(5, val));
+                buf.put((byte) extract7bits(6, val));
+                buf.put((byte) extract7bits(7, val));
+                buf.put((byte) (extract7bitsmaskless(8, (val)) | (1 << 7)));
+            } else {
+                buf.put((byte) extract7bits(0, val));
+                buf.put((byte) extract7bits(1, val));
+                buf.put((byte) extract7bits(2, val));
+                buf.put((byte) extract7bits(3, val));
+                buf.put((byte) extract7bits(4, val));
+                buf.put((byte) extract7bits(5, val));
+                buf.put((byte) extract7bits(6, val));
+                buf.put((byte) extract7bits(7, val));
+                buf.put((byte) extract7bits(8, val));
+                buf.put((byte) (extract7bitsmaskless(9, (val)) | (1 << 7)));
+            }
+        }
+        while (buf.position() % 8 != 0)
+            buf.put((byte) 0);
+        final int length = buf.position();
+        buf.flip();
+        LongBuffer ibuf = buf.asLongBuffer();
+        ibuf.get(out, outpos.get(), length / 8);
+        outpos.add(length / 8);
+        inpos.add(inlength);
+    }
+
+    @Override
+    public void compress(long[] in, IntWrapper inpos, int inlength, byte[] out,
+            IntWrapper outpos) {
+        if (inlength == 0)
+            return;
+        int outpostmp = outpos.get();
+        for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
+            final long val = in[k];
+            if (val >= 0 && val < (1 << 7)) {
+                out[outpostmp++] = (byte) (val | (1 << 7));
+            } else if (val >= 0 && val < (1 << 14)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(1, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1 << 21)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(2, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1 << 28)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(3, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1L << 35)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(4, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1L << 42)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) extract7bits(4, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(5, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1L << 49)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) extract7bits(4, val);
+                out[outpostmp++] = (byte) extract7bits(5, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(6, (val)) | (1 << 7));
+            } else if (val >= 0 && val < (1L << 56)) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) extract7bits(4, val);
+                out[outpostmp++] = (byte) extract7bits(5, val);
+                out[outpostmp++] = (byte) extract7bits(6, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(7, (val)) | (1 << 7));
+            } else if (val >= 0) {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) extract7bits(4, val);
+                out[outpostmp++] = (byte) extract7bits(5, val);
+                out[outpostmp++] = (byte) extract7bits(6, val);
+                out[outpostmp++] = (byte) extract7bits(7, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(8, (val)) | (1 << 7));
+            } else {
+                out[outpostmp++] = (byte) extract7bits(0, val);
+                out[outpostmp++] = (byte) extract7bits(1, val);
+                out[outpostmp++] = (byte) extract7bits(2, val);
+                out[outpostmp++] = (byte) extract7bits(3, val);
+                out[outpostmp++] = (byte) extract7bits(4, val);
+                out[outpostmp++] = (byte) extract7bits(5, val);
+                out[outpostmp++] = (byte) extract7bits(6, val);
+                out[outpostmp++] = (byte) extract7bits(7, val);
+                out[outpostmp++] = (byte) extract7bits(8, val);
+                out[outpostmp++] = (byte) (extract7bitsmaskless(9, (val)) | (1 << 7));
+            }
+        }
+        outpos.set(outpostmp);
+        inpos.add(inlength);
+    }
+
+    @Override
+    public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos) { // decodes variable-byte data read byte by byte from inlength longs
+        int s = 0; // bit offset of the current byte inside in[p] (0, 8, ..., 56)
+        long val = 0;
+        int p = inpos.get();
+        int finalp = inpos.get() + inlength;
+        int tmpoutpos = outpos.get();
+        for (long v = 0, shift = 0; p < finalp;) { // v: value being rebuilt; shift: bit position of its next 7-bit group
+            val = in[p];
+            long c = (byte) (val >>> s);
+            // Advance to the next byte
+            s += 8;
+            // Move to the next long once its 8 bytes have been read (s == 64)
+            p += s>>6;
+            // Wrap s from 64 back to 0
+            s = s & 63;
+            v += ((c & 127) << shift);
+            if ((c & 128) == 128) { // high bit set: this byte ends the value
+                out[tmpoutpos++] = v;
+                v = 0;
+                shift = 0;
+            } else
+                shift += 7;
+            assert shift < 64;
+        }
+        outpos.set(tmpoutpos);
+        inpos.add(inlength);
+    }
+
+    /**
+     * Uncompress variable-byte data held in a byte array.
+     *
+     * Each value is read as groups of 7 bits, least significant group first,
+     * one group per byte; a byte with its high bit set ends the value. At
+     * most 10 bytes are read per value. Values are decoded for as long as
+     * the read position is below the initial inpos plus inlength.
+     *
+     * @param in
+     *            array containing data in compressed form
+     * @param inpos
+     *            where to start reading in the array; set on return to the
+     *            index just past the last byte read
+     * @param inlength
+     *            number of bytes to uncompress
+     * @param out
+     *            array where to write the uncompressed output
+     * @param outpos
+     *            where to start writing in out; set on return to the index
+     *            just past the last long written
+     */
+    @Override
+    public void uncompress(byte[] in, IntWrapper inpos, int inlength,
+            long[] out, IntWrapper outpos) {
+        int p = inpos.get();
+        int finalp = inpos.get() + inlength;
+        int tmpoutpos = outpos.get();
+        // The loop's update clause stores v, so every "continue" emits a value.
+        for (long v = 0; p < finalp; out[tmpoutpos++] = v) {
+            v = in[p] & 0x7F;
+            if (in[p] < 0) { p += 1; continue; }
+            v = ((in[p + 1] & 0x7F) << 7) | v;
+            if (in[p + 1] < 0) { p += 2; continue; }
+            v = ((in[p + 2] & 0x7F) << 14) | v;
+            if (in[p + 2] < 0) { p += 3; continue; }
+            v = ((in[p + 3] & 0x7F) << 21) | v;
+            if (in[p + 3] < 0) { p += 4; continue; }
+            // From the fifth byte on, the shifted group no longer fits in an int.
+            v = (((long) in[p + 4] & 0x7F) << 28) | v;
+            if (in[p + 4] < 0) { p += 5; continue; }
+            v = (((long) in[p + 5] & 0x7F) << 35) | v;
+            if (in[p + 5] < 0) { p += 6; continue; }
+            v = (((long) in[p + 6] & 0x7F) << 42) | v;
+            if (in[p + 6] < 0) { p += 7; continue; }
+            v = (((long) in[p + 7] & 0x7F) << 49) | v;
+            if (in[p + 7] < 0) { p += 8; continue; }
+            v = (((long) in[p + 8] & 0x7F) << 56) | v;
+            if (in[p + 8] < 0) { p += 9; continue; }
+            // Tenth byte: only its lowest bit survives the shift (bit 63).
+            v = (((long) in[p + 9] & 0x7F) << 63) | v;
+            p += 10;
+        }
+        outpos.set(tmpoutpos);
+        // p started at inpos.get(), so it is already an absolute index into
+        // in: it must be set, not added, otherwise a non-zero starting
+        // position would be counted twice.
+        inpos.set(p);
+    }
+
+    @Override
+    public String toString() {
+        return getClass().getSimpleName();
+    }
+
+    @Override
+    public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos, int num) { // decodes exactly num values; inlength is not read
+        int s = 0; // bit offset of the current byte inside in[p] (0, 8, ..., 56)
+        long val = 0;
+        int p = inpos.get();
+        int tmpoutpos = outpos.get();
+        int finaloutpos = num + tmpoutpos;
+        for (long v = 0, shift = 0; tmpoutpos < finaloutpos;) { // v: value being rebuilt; shift: bit position of its next 7-bit group
+            val = in[p];
+            long c = val >>> s;
+            // Advance to the next byte
+            s += 8;
+            // Move to the next long once its 8 bytes have been read (s == 64)
+            p += s>>6;
+            // Wrap s from 64 back to 0
+            s = s & 63;
+            v += ((c & 127) << shift);
+            if ((c & 128) == 128) { // high bit set: this byte ends the value
+                out[tmpoutpos++] = v;
+                v = 0;
+                shift = 0;
+            } else
+                shift += 7;
+            assert shift < 64;
+        }
+        outpos.set(tmpoutpos);
+        inpos.set(p + (s!=0 ? 1 : 0)); // a partially read long counts as consumed: its remaining bytes are skipped
+    }
+
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        compressedPositions.add(inlength);
+        // Worst-case size in bytes, then rounded up to a whole number of longs.
+        final int worstCaseBytes = MAX_BYTES_PER_INT * inlength;
+        return (worstCaseBytes + (Long.BYTES - 1)) / Long.BYTES;
+    }
+
+    /**
+     * Creates a new buffer of the requested size. Override this method to
+     * allocate buffers differently; by default a new direct
+     * {@link ByteBuffer} is allocated on each invocation.
+     * @param sizeInBytes capacity of the buffer to create, in bytes
+     * @return the newly allocated buffer
+     */
+    protected ByteBuffer makeBuffer(int sizeInBytes) {
+        return ByteBuffer.allocateDirect(sizeInBytes);
+    }
+}
diff --git a/src/main/java/me/lemire/longcompression/RoaringIntPacking.java b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java
new file mode 100644
index 0000000..d6b6baa
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java
@@ -0,0 +1,46 @@
+/*
+ * (c) the authors Licensed under the Apache License, Version 2.0.
+ */
+package me.lemire.longcompression;
+
+/**
+ * Holds the logic for packing two integers into a long and for splitting a long into two integers.
+ * In RoaringBitmap's {@code Roaring64NavigableMap}, the implementation splits each input long into
+ * two integers: one is used as the key of a NavigableMap while the other is added to a bitmap.
+ * 
+ * @author Benoit Lacelle
+ *
+ */
+// Duplicated from RoaringBitmap
+class RoaringIntPacking {
+
+  /**
+   * Extracts the upper half of a long.
+   * @param id any long, positive or negative
+   * @return the 32 most significant bits of {@code id}, as an int
+   */
+  public static int high(long id) {
+    return (int) (id >> Integer.SIZE);
+  }
+
+  /**
+   * Extracts the lower half of a long.
+   * @param id any long, positive or negative
+   * @return the 32 least significant bits of {@code id}, as an int
+   */
+  public static int low(long id) {
+    return (int) (id & 0xffffffffL);
+  }
+
+  /**
+   * Rebuilds a long from its two halves (the inverse of high and low).
+   * @param high an integer holding the 32 most significant bits of the output long
+   * @param low an integer holding the 32 least significant bits of the output long
+   * @return a long packing together the integers as computed by
+   *         {@link RoaringIntPacking#high(long)} and {@link RoaringIntPacking#low(long)}
+   */
+  // https://stackoverflow.com/questions/12772939/java-storing-two-ints-in-a-long
+  public static long pack(int high, int low) {
+    return ((long) high << Integer.SIZE) | Integer.toUnsignedLong(low);
+  }
+}
diff --git a/src/main/java/me/lemire/longcompression/SkippableLongCODEC.java b/src/main/java/me/lemire/longcompression/SkippableLongCODEC.java
new file mode 100644
index 0000000..33fd562
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/SkippableLongCODEC.java
@@ -0,0 +1,87 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Interface describing a standard CODEC to compress longs. This is a
+ * variation on the LongCODEC interface meant to be used for random access
+ * (i.e., given a large array, you can segment it and decode just the subarray you need).
+ * 
+ * The main difference is that we must specify the number of longs we wish to
+ * decode. This information should be stored elsewhere.
+ * 
+ * This interface was designed by the Terrier team for their search engine.
+ * 
+ * @author Benoit Lacelle
+ * 
+ */
+public interface SkippableLongCODEC {
+    /**
+     * Compress data from an array to another array.
+     *
+     * Both inpos and outpos are modified to represent how much data was read
+     * and written. If 12 longs (inlength = 12) are compressed to 3 longs, then
+     * inpos will be incremented by 12 while outpos will be incremented by 3. We
+     * use IntWrapper to pass the values by reference.
+     *
+     * @param in
+     *            input array
+     * @param inpos
+     *            where to start reading in the input array (updated by the call)
+     * @param inlength
+     *            how many longs to compress
+     * @param out
+     *            output array
+     * @param outpos
+     *            where to start writing in the output array (updated by the call)
+     */
+    public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos);
+
+    /**
+     * Uncompress data from an array to another array.
+     *
+     * Both inpos and outpos parameters are modified to indicate new positions
+     * after read/write.
+     *
+     * @param in
+     *            array containing data in compressed form
+     * @param inpos
+     *            where to start reading in the compressed array (updated by the call)
+     * @param inlength
+     *            length of the compressed data (ignored by some schemes)
+     * @param out
+     *            array where to write the uncompressed output
+     * @param outpos
+     *            where to start writing the uncompressed output in out (updated by the call)
+     * @param num
+     *            number of longs we want to decode, the actual number of longs decoded can be less
+     */
+    public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos, int num);
+
+    /**
+     * Compute the maximum number of longs that might be required to store
+     * the compressed form of a given input array segment, without headers.
+     * <p>
+     * This is useful to pre-allocate the output buffer before calling
+     * {@link #headlessCompress(long[], IntWrapper, int, long[], IntWrapper)}.
+     * </p>
+     *
+     * @param compressedPositions
+     *        since not all schemes compress every input long, this parameter is
+     *        advanced by the number of input longs that will actually be compressed.
+     *        This is useful when composing multiple schemes.
+     * @param inlength
+     *            number of longs to be compressed
+     * @return the maximum number of longs needed in the output array
+     */
+    int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
+}
diff --git a/src/main/java/me/lemire/longcompression/SkippableLongComposition.java b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java
new file mode 100644
index 0000000..eb03b72
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/SkippableLongComposition.java
@@ -0,0 +1,82 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+package me.lemire.longcompression;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Helper class to compose schemes.
+ * 
+ * @author Benoit Lacelle
+ */
+public class SkippableLongComposition implements SkippableLongCODEC {
+    SkippableLongCODEC F1, F2;
+
+    /**
+     * Builds a codec that chains two schemes: f1 runs first, then f2 is
+     * applied to whatever input f1 left unprocessed.
+     *
+     * When f1 writes nothing, a single zero long is stored in its place and
+     * stepped over again while decoding, so by convention f1 should produce
+     * no output when the first long it decodes is zero.
+     *
+     * @param f1
+     *            first codec
+     * @param f2
+     *            second codec
+     */
+    public SkippableLongComposition(SkippableLongCODEC f1,
+            SkippableLongCODEC f2) {
+        this.F1 = f1;
+        this.F2 = f2;
+    }
+
+    @Override
+    public void headlessCompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos) {
+        final int inStart = inpos.get();
+        final int outStart = outpos.get();
+        F1.headlessCompress(in, inpos, inlength, out, outpos);
+        if (outStart == outpos.get()) { // F1 wrote nothing: store a zero long in its place
+            out[outStart] = 0;
+            outpos.increment();
+        }
+        final int remaining = inlength - (inpos.get() - inStart);
+        F2.headlessCompress(in, inpos, remaining, out, outpos);
+    }
+
+    @Override
+    public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] out,
+            IntWrapper outpos, int num) {
+        final int inStart = inpos.get();
+        final int outStart = outpos.get();
+        F1.headlessUncompress(in, inpos, inlength, out, outpos, num);
+        // F1 consumed nothing: step over the single long stored in its place.
+        if (inStart == inpos.get()) {
+            inpos.increment();
+        }
+        final int consumed = inpos.get() - inStart;
+        final int decoded = outpos.get() - outStart;
+        F2.headlessUncompress(in, inpos, inlength - consumed, out, outpos, num - decoded);
+    }
+
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int start = compressedPositions.get();
+        // One extra long is budgeted after F1's bound for a potential F2 header.
+        // NOTE(review): confirm this header is actually needed in the headless version.
+        final int firstBound = F1.maxHeadlessCompressedLength(compressedPositions, inlength) + 1;
+        final int leftover = inlength - (compressedPositions.get() - start);
+        return firstBound + F2.maxHeadlessCompressedLength(compressedPositions, leftover);
+    }
+
+    @Override
+    public String toString() {
+        final String first = F1.toString();
+        return first + "+" + F2.toString();
+    }
+}
diff --git a/src/main/java/me/lemire/longcompression/differential/LongDelta.java b/src/main/java/me/lemire/longcompression/differential/LongDelta.java
new file mode 100644
index 0000000..8399f94
--- /dev/null
+++ b/src/main/java/me/lemire/longcompression/differential/LongDelta.java
@@ -0,0 +1,150 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression.differential;
+
+/**
+ * Generic class to compute differential coding.
+ * 
+ * @author Benoit Lacelle
+ * 
+ */
+public final class LongDelta {
+
+        /**
+         * Apply differential coding (in-place): every element but the first
+         * is replaced by its difference with the preceding element.
+         *
+         * @param data data to be modified
+         */
+        public static void delta(long[] data) {
+                for (int i = data.length - 1; i > 0; --i) {
+                        data[i] -= data[i - 1];
+                }
+        }
+
+        /**
+         * Apply differential coding (in-place) given an initial value.
+         * When length is not positive, data is left untouched and init is
+         * returned.
+         *
+         * @param data data to be modified
+         * @param start starting index
+         * @param length number of longs to process
+         * @param init initial value (a long, so that the value returned by a
+         *                previous call can be passed back without truncation)
+         * @return next initial value
+         */
+        public static long delta(long[] data, int start, int length, long init) {
+                if (length <= 0) {
+                        return init;
+                }
+                final long nextinit = data[start + length - 1];
+                for (int i = length - 1; i > 0; --i) {
+                        data[start + i] -= data[start + i - 1];
+                }
+                data[start] -= init;
+                return nextinit;
+        }
+
+        /**
+         * Compute differential coding given an initial value. Output is written
+         * to a provided array: must have length "length" or better. When length
+         * is not positive, nothing is written and init is returned.
+         *
+         * @param data input data
+         * @param start starting index
+         * @param length number of longs to process
+         * @param init initial value
+         * @param out output array
+         * @return next initial value
+         */
+        public static long delta(long[] data, int start, int length, long init,
+                long[] out) {
+                if (length <= 0) {
+                        return init;
+                }
+                for (int i = length - 1; i > 0; --i) {
+                        out[i] = data[start + i] - data[start + i - 1];
+                }
+                out[0] = data[start] - init;
+                return data[start + length - 1];
+        }
+
+        /**
+         * Undo differential coding (in-place). Effectively computes a prefix
+         * sum.
+         *
+         * @param data to be modified
+         */
+        public static void inverseDelta(long[] data) {
+                for (int i = 1; i < data.length; ++i) {
+                        data[i] += data[i - 1];
+                }
+        }
+
+        /**
+         * Undo differential coding (in-place). Effectively computes a prefix
+         * sum. Like inverseDelta, only faster.
+         *
+         * @param data to be modified
+         */
+        public static void fastinverseDelta(long[] data) {
+                int sz0 = data.length / 4 * 4;
+                int i = 1;
+                if (sz0 >= 4) {
+                        long a = data[0];
+                        for (; i < sz0 - 4; i += 4) {
+                                a = data[i] += a;
+                                a = data[i + 1] += a;
+                                a = data[i + 2] += a;
+                                a = data[i + 3] += a;
+                        }
+                }
+
+                for (; i < data.length; ++i) {
+                        data[i] += data[i - 1];
+                }
+        }
+
+        /**
+         * Undo differential coding (in-place). Effectively computes a prefix
+         * sum. Like inverseDelta, only faster. Uses an initial value. When
+         * length is not positive, data is left untouched and init is returned.
+         *
+         * @param data to be modified
+         * @param start starting index
+         * @param length number of longs to process
+         * @param init initial value
+         * @return next initial value
+         */
+        public static long fastinverseDelta(long[] data, int start, int length,
+                long init) {
+                if (length <= 0) {
+                        return init;
+                }
+                data[start] += init;
+                int sz0 = length / 4 * 4;
+                int i = 1;
+                if (sz0 >= 4) {
+                        long a = data[start];
+                        // Unrolled by four; the loop after this block finishes the tail.
+                        for (; i < sz0 - 4; i += 4) {
+                                a = data[start + i] += a;
+                                a = data[start + i + 1] += a;
+                                a = data[start + i + 2] += a;
+                                a = data[start + i + 3] += a;
+                        }
+                }
+
+                for (; i < length; ++i) {
+                        data[start + i] += data[start + i - 1];
+                }
+                return data[start + length - 1];
+        }
+
+}
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
new file mode 100644
index 0000000..f134601
--- /dev/null
+++ b/src/main/java/module-info.java
@@ -0,0 +1,12 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+module me.lemire.integercompression {
+  // Disabled by default (currently only for advanced users):
+  // requires jdk.incubator.vector;
+  exports me.lemire.integercompression;
+  exports me.lemire.integercompression.differential;
+  // exports me.lemire.integercompression.vector;
+  exports me.lemire.longcompression;
+  exports me.lemire.longcompression.differential;
+}
diff --git a/src/test/java/me/lemire/integercompression/AdhocTest.java b/src/test/java/me/lemire/integercompression/AdhocTest.java
index bced6c0..ee911b3 100644
--- a/src/test/java/me/lemire/integercompression/AdhocTest.java
+++ b/src/test/java/me/lemire/integercompression/AdhocTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import org.junit.Assert;
@@ -15,13 +22,48 @@
 @SuppressWarnings({  "static-method" })
 public class AdhocTest {
 
-    
-    /**
-     * 
-     */
+    @Test
+    public void testIssue59() {
+        final FastPFOR128 fastpfor = new FastPFOR128();
+
+        final int N = 9984;
+        final int[] data = new int[N];
+        // Sparse input: every 150th slot stores its own index, the rest stay zero.
+        for (int i = 0; i < N; i += 150) {
+            data[i] = i;
+        }
+
+        // The same round trip is run twice on one codec instance.
+        assertIssue59RoundTrip(fastpfor, data);
+        assertIssue59RoundTrip(fastpfor, data);
+    }
+
+    /**
+     * Compresses data into a fresh buffer with the given codec, uncompresses
+     * it again and asserts that the recovered values equal the input.
+     *
+     * @param fastpfor codec under test
+     * @param data values to round-trip
+     */
+    private static void assertIssue59RoundTrip(FastPFOR128 fastpfor, int[] data) {
+        final int N = data.length;
+        final int[] compressed = new int[N + 1024];
+        final IntWrapper readPos = new IntWrapper(0);
+        final IntWrapper writePos = new IntWrapper(0);
+        fastpfor.compress(data, readPos, N, compressed, writePos);
+        final int compressedSize = writePos.get();
+
+        // Decode from the start of the compressed buffer into a fresh array.
+        final int[] recovered = new int[N];
+        final IntWrapper compressedPos = new IntWrapper(0);
+        final IntWrapper recoveredPos = new IntWrapper(0);
+        fastpfor.uncompress(compressed, compressedPos, compressedSize, recovered, recoveredPos);
+        Assert.assertArrayEquals(data, recovered);
+    }
+
     @Test
     public void testIssue29() {
-    	    for(int x = 0; x < 64; x++) {
+        for(int x = 0; x < 64; x++) {
           int[] a = {2, 3, 4, 5};
           int[] b = new int[90];
           int[] c = new int[a.length];
@@ -35,7 +77,7 @@ public void testIssue29() {
           IntWrapper cOffset = new IntWrapper(0);
           codec.uncompress(b, bOffset, len, c, cOffset);
           Assert.assertArrayEquals(a,c);
-    	    }
+        }
     }
     
     /**
@@ -43,20 +85,20 @@ public void testIssue29() {
      */
     @Test
     public void testIssue29b() {
-    	    for(int x = 0; x < 64; x++) {
-          int[] a = {2, 3, 4, 5};
-          int[] b = new int[90];
-          int[] c = new int[a.length];
-          SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
-          IntWrapper aOffset = new IntWrapper(0);
-          IntWrapper bOffset = new IntWrapper(x);
-          codec.headlessCompress(a, aOffset, a.length, b, bOffset);
-          int len = bOffset.get() - x;
-          bOffset.set(x);
-          IntWrapper cOffset = new IntWrapper(0);
-          codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length);
-          Assert.assertArrayEquals(a,c);
-    	    }
+        for(int x = 0; x < 64; x++) {
+            SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
+            int[] a = {2, 3, 4, 5};
+            int[] b = new int[x + codec.maxHeadlessCompressedLength(new IntWrapper(0), a.length)];
+            int[] c = new int[a.length];
+            IntWrapper aOffset = new IntWrapper(0);
+            IntWrapper bOffset = new IntWrapper(x);
+            codec.headlessCompress(a, aOffset, a.length, b, bOffset);
+            int len = bOffset.get() - x;
+            bOffset.set(x);
+            IntWrapper cOffset = new IntWrapper(0);
+            codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length);
+            Assert.assertArrayEquals(a,c);
+        }
     }
     
 
@@ -64,30 +106,27 @@ public void testIssue29b() {
      * 
      */
     @Test
-	public void testIssue41() {
-		for (int x = 0; x < 64; x++) {
-			int[] a = { 2, 3, 4, 5 };
-			int[] b = new int[90];
-			int[] c = new int[a.length];
-			SkippableIntegratedIntegerCODEC codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
-					new IntegratedVariableByte());
-			IntWrapper aOffset = new IntWrapper(0);
-			IntWrapper bOffset = new IntWrapper(x);
-			IntWrapper initValue = new IntWrapper(0);
-
-			codec.headlessCompress(a, aOffset, a.length, b, bOffset, initValue);
-			int len = bOffset.get() - x;
-			bOffset.set(x);
-			IntWrapper cOffset = new IntWrapper(0);
-			initValue = new IntWrapper(0);
-			codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length, initValue);
-			Assert.assertArrayEquals(a, c);
-		}
-	}
+    public void testIssue41() {
+        for (int x = 0; x < 64; x++) { // x is the write offset into the compressed array
+            SkippableIntegratedIntegerCODEC codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
+                    new IntegratedVariableByte());
+            int[] a = { 2, 3, 4, 5 };
+            int[] b = new int[x + codec.maxHeadlessCompressedLength(new IntWrapper(0), a.length)]; // offset plus the codec's bound
+            int[] c = new int[a.length];
+            IntWrapper aOffset = new IntWrapper(0);
+            IntWrapper bOffset = new IntWrapper(x);
+            IntWrapper initValue = new IntWrapper(0);
+            // Compress a into b starting at offset x, then decode from that same offset.
+            codec.headlessCompress(a, aOffset, a.length, b, bOffset, initValue);
+            int len = bOffset.get() - x; // number of ints written by the compression
+            bOffset.set(x);
+            IntWrapper cOffset = new IntWrapper(0);
+            initValue = new IntWrapper(0); // decoding starts from a fresh initial value of 0
+            codec.headlessUncompress(b, bOffset, len, c, cOffset, a.length, initValue);
+            Assert.assertArrayEquals(a, c);
+        }
+    }
  
-    /**
-     * a test
-     */
     @Test
     public void biggerCompressedArray0() {
         // No problem: for comparison.
@@ -95,12 +134,8 @@ public void biggerCompressedArray0() {
         assertSymmetry(c, 0, 16384);
         c = new Composition(new FastPFOR(), new VariableByte());
         assertSymmetry(c, 0, 16384);
-
     }
 
-    /**
-     * a test
-     */
     @Test
     public void biggerCompressedArray1() {
         // Compressed array is bigger than original, because of VariableByte.
@@ -108,9 +143,6 @@ public void biggerCompressedArray1() {
         assertSymmetry(c, -1);
     }
 
-    /**
-     * a test
-     */
     @Test
     public void biggerCompressedArray2() {
         // Compressed array is bigger than original, because of Composition.
diff --git a/src/test/java/me/lemire/integercompression/BasicTest.java b/src/test/java/me/lemire/integercompression/BasicTest.java
index e88293e..b29ae0d 100644
--- a/src/test/java/me/lemire/integercompression/BasicTest.java
+++ b/src/test/java/me/lemire/integercompression/BasicTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
@@ -22,7 +29,7 @@
  */
 @SuppressWarnings({ "static-method" })
 public class BasicTest {
-    IntegerCODEC[] codecs = {
+    final IntegerCODEC[] codecs = {
             new IntegratedComposition(new IntegratedBinaryPacking(),
                     new IntegratedVariableByte()),
             new JustCopy(),
@@ -41,35 +48,35 @@ public class BasicTest {
             new GroupSimple9(),
             new Composition(new XorBinaryPacking(), new VariableByte()),
             new Composition(new DeltaZigzagBinaryPacking(),
-					new DeltaZigzagVariableByte()) };
+                    new DeltaZigzagVariableByte()) };
 
-	/**
-     * 
+    /**
+     * This tests with a compressed array with various offset
      */
-	@Test
-	public void saulTest() {
-		for (IntegerCODEC C : codecs) {
-			for (int x = 0; x < 50; ++x) {
-				int[] a = { 2, 3, 4, 5 };
-				int[] b = new int[90];
-				int[] c = new int[a.length];
-
-				IntWrapper aOffset = new IntWrapper(0);
-				IntWrapper bOffset = new IntWrapper(x);
-				C.compress(a, aOffset, a.length, b, bOffset);
-				int len = bOffset.get() - x;
-
-				bOffset.set(x);
-				IntWrapper cOffset = new IntWrapper(0);
-				C.uncompress(b, bOffset, len, c, cOffset);
-				if(!Arrays.equals(a, c)) {
-					System.out.println("Problem with "+C);
-				}
-				assertArrayEquals(a, c);
-
-			}
-		}
-	}
+    @Test
+    public void saulTest() {
+        for (IntegerCODEC C : codecs) {
+            for (int x = 0; x < 50; ++x) { // x is the write offset into the compressed array
+                int[] a = { 2, 3, 4, 5 };
+                int[] b = new int[90];
+                int[] c = new int[a.length];
+                // Compress a into b, starting at offset x.
+                IntWrapper aOffset = new IntWrapper(0);
+                IntWrapper bOffset = new IntWrapper(x);
+                C.compress(a, aOffset, a.length, b, bOffset);
+                int len = bOffset.get() - x; // number of ints written by the compression
+                // Rewind to offset x and decode the len ints that were just written.
+                bOffset.set(x);
+                IntWrapper cOffset = new IntWrapper(0);
+                C.uncompress(b, bOffset, len, c, cOffset);
+                if(!Arrays.equals(a, c)) {
+                    System.out.println("Problem with "+C); // name the failing codec before the assertion fires
+                }
+                assertArrayEquals(a, c);
+
+            }
+        }
+    }
     /**
      * 
      */
diff --git a/src/test/java/me/lemire/integercompression/BoundaryTest.java b/src/test/java/me/lemire/integercompression/BoundaryTest.java
index ede2e9f..128b431 100644
--- a/src/test/java/me/lemire/integercompression/BoundaryTest.java
+++ b/src/test/java/me/lemire/integercompression/BoundaryTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
diff --git a/src/test/java/me/lemire/integercompression/ByteBasicTest.java b/src/test/java/me/lemire/integercompression/ByteBasicTest.java
index c2f5b6f..2b2d4f1 100644
--- a/src/test/java/me/lemire/integercompression/ByteBasicTest.java
+++ b/src/test/java/me/lemire/integercompression/ByteBasicTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
@@ -21,32 +28,32 @@ public class ByteBasicTest {
             new IntegratedVariableByte(),
          };
 
-	/**
+    /**
      * 
      */
-	@Test
-	public void saulTest() {
-		for (ByteIntegerCODEC C : codecs) {
-			for (int x = 0; x < 50 * 4; ++x) {
-				int[] a = { 2, 3, 4, 5 };
-				byte[] b = new byte[90*4];
-				int[] c = new int[a.length];
+    @Test
+    public void saulTest() {
+        for (ByteIntegerCODEC C : codecs) {
+            for (int x = 0; x < 50 * 4; ++x) {
+                int[] a = { 2, 3, 4, 5 };
+                byte[] b = new byte[90*4];
+                int[] c = new int[a.length];
 
-				IntWrapper aOffset = new IntWrapper(0);
-				IntWrapper bOffset = new IntWrapper(x);
-				C.compress(a, aOffset, a.length, b, bOffset);
-				int len = bOffset.get() - x;
+                IntWrapper aOffset = new IntWrapper(0);
+                IntWrapper bOffset = new IntWrapper(x);
+                C.compress(a, aOffset, a.length, b, bOffset);
+                int len = bOffset.get() - x;
 
-				bOffset.set(x);
-				IntWrapper cOffset = new IntWrapper(0);
-				C.uncompress(b, bOffset, len, c, cOffset);
-				if(!Arrays.equals(a, c)) {
-					System.out.println("Problem with "+C);
-				}
-				assertArrayEquals(a, c);
-			}
-		}
-	}
+                bOffset.set(x);
+                IntWrapper cOffset = new IntWrapper(0);
+                C.uncompress(b, bOffset, len, c, cOffset);
+                if(!Arrays.equals(a, c)) {
+                    System.out.println("Problem with "+C);
+                }
+                assertArrayEquals(a, c);
+            }
+        }
+    }
     /**
      * 
      */
diff --git a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java
index 5e0923d..ae42c1d 100644
--- a/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java
+++ b/src/test/java/me/lemire/integercompression/DeltaZigzagEncodingTest.java
@@ -1,7 +1,10 @@
-/*
+/**
  * This code is released under the
  * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
  */
+
 package me.lemire.integercompression;
 
 import org.junit.Test;
diff --git a/src/test/java/me/lemire/integercompression/ExampleTest.java b/src/test/java/me/lemire/integercompression/ExampleTest.java
index 300983c..c63c69b 100644
--- a/src/test/java/me/lemire/integercompression/ExampleTest.java
+++ b/src/test/java/me/lemire/integercompression/ExampleTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import me.lemire.integercompression.differential.*;
@@ -10,305 +17,303 @@
  *
  */
 public class ExampleTest {
-	/**
-	 * 
-	 */
-	@Test
-
-	public void superSimpleExample() {
-		IntegratedIntCompressor iic = new IntegratedIntCompressor();
-		int[] data = new int[2342351];
-		for (int k = 0; k < data.length; ++k)
-			data[k] = k;
-		System.out.println("Compressing " + data.length + " integers using friendly interface");
-		int[] compressed = iic.compress(data);
-		int[] recov = iic.uncompress(compressed);
-		System.out
-				.println("compressed from " + data.length * 4 / 1024 + "KB to " + compressed.length * 4 / 1024 + "KB");
-		if (!Arrays.equals(recov, data))
-			throw new RuntimeException("bug");
-	}
-
-	/**
-	 * 
-	 */
-	@Test
-
-	public void basicExample() {
-		int[] data = new int[2342351];
-		System.out.println("Compressing " + data.length + " integers in one go");
-		// data should be sorted for best
-		// results
-		for (int k = 0; k < data.length; ++k)
-			data[k] = k;
-		// Very important: the data is in sorted order!!! If not, you
-		// will get very poor compression with IntegratedBinaryPacking,
-		// you should use another CODEC.
-
-		// next we compose a CODEC. Most of the processing
-		// will be done with binary packing, and leftovers will
-		// be processed using variable byte
-		IntegratedIntegerCODEC codec = new IntegratedComposition(new IntegratedBinaryPacking(),
-				new IntegratedVariableByte());
-		// output vector should be large enough...
-		int[] compressed = new int[data.length + 1024];
-		// compressed might not be large enough in some cases
-		// if you get java.lang.ArrayIndexOutOfBoundsException, try
-		// allocating more memory
-
-		/**
-		 *
-		 * compressing
-		 *
-		 */
-		IntWrapper inputoffset = new IntWrapper(0);
-		IntWrapper outputoffset = new IntWrapper(0);
-		codec.compress(data, inputoffset, data.length, compressed, outputoffset);
-		// got it!
-		// inputoffset should be at data.length but outputoffset tells
-		// us where we are...
-		System.out.println(
-				"compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
-		// we can repack the data: (optional)
-		compressed = Arrays.copyOf(compressed, outputoffset.intValue());
-
-		/**
-		 *
-		 * now uncompressing
-		 *
-		 * This assumes that we otherwise know how many integers have been
-		 * compressed. See basicExampleHeadless for a more general case.
-		 */
-		int[] recovered = new int[data.length];
-		IntWrapper recoffset = new IntWrapper(0);
-		codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset);
-		if (Arrays.equals(data, recovered))
-			System.out.println("data is recovered without loss");
-		else
-			throw new RuntimeException("bug"); // could use assert
-		System.out.println();
-	}
-
-	/**
-	 * Like the basicExample, but we store the input array size manually.
-	 */
-	@Test
-	public void basicExampleHeadless() {
-		int[] data = new int[2342351];
-		System.out.println("Compressing " + data.length + " integers in one go using the headless approach");
-		// data should be sorted for best
-		// results
-		for (int k = 0; k < data.length; ++k)
-			data[k] = k;
-		// Very important: the data is in sorted order!!! If not, you
-		// will get very poor compression with IntegratedBinaryPacking,
-		// you should use another CODEC.
-
-		// next we compose a CODEC. Most of the processing
-		// will be done with binary packing, and leftovers will
-		// be processed using variable byte
-		SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
-				new IntegratedVariableByte());
-		// output vector should be large enough...
-		int[] compressed = new int[data.length + 1024];
-		// compressed might not be large enough in some cases
-		// if you get java.lang.ArrayIndexOutOfBoundsException, try
-		// allocating more memory
-
-		/**
-		 *
-		 * compressing
-		 *
-		 */
-		IntWrapper inputoffset = new IntWrapper(0);
-		IntWrapper outputoffset = new IntWrapper(1);
-		compressed[0] = data.length; // we manually store how many integers we
-		codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0));					
-		// got it!
-		// inputoffset should be at data.length but outputoffset tells
-		// us where we are...
-		System.out.println(
-				"compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
-		// we can repack the data: (optional)
-		compressed = Arrays.copyOf(compressed, outputoffset.intValue());
-
-		/**
-		 *
-		 * now uncompressing
-		 *
-		 */
-		int howmany = compressed[0];// we manually stored the number of
-									// compressed integers
-		int[] recovered = new int[howmany];
-		IntWrapper recoffset = new IntWrapper(0);
-		codec.headlessUncompress(compressed, new IntWrapper(1), compressed.length, recovered, recoffset, howmany, new IntWrapper(0));
-		if (Arrays.equals(data, recovered))
-			System.out.println("data is recovered without loss");
-		else
-			throw new RuntimeException("bug"); // could use assert
-		System.out.println();
-	}
-
-	/**
-	 * This is an example to show you can compress unsorted integers as long as
-	 * most are small.
-	 */
-	@Test
-	public void unsortedExample() {
-		final int N = 1333333;
-		int[] data = new int[N];
-		// initialize the data (most will be small
-		for (int k = 0; k < N; k += 1)
-			data[k] = 3;
-		// throw some larger values
-		for (int k = 0; k < N; k += 5)
-			data[k] = 100;
-		for (int k = 0; k < N; k += 533)
-			data[k] = 10000;
-		int[] compressed = new int[N + 1024];// could need more
-		IntegerCODEC codec = new Composition(new FastPFOR(), new VariableByte());
-		// compressing
-		IntWrapper inputoffset = new IntWrapper(0);
-		IntWrapper outputoffset = new IntWrapper(0);
-		codec.compress(data, inputoffset, data.length, compressed, outputoffset);
-		System.out.println("compressed unsorted integers from " + data.length * 4 / 1024 + "KB to "
-				+ outputoffset.intValue() * 4 / 1024 + "KB");
-		// we can repack the data: (optional)
-		compressed = Arrays.copyOf(compressed, outputoffset.intValue());
-
-		int[] recovered = new int[N];
-		IntWrapper recoffset = new IntWrapper(0);
-		codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset);
-		if (Arrays.equals(data, recovered))
-			System.out.println("data is recovered without loss");
-		else
-			throw new RuntimeException("bug"); // could use assert
-		System.out.println();
-
-	}
-
-	/**
-	 * This is like the basic example, but we show how to process larger arrays
-	 * in chunks.
-	 *
-	 * Some of this code was written by Pavel Klinov.
-	 */
-	@Test
-	public void advancedExample() {
-		int TotalSize = 2342351; // some arbitrary number
-		int ChunkSize = 16384; // size of each chunk, choose a multiple of 128
-		System.out.println("Compressing " + TotalSize + " integers using chunks of " + ChunkSize + " integers ("
-				+ ChunkSize * 4 / 1024 + "KB)");
-		System.out.println("(It is often better for applications to work in chunks fitting in CPU cache.)");
-		int[] data = new int[TotalSize];
-		// data should be sorted for best
-		// results
-		for (int k = 0; k < data.length; ++k)
-			data[k] = k;
-		// next we compose a CODEC. Most of the processing
-		// will be done with binary packing, and leftovers will
-		// be processed using variable byte, using variable byte
-		// only for the last chunk!
-		IntegratedIntegerCODEC regularcodec = new IntegratedBinaryPacking();
-		IntegratedVariableByte ivb = new IntegratedVariableByte();
-		IntegratedIntegerCODEC lastcodec = new IntegratedComposition(regularcodec, ivb);
-		// output vector should be large enough...
-		int[] compressed = new int[TotalSize + 1024];
-
-		/**
-		 *
-		 * compressing
-		 *
-		 */
-		IntWrapper inputoffset = new IntWrapper(0);
-		IntWrapper outputoffset = new IntWrapper(0);
-		for (int k = 0; k < TotalSize / ChunkSize; ++k)
-			regularcodec.compress(data, inputoffset, ChunkSize, compressed, outputoffset);
-		lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset);
-		// got it!
-		// inputoffset should be at data.length but outputoffset tells
-		// us where we are...
-		System.out.println(
-				"compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
-		// we can repack the data:
-		compressed = Arrays.copyOf(compressed, outputoffset.intValue());
-
-		/**
-		 *
-		 * now uncompressing
-		 *
-		 * We are *not* assuming that the original array length is known,
-		 * however we assume that the chunk size (ChunkSize) is known.
-		 *
-		 */
-		int[] recovered = new int[ChunkSize];
-		IntWrapper compoff = new IntWrapper(0);
-		IntWrapper recoffset;
-		int currentpos = 0;
-
-		while (compoff.get() < compressed.length) {
-			recoffset = new IntWrapper(0);
-			regularcodec.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset);
-
-			if (recoffset.get() < ChunkSize) {// last chunk detected
-				ivb.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset);
-			}
-			for (int i = 0; i < recoffset.get(); ++i) {
-				if (data[currentpos + i] != recovered[i])
-					throw new RuntimeException("bug"); // could use assert
-			}
-			currentpos += recoffset.get();
-		}
-		System.out.println("data is recovered without loss");
-		System.out.println();
-
-	}
-
-	/**
-	 * Demo of the headless approach where we must supply the array length
-	 */
-	@Test
-	public void headlessDemo() {
-		System.out.println("Compressing arrays with minimal header...");
-		int[] uncompressed1 = { 1, 2, 1, 3, 1 };
-		int[] uncompressed2 = { 3, 2, 4, 6, 1 };
-
-		int[] compressed = new int[uncompressed1.length + uncompressed2.length + 1024];
-
-		SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
-
-		// compressing
-		IntWrapper outPos = new IntWrapper();
-
-		IntWrapper previous = new IntWrapper();
-
-		codec.headlessCompress(uncompressed1, new IntWrapper(), uncompressed1.length, compressed, outPos);
-		int length1 = outPos.get() - previous.get();
-		previous = new IntWrapper(outPos.get());
-		codec.headlessCompress(uncompressed2, new IntWrapper(), uncompressed2.length, compressed, outPos);
-		int length2 = outPos.get() - previous.get();
-
-		compressed = Arrays.copyOf(compressed, length1 + length2);
-		System.out
-				.println("compressed unsorted integers from " + uncompressed1.length * 4 + "B to " + length1 * 4 + "B");
-		System.out
-				.println("compressed unsorted integers from " + uncompressed2.length * 4 + "B to " + length2 * 4 + "B");
-		System.out.println("Total compressed output " + compressed.length);
-
-		int[] recovered1 = new int[uncompressed1.length];
-		int[] recovered2 = new int[uncompressed1.length];
-		IntWrapper inPos = new IntWrapper();
-		System.out.println("Decoding first array starting at pos = " + inPos);
-		codec.headlessUncompress(compressed, inPos, compressed.length, recovered1, new IntWrapper(0),
-				uncompressed1.length);
-		System.out.println("Decoding second array starting at pos = " + inPos);
-		codec.headlessUncompress(compressed, inPos, compressed.length, recovered2, new IntWrapper(0),
-				uncompressed2.length);
-		if (!Arrays.equals(uncompressed1, recovered1))
-			throw new RuntimeException("First array does not match.");
-		if (!Arrays.equals(uncompressed2, recovered2))
-			throw new RuntimeException("Second array does not match.");
-		System.out.println("The arrays match, your code is probably ok.");
-
-	}
+    /**
+     * Round-trips the integers 0..N-1 through IntegratedIntCompressor and fails if they are not recovered.
+     */
+    @Test
+
+    public void superSimpleExample() {
+        IntegratedIntCompressor iic = new IntegratedIntCompressor();
+        int[] data = new int[2342351];
+        for (int k = 0; k < data.length; ++k)
+            data[k] = k;
+        System.out.println("Compressing " + data.length + " integers using friendly interface");
+        int[] compressed = iic.compress(data);
+        int[] recov = iic.uncompress(compressed);
+        System.out
+                .println("compressed from " + data.length * 4 / 1024 + "KB to " + compressed.length * 4 / 1024 + "KB");
+        if (!Arrays.equals(recov, data))
+            throw new RuntimeException("bug");
+    }
+
+    /**
+     * Compresses a sorted array in one call with an IntegratedComposition codec and checks the round trip.
+     */
+    @Test
+
+    public void basicExample() {
+        int[] data = new int[2342351];
+        System.out.println("Compressing " + data.length + " integers in one go");
+        // data should be sorted for best
+        // results
+        for (int k = 0; k < data.length; ++k)
+            data[k] = k;
+        // Very important: the data is in sorted order!!! If not, you
+        // will get very poor compression with IntegratedBinaryPacking,
+        // you should use another CODEC.
+
+        // next we compose a CODEC. Most of the processing
+        // will be done with binary packing, and leftovers will
+        // be processed using variable byte
+        IntegratedIntegerCODEC codec = new IntegratedComposition(new IntegratedBinaryPacking(),
+                new IntegratedVariableByte());
+        // output vector should be large enough...
+        int[] compressed = new int[data.length + 1024];
+        // compressed might not be large enough in some cases
+        // if you get java.lang.ArrayIndexOutOfBoundsException, try
+        // allocating more memory
+
+        /**
+         *
+         * compressing
+         *
+         */
+        IntWrapper inputoffset = new IntWrapper(0);
+        IntWrapper outputoffset = new IntWrapper(0);
+        codec.compress(data, inputoffset, data.length, compressed, outputoffset);
+        // got it!
+        // inputoffset should be at data.length but outputoffset tells
+        // us where we are...
+        System.out.println(
+                "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
+        // we can repack the data: (optional)
+        compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+        /**
+         *
+         * now uncompressing
+         *
+         * This assumes that we otherwise know how many integers have been
+         * compressed. See basicExampleHeadless for a more general case.
+         */
+        int[] recovered = new int[data.length];
+        IntWrapper recoffset = new IntWrapper(0);
+        codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset);
+        if (Arrays.equals(data, recovered))
+            System.out.println("data is recovered without loss");
+        else
+            throw new RuntimeException("bug"); // could use assert
+        System.out.println();
+    }
+
+    /**
+     * Like the basicExample, but the element count is stored by hand in slot 0 of the compressed array.
+     */
+    @Test
+    public void basicExampleHeadless() {
+        int[] data = new int[2342351];
+        System.out.println("Compressing " + data.length + " integers in one go using the headless approach");
+        // data should be sorted for best
+        // results
+        for (int k = 0; k < data.length; ++k)
+            data[k] = k;
+        // Very important: the data is in sorted order!!! If not, you
+        // will get very poor compression with IntegratedBinaryPacking,
+        // you should use another CODEC.
+
+        // next we compose a CODEC. Most of the processing
+        // will be done with binary packing, and leftovers will
+        // be processed using variable byte
+        SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
+                new IntegratedVariableByte());
+        int[] compressed = new int[1 + codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length)]; // +1: slot 0 holds the element count
+
+        /**
+         *
+         * compressing
+         *
+         */
+        IntWrapper inputoffset = new IntWrapper(0);
+        IntWrapper outputoffset = new IntWrapper(1); // payload starts after the count stored in slot 0
+        compressed[0] = data.length; // we manually store how many integers we compress
+        codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0));
+        // got it!
+        // inputoffset should be at data.length but outputoffset tells
+        // us where we are...
+        System.out.println(
+                "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
+        // we can repack the data: (optional)
+        compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+        /**
+         *
+         * now uncompressing
+         *
+         */
+        int howmany = compressed[0];// we manually stored the number of
+                                    // compressed integers
+        int[] recovered = new int[howmany];
+        IntWrapper recoffset = new IntWrapper(0);
+        codec.headlessUncompress(compressed, new IntWrapper(1), compressed.length, recovered, recoffset, howmany, new IntWrapper(0));
+        if (Arrays.equals(data, recovered))
+            System.out.println("data is recovered without loss");
+        else
+            throw new RuntimeException("bug"); // could use assert
+        System.out.println();
+    }
+
+    /**
+     * This is an example to show you can compress unsorted integers as long as
+     * most are small.
+     */
+    @Test
+    public void unsortedExample() {
+        final int N = 1333333;
+        int[] data = new int[N];
+        // initialize the data (most will be small)
+        for (int k = 0; k < N; k += 1)
+            data[k] = 3;
+        // throw some larger values
+        for (int k = 0; k < N; k += 5)
+            data[k] = 100;
+        for (int k = 0; k < N; k += 533)
+            data[k] = 10000;
+        int[] compressed = new int[N + 1024];// other inputs could need a larger buffer
+        IntegerCODEC codec = new Composition(new FastPFOR(), new VariableByte());
+        // compressing
+        IntWrapper inputoffset = new IntWrapper(0);
+        IntWrapper outputoffset = new IntWrapper(0);
+        codec.compress(data, inputoffset, data.length, compressed, outputoffset);
+        System.out.println("compressed unsorted integers from " + data.length * 4 / 1024 + "KB to "
+                + outputoffset.intValue() * 4 / 1024 + "KB");
+        // we can repack the data: (optional)
+        compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+        int[] recovered = new int[N];
+        IntWrapper recoffset = new IntWrapper(0);
+        codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoffset);
+        if (Arrays.equals(data, recovered))
+            System.out.println("data is recovered without loss");
+        else
+            throw new RuntimeException("bug"); // could use assert
+        System.out.println();
+
+    }
+
+    /**
+     * This is like the basic example, but we show how to process larger arrays
+     * in chunks.
+     *
+     * Some of this code was written by Pavel Klinov.
+     */
+    @Test
+    public void advancedExample() {
+        int TotalSize = 2342351; // some arbitrary number
+        int ChunkSize = 16384; // size of each chunk, choose a multiple of 128
+        System.out.println("Compressing " + TotalSize + " integers using chunks of " + ChunkSize + " integers ("
+                + ChunkSize * 4 / 1024 + "KB)");
+        System.out.println("(It is often better for applications to work in chunks fitting in CPU cache.)");
+        int[] data = new int[TotalSize];
+        // data should be sorted for best
+        // results
+        for (int k = 0; k < data.length; ++k)
+            data[k] = k;
+        // next we compose a CODEC. Most of the processing
+        // will be done with binary packing, and leftovers will
+        // be processed using variable byte, which is applied
+        // only to the last chunk!
+        IntegratedIntegerCODEC regularcodec = new IntegratedBinaryPacking();
+        IntegratedVariableByte ivb = new IntegratedVariableByte();
+        IntegratedIntegerCODEC lastcodec = new IntegratedComposition(regularcodec, ivb);
+        // output vector should be large enough...
+        int[] compressed = new int[TotalSize + 1024];
+
+        /**
+         *
+         * compressing
+         *
+         */
+        IntWrapper inputoffset = new IntWrapper(0);
+        IntWrapper outputoffset = new IntWrapper(0);
+        for (int k = 0; k < TotalSize / ChunkSize; ++k)
+            regularcodec.compress(data, inputoffset, ChunkSize, compressed, outputoffset);
+        lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset);
+        // got it!
+        // inputoffset should be at data.length but outputoffset tells
+        // us where we are...
+        System.out.println(
+                "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
+        // we can repack the data:
+        compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+        /**
+         *
+         * now uncompressing
+         *
+         * We are *not* assuming that the original array length is known,
+         * however we assume that the chunk size (ChunkSize) is known.
+         *
+         */
+        int[] recovered = new int[ChunkSize]; // reused for every decoded chunk
+        IntWrapper compoff = new IntWrapper(0);
+        IntWrapper recoffset;
+        int currentpos = 0; // number of integers verified against data so far
+
+        while (compoff.get() < compressed.length) {
+            recoffset = new IntWrapper(0);
+            regularcodec.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset);
+
+            if (recoffset.get() < ChunkSize) {// last chunk detected
+                ivb.uncompress(compressed, compoff, compressed.length - compoff.get(), recovered, recoffset);
+            }
+            for (int i = 0; i < recoffset.get(); ++i) {
+                if (data[currentpos + i] != recovered[i])
+                    throw new RuntimeException("bug"); // could use assert
+            }
+            currentpos += recoffset.get();
+        }
+        System.out.println("data is recovered without loss");
+        System.out.println();
+
+    }
+
+    /**
+     * Demo of the headless approach where we must supply the array length
+     */
+    @Test
+    public void headlessDemo() {
+        System.out.println("Compressing arrays with minimal header...");
+        int[] uncompressed1 = { 1, 2, 1, 3, 1 };
+        int[] uncompressed2 = { 3, 2, 4, 6, 1 };
+
+        SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
+
+        int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed1.length)
+                + codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed2.length);
+        int[] compressed = new int[maxCompressedLength];
+
+        // compressing
+        IntWrapper outPos = new IntWrapper();
+
+        IntWrapper previous = new IntWrapper();
+
+        codec.headlessCompress(uncompressed1, new IntWrapper(), uncompressed1.length, compressed, outPos);
+        int length1 = outPos.get() - previous.get();
+        previous = new IntWrapper(outPos.get()); // remember where the second stream starts
+        codec.headlessCompress(uncompressed2, new IntWrapper(), uncompressed2.length, compressed, outPos);
+        int length2 = outPos.get() - previous.get();
+
+        compressed = Arrays.copyOf(compressed, length1 + length2);
+        System.out
+                .println("compressed unsorted integers from " + uncompressed1.length * 4 + "B to " + length1 * 4 + "B");
+        System.out
+                .println("compressed unsorted integers from " + uncompressed2.length * 4 + "B to " + length2 * 4 + "B");
+        System.out.println("Total compressed output " + compressed.length);
+
+        int[] recovered1 = new int[uncompressed1.length];
+        int[] recovered2 = new int[uncompressed2.length]; // sized from the second input, not the first
+        IntWrapper inPos = new IntWrapper();
+        System.out.println("Decoding first array starting at pos = " + inPos);
+        codec.headlessUncompress(compressed, inPos, compressed.length, recovered1, new IntWrapper(0),
+                uncompressed1.length);
+        System.out.println("Decoding second array starting at pos = " + inPos);
+        codec.headlessUncompress(compressed, inPos, compressed.length, recovered2, new IntWrapper(0),
+                uncompressed2.length);
+        if (!Arrays.equals(uncompressed1, recovered1))
+            throw new RuntimeException("First array does not match.");
+        if (!Arrays.equals(uncompressed2, recovered2))
+            throw new RuntimeException("Second array does not match.");
+        System.out.println("The arrays match, your code is probably ok.");
+
+    }
 }
diff --git a/src/test/java/me/lemire/integercompression/IntCompressorTest.java b/src/test/java/me/lemire/integercompression/IntCompressorTest.java
index 34b8946..79e51fc 100644
--- a/src/test/java/me/lemire/integercompression/IntCompressorTest.java
+++ b/src/test/java/me/lemire/integercompression/IntCompressorTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
diff --git a/src/test/java/me/lemire/integercompression/ResourcedTest.java b/src/test/java/me/lemire/integercompression/ResourcedTest.java
index 61b8e58..8316129 100644
--- a/src/test/java/me/lemire/integercompression/ResourcedTest.java
+++ b/src/test/java/me/lemire/integercompression/ResourcedTest.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.ArrayList;
@@ -17,65 +24,65 @@
  *
  */
 public class ResourcedTest {
-	SkippableIntegerCODEC[] codecs = { new JustCopy(), new VariableByte(),
-			new SkippableComposition(new BinaryPacking(), new VariableByte()),
-			new SkippableComposition(new NewPFD(), new VariableByte()),
-			new SkippableComposition(new NewPFDS9(), new VariableByte()),
-			new SkippableComposition(new NewPFDS16(), new VariableByte()),
-			new SkippableComposition(new OptPFD(), new VariableByte()),
-			new SkippableComposition(new OptPFDS9(), new VariableByte()),
-			new SkippableComposition(new OptPFDS16(), new VariableByte()),
-			new SkippableComposition(new FastPFOR128(), new VariableByte()),
-			new SkippableComposition(new FastPFOR(), new VariableByte()), new Simple9(), new Simple16() };
+    SkippableIntegerCODEC[] codecs = { new JustCopy(), new VariableByte(),
+            new SkippableComposition(new BinaryPacking(), new VariableByte()),
+            new SkippableComposition(new NewPFD(), new VariableByte()),
+            new SkippableComposition(new NewPFDS9(), new VariableByte()),
+            new SkippableComposition(new NewPFDS16(), new VariableByte()),
+            new SkippableComposition(new OptPFD(), new VariableByte()),
+            new SkippableComposition(new OptPFDS9(), new VariableByte()),
+            new SkippableComposition(new OptPFDS16(), new VariableByte()),
+            new SkippableComposition(new FastPFOR128(), new VariableByte()),
+            new SkippableComposition(new FastPFOR(), new VariableByte()), new Simple9(), new Simple16() };
 
-	/**
-	 * @throws IOException
-	 *             if the resource cannot be accessed (should be considered a
-	 *             bug)
-	 * 
-	 */
-	@Test
-	public void IntCompressorTest() throws IOException {
-		// next line requires Java8?
-		// int[] data =
-		// Files.lines(Paths.get("integers.txt")).mapToInt(Integer::parseInt).toArray();
-		File f = new File("src/test/resources/integers.txt");
-		System.out.println("loading test data from "+ f.getAbsolutePath());
-		BufferedReader bfr = new BufferedReader(new FileReader(f));
-		String line;
-		ArrayList<Integer> ai = new ArrayList<Integer>();
-		while ((line = bfr.readLine()) != null) {
-			ai.add(Integer.parseInt(line));
-		}
-		bfr.close();
-		int[] data = new int[ai.size()];
-		for (int k = 0; k < data.length; ++k)
-			data[k] = ai.get(k).intValue();
-		ai = null;
-		// finally!
-		{
-			IntegratedIntCompressor iic = new IntegratedIntCompressor();
-			int[] compressed = iic.compress(data);
-			int[] recovered = iic.uncompress(compressed);
-			Assert.assertArrayEquals(recovered, data);
-		}
-		for (SkippableIntegerCODEC C : codecs) {
-			IntCompressor iic = new IntCompressor(C);
-			int[] compressed = iic.compress(data);
-			int[] recovered = iic.uncompress(compressed);
-			Assert.assertArrayEquals(recovered, data);
+    /**
+     * Round-trips the integers from src/test/resources/integers.txt (one per
+     * line) through IntegratedIntCompressor and an IntCompressor per codec.
+     *
+     * @throws IOException if the resource cannot be accessed (should be considered a bug)
+     */
+    @Test
+    public void IntCompressorTest() throws IOException {
+        // next line requires Java8?
+        // int[] data =
+        // Files.lines(Paths.get("integers.txt")).mapToInt(Integer::parseInt).toArray();
+        File f = new File("src/test/resources/integers.txt");
+        System.out.println("loading test data from "+ f.getAbsolutePath());
+        BufferedReader bfr = new BufferedReader(new FileReader(f));
+        String line;
+        ArrayList<Integer> ai = new ArrayList<Integer>();
+        while ((line = bfr.readLine()) != null) {
+            ai.add(Integer.parseInt(line));
+        }
+        bfr.close();
+        int[] data = new int[ai.size()];
+        for (int k = 0; k < data.length; ++k)
+            data[k] = ai.get(k).intValue();
+        ai = null;
+        // finally!
+        {
+            IntegratedIntCompressor iic = new IntegratedIntCompressor();
+            int[] compressed = iic.compress(data);
+            int[] recovered = iic.uncompress(compressed);
+            Assert.assertArrayEquals(recovered, data);
+        }
+        for (SkippableIntegerCODEC C : codecs) {
+            IntCompressor iic = new IntCompressor(C);
+            int[] compressed = iic.compress(data);
+            int[] recovered = iic.uncompress(compressed);
+            Assert.assertArrayEquals(recovered, data);
 
-		}
-		for (SkippableIntegerCODEC C : codecs) {
-			if (C instanceof SkippableIntegratedIntegerCODEC) {
-				IntegratedIntCompressor iic = new IntegratedIntCompressor((SkippableIntegratedIntegerCODEC) C);
-				int[] compressed = iic.compress(data);
-				int[] recovered = iic.uncompress(compressed);
-				Assert.assertArrayEquals(recovered, data);
-			}
+        }
+        for (SkippableIntegerCODEC C : codecs) {
+            if (C instanceof SkippableIntegratedIntegerCODEC) {
+                IntegratedIntCompressor iic = new IntegratedIntCompressor((SkippableIntegratedIntegerCODEC) C);
+                int[] compressed = iic.compress(data);
+                int[] recovered = iic.uncompress(compressed);
+                Assert.assertArrayEquals(recovered, data);
+            }
 
-		}
+        }
 
-	}
+    }
 
 }
diff --git a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java
index d965992..881dada 100644
--- a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java
+++ b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java
@@ -1,9 +1,22 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
 
+import me.lemire.integercompression.differential.IntegratedBinaryPacking;
+import me.lemire.integercompression.differential.IntegratedVariableByte;
+import me.lemire.integercompression.differential.SkippableIntegratedComposition;
+import me.lemire.integercompression.differential.SkippableIntegratedIntegerCODEC;
 import org.junit.Test;
 
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertTrue;
 
 /**
  * Just some basic sanity tests.
@@ -12,7 +25,7 @@
  */
 @SuppressWarnings({ "static-method" })
 public class SkippableBasicTest {
-    SkippableIntegerCODEC[] codecs = {
+    final SkippableIntegerCODEC[] codecs = {
             new JustCopy(),
             new VariableByte(),
             new SkippableComposition(new BinaryPacking(), new VariableByte()),
@@ -41,10 +54,11 @@ public void consistentTest() {
         for (SkippableIntegerCODEC c : codecs) {
             System.out.println("[SkippeableBasicTest.consistentTest] codec = "
                     + c);
-            int[] outBuf = new int[N + 1024];
             for (int n = 0; n <= N; ++n) {
                 IntWrapper inPos = new IntWrapper();
                 IntWrapper outPos = new IntWrapper();
+                int[] outBuf = new int[c.maxHeadlessCompressedLength(new IntWrapper(0), n)];
+
                 c.headlessCompress(data, inPos, n, outBuf, outPos);
 
                 IntWrapper inPoso = new IntWrapper();
@@ -140,5 +154,135 @@ public void varyingLengthTest2() {
         }
     }
 
+    @Test
+    public void testMaxHeadlessCompressedLength() {
+        testMaxHeadlessCompressedLength(new IntegratedBinaryPacking(), 16 * IntegratedBinaryPacking.BLOCK_SIZE);
+        testMaxHeadlessCompressedLength(new IntegratedVariableByte(), 128);
+        testMaxHeadlessCompressedLength(new SkippableIntegratedComposition(new IntegratedBinaryPacking(), new IntegratedVariableByte()), 16 * IntegratedBinaryPacking.BLOCK_SIZE + 10);
+        // Non-integrated codecs: the last argument is the widest bit width fed to the codec (28 for the Simple* family).
+        testMaxHeadlessCompressedLength(new BinaryPacking(), 16 * BinaryPacking.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new VariableByte(), 128, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new BinaryPacking(), new VariableByte()), 16 * BinaryPacking.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new JustCopy(), 128, 32);
+        testMaxHeadlessCompressedLength(new Simple9(), 128, 28);
+        testMaxHeadlessCompressedLength(new Simple16(), 128, 28);
+        testMaxHeadlessCompressedLength(new GroupSimple9(), 128, 28);
+        testMaxHeadlessCompressedLength(new OptPFD(), 4 * OptPFD.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new OptPFD(), new VariableByte()), 4 * OptPFD.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new OptPFDS9(), 4 * OptPFDS9.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new OptPFDS9(), new VariableByte()), 4 * OptPFDS9.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new OptPFDS16(), 4 * OptPFDS16.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new OptPFDS16(), new VariableByte()), 4 * OptPFDS16.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new NewPFD(), 4 * NewPFD.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new NewPFD(), new VariableByte()), 4 * NewPFD.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new NewPFDS9(), 4 * NewPFDS9.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new NewPFDS9(), new VariableByte()), 4 * NewPFDS9.BLOCK_SIZE + 10, 32);
+        testMaxHeadlessCompressedLength(new NewPFDS16(), 4 * NewPFDS16.BLOCK_SIZE, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new NewPFDS16(), new VariableByte()), 4 * NewPFDS16.BLOCK_SIZE + 10, 32);
+
+        int fastPfor128PageSize = FastPFOR128.BLOCK_SIZE * 4; // smaller page size than the default to speed up the test
+        testMaxHeadlessCompressedLength(new FastPFOR128(fastPfor128PageSize), 2 * fastPfor128PageSize, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new FastPFOR128(fastPfor128PageSize), new VariableByte()), 2 * fastPfor128PageSize + 10, 32);
+        int fastPforPageSize = FastPFOR.BLOCK_SIZE * 4; // smaller page size than the default to speed up the test
+        testMaxHeadlessCompressedLength(new FastPFOR(fastPforPageSize), 2 * fastPforPageSize, 32);
+        testMaxHeadlessCompressedLength(new SkippableComposition(new FastPFOR(fastPforPageSize), new VariableByte()), 2 * fastPforPageSize + 10, 32);
+    }
+
+    private static void testMaxHeadlessCompressedLength(SkippableIntegratedIntegerCODEC codec, int inlengthTo) {
+        // We test the worst-case scenario by making all deltas and the initial value negative.
+        int delta = -1;
+        int value = delta;
+        // Note: value is declared outside the loop, so the sequence keeps decreasing from one input length to the next.
+        for (int inlength = 0; inlength < inlengthTo; ++inlength) {
+            int[] input = new int[inlength];
+            for (int i = 0; i < inlength; i++) {
+                input[i] = value;
+                value += delta;
+            }
+            // The output buffer is sized from the codec's own upper-bound estimate.
+            int maxOutputLength = codec.maxHeadlessCompressedLength(new IntWrapper(), inlength);
+            int[] output = new int[maxOutputLength];
+            IntWrapper outPos = new IntWrapper();
+
+            codec.headlessCompress(input, new IntWrapper(), inlength, output, outPos, new IntWrapper());
+            // If we reach this point, no exception was thrown, which means the calculated output length was sufficient.
+            // Tightness check: the estimate may exceed the number of ints actually written by at most one.
+            assertTrue(maxOutputLength <= outPos.get() + 1); // +1 because SkippableIntegratedComposition always adds one extra integer for the potential header
+        }
+    }
+
+    private static void testMaxHeadlessCompressedLength(SkippableIntegerCODEC codec, int inlengthTo, int maxBitWidth) {
+        // Two "regular" widths are exercised: maxBitWidth - 1 and 20, the latter because some schemes
+        // ignore bit widths between 21 and 31.
+        assertTrue(maxBitWidth >= 20);
+        final int wideValue = maxBitWidth == 32 ? -1 : (1 << maxBitWidth) - 1;
+        final int[] narrowWidths = { 20, maxBitWidth - 1 };
+
+        for (int length = 0; length < inlengthTo; ++length) {
+            final int[] values = new int[length];
+            // The buffer is sized by the codec's own estimate and reused for every fill pattern below.
+            final int[] compressed = new int[codec.maxHeadlessCompressedLength(new IntWrapper(), length)];
+
+            // A prefix of maximum-width values, with the remainder filled with the narrower value.
+            for (int wideCount = 0; wideCount < length; wideCount++) {
+                for (int width : narrowWidths) {
+                    final int narrowValue = width == 32 ? -1 : (1 << width) - 1;
+
+                    Arrays.fill(values, 0, wideCount, wideValue);
+                    Arrays.fill(values, wideCount, values.length, narrowValue);
+
+                    // Returning normally means the estimated output length was large enough.
+                    codec.headlessCompress(values, new IntWrapper(), length, compressed, new IntWrapper());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testUncompressOutputOffset_SkippableComposition() {
+        // Round-trips a tiny array and checks that decompression honours a non-zero output position.
+        for (int shift : new int[] {0, 1, 6}) {
+            SkippableComposition composition = new SkippableComposition(new BinaryPacking(), new VariableByte());
+            int[] source = { 2, 3, 4, 5 };
+            int[] packed = new int[composition.maxHeadlessCompressedLength(new IntWrapper(0), source.length)];
+            IntWrapper packedPos = new IntWrapper(0);
+
+            composition.headlessCompress(source, new IntWrapper(0), source.length, packed, packedPos);
+            int packedLength = packedPos.get();
+
+            // Decode into a buffer with 'shift' leading slots, starting the write position at 'shift'.
+            int[] restored = new int[shift + source.length];
+            IntWrapper restoredPos = new IntWrapper(shift);
+            composition.headlessUncompress(packed, new IntWrapper(0), packedLength, restored, restoredPos, source.length);
+
+            // Only the slice that starts at 'shift' has to equal the input.
+            int[] slice = Arrays.copyOfRange(restored, shift, shift + source.length);
+            assertArrayEquals(source, slice);
+        }
+    }
+
+    @Test
+    public void testUncompressOutputOffset_SkippableIntegratedComposition() {
+        for (int offset : new int[] {0, 1, 6}) {
+            SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(), new IntegratedVariableByte());
+            // The compressed buffer is sized by the codec's own estimate; the output buffer has 'offset' leading slots.
+            int[] input = { 2, 3, 4, 5 };
+            int[] compressed = new int[codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length)];
+            int[] uncompressed = new int[offset + input.length];
 
+            IntWrapper inputOffset = new IntWrapper(0);
+            IntWrapper compressedOffset = new IntWrapper(0);
+            IntWrapper initValue = new IntWrapper(0);
+
+            codec.headlessCompress(input, inputOffset, input.length, compressed, compressedOffset, initValue);
+            // Decode with fresh position and initValue wrappers, writing from index 'offset' instead of 0.
+            int compressedLength = compressedOffset.get();
+            IntWrapper uncompressedOffset = new IntWrapper(offset);
+            compressedOffset = new IntWrapper(0);
+            initValue = new IntWrapper(0);
+            codec.headlessUncompress(compressed, compressedOffset, compressedLength, uncompressed, uncompressedOffset, input.length, initValue);
+            // Only the slice that starts at 'offset' is compared with the input.
+            assertArrayEquals(input, Arrays.copyOfRange(uncompressed, offset, offset + input.length));
+        }
+    }
 }
diff --git a/src/test/java/me/lemire/integercompression/TestUtils.java b/src/test/java/me/lemire/integercompression/TestUtils.java
index a0820ab..b3cbff3 100644
--- a/src/test/java/me/lemire/integercompression/TestUtils.java
+++ b/src/test/java/me/lemire/integercompression/TestUtils.java
@@ -1,3 +1,10 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
@@ -123,7 +130,7 @@ public static void assertSymmetry(IntegerCODEC codec, int... orig) {
         assertArrayEquals(orig, target);
     }
 
-    protected static int[] compress(IntegerCODEC codec, int[] data) {
+    public static int[] compress(IntegerCODEC codec, int[] data) {
         int[] outBuf = new int[data.length * 4];
         IntWrapper inPos = new IntWrapper();
         IntWrapper outPos = new IntWrapper();
@@ -158,7 +165,7 @@ protected static int[] uncompress(ByteIntegerCODEC codec, byte[] data, int len)
     }
 
     protected static int[] compressHeadless(SkippableIntegerCODEC codec, int[] data) {
-        int[] outBuf = new int[data.length * 4];
+        int[] outBuf = new int[codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length)];
         IntWrapper inPos = new IntWrapper();
         IntWrapper outPos = new IntWrapper();
         codec.headlessCompress(data, inPos, data.length, outBuf, outPos);
diff --git a/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java b/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java
index 3201b02..650eb4b 100644
--- a/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java
+++ b/src/test/java/me/lemire/integercompression/XorBinaryPackingTest.java
@@ -1,7 +1,10 @@
 /**
  * This code is released under the
  * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
  */
+
 package me.lemire.integercompression;
 
 import java.util.Arrays;
diff --git a/src/test/java/me/lemire/longcompression/ATestLongCODEC.java b/src/test/java/me/lemire/longcompression/ATestLongCODEC.java
new file mode 100644
index 0000000..c61ea69
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/ATestLongCODEC.java
@@ -0,0 +1,96 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import java.util.stream.LongStream;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Edge-cases to be tested on a per-codec basis
+ * 
+ * @author Benoit Lacelle
+ */
+public abstract class ATestLongCODEC {
+	// Round-trips the array through each API the codec implements; every result must equal the input.
+	protected void checkConsistency(LongCODEC codec, long[] array) {
+		final int length = array.length;
+
+		// long[] -> long[] API, available on every LongCODEC.
+		long[] packedLongs = LongTestUtils.compress(codec, array);
+		Assert.assertArrayEquals(array, LongTestUtils.uncompress(codec, packedLongs, length));
+
+		// long[] -> byte[] API, only when the codec implements ByteLongCODEC.
+		if (codec instanceof ByteLongCODEC) {
+			ByteLongCODEC byteCodec = (ByteLongCODEC) codec;
+			byte[] packedBytes = LongTestUtils.compress(byteCodec, array);
+			Assert.assertArrayEquals(array, LongTestUtils.uncompress(byteCodec, packedBytes, length));
+		}
+
+		// Headless API, only when the codec implements SkippableLongCODEC.
+		if (codec instanceof SkippableLongCODEC) {
+			SkippableLongCODEC headlessCodec = (SkippableLongCODEC) codec;
+			long[] packedHeadless = LongTestUtils.compressHeadless(headlessCodec, array);
+			long[] restored = LongTestUtils.uncompressHeadless(headlessCodec, packedHeadless, length);
+			Assert.assertArrayEquals(array, restored);
+		}
+	}
+
+	public abstract LongCODEC getCodec();
+
+	@Test
+	public void testCodec_Minus1() {
+		checkConsistency(getCodec(), new long[] { -1L });
+	}
+
+	@Test
+	public void testCodec_ZeroTimes8Minus1() {
+		checkConsistency(getCodec(), new long[] { 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, -1L });
+	}
+
+	@Test
+	public void testCodec_ZeroTimes127Minus1() {
+		// 127 zeros followed by a single -1
+		long[] zerosThenMinusOne = LongStream.concat(LongStream.range(0, 127).map(ignored -> 0L), LongStream.of(-1L)).toArray();
+		checkConsistency(getCodec(), zerosThenMinusOne);
+	}
+
+	@Test
+	public void testCodec_ZeroTimes128Minus1() {
+		// 128 zeros followed by a single -1
+		long[] zerosThenMinusOne = LongStream.concat(LongStream.range(0, 128).map(ignored -> 0L), LongStream.of(-1L)).toArray();
+		checkConsistency(getCodec(), zerosThenMinusOne);
+	}
+
+	@Test
+	public void testCodec_MinValue() {
+		checkConsistency(getCodec(), new long[] { Long.MIN_VALUE });
+	}
+
+	@Test
+	public void testCodec_ZeroMinValue() {
+		checkConsistency(getCodec(), new long[] { 0L, Long.MIN_VALUE });
+	}
+
+	@Test
+	public void testCodec_allPowerOfTwo() {
+		checkConsistency(getCodec(), new long[] { 1L << 42 });
+		for (int shift = 0; shift < Long.SIZE; shift++) {
+			checkConsistency(getCodec(), new long[] { 1L << shift });
+		}
+	}
+
+	@Test
+	public void testCodec_ZeroThenAllPowerOfTwo() {
+		for (int shift = 0; shift < Long.SIZE; shift++) {
+			checkConsistency(getCodec(), new long[] { 0L, 1L << shift });
+		}
+	}
+
+}
diff --git a/src/test/java/me/lemire/longcompression/LongBasicTest.java b/src/test/java/me/lemire/longcompression/LongBasicTest.java
new file mode 100644
index 0000000..8dc0c9b
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/LongBasicTest.java
@@ -0,0 +1,391 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import me.lemire.integercompression.FastPFOR;
+import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.longcompression.differential.LongDelta;
+import me.lemire.longcompression.synth.LongClusteredDataGenerator;
+
+/**
+ * Just some basic sanity tests.
+ * 
+ * @author Benoit Lacelle
+ */
+@SuppressWarnings({ "static-method" })
+public class LongBasicTest {
+    final LongCODEC[] codecs = { // the codecs that each @Test method of this class iterates over
+            new LongJustCopy(),
+            new LongVariableByte(),
+            new LongAs2IntsCodec(),
+            new LongComposition(new LongBinaryPacking(), new LongVariableByte()),
+            };
+
+    /**
+     * Compresses a small array at every output offset from 0 to 49 and decodes it back from that offset.
+     */
+    @Test
+    public void saulTest() {
+        for (LongCODEC codec : codecs) {
+            for (int start = 0; start < 50; ++start) {
+                final long[] original = { 2, 3, 4, 5 };
+                final long[] packed = new long[90];
+                final long[] decoded = new long[original.length];
+
+                // Write the compressed form beginning at 'start'; the advance of the cursor is its length.
+                IntWrapper packedPos = new IntWrapper(start);
+                codec.compress(original, new IntWrapper(0), original.length, packed, packedPos);
+                final int packedLength = packedPos.get() - start;
+
+                // Rewind the cursor and decode from the same offset.
+                packedPos.set(start);
+                codec.uncompress(packed, packedPos, packedLength, decoded, new IntWrapper(0));
+                if (!Arrays.equals(original, decoded)) {
+                    // Name the offending codec on stdout before the assertion below fails.
+                    System.out.println("Problem with "+codec);
+                }
+                assertArrayEquals(original, decoded);
+            }
+        }
+    }
+    /**
+     * Round-trips the sequence 0, 1, 2, ... at lengths 1 to 128 and at the powers of two from 128 to 4096.
+     */
+    @Test
+    public void varyingLengthTest() {
+        final int N = 4096;
+        final long[] data = new long[N];
+        for (int i = 0; i < N; ++i) {
+            data[i] = i;
+        }
+        for (LongCODEC c : codecs) {
+            System.out.println("[BasicTest.varyingLengthTest] codec = " + c);
+            for (int L = 1; L <= 128; L++) {
+                long[] answer = LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                for (int k = 0; k < L; ++k) {
+                    if (answer[k] == data[k]) {
+                        continue;
+                    }
+                    // The round trip is repeated before failing; presumably a debugging aid.
+                    LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                    throw new RuntimeException("bug");
+                }
+            }
+            for (int L = 128; L <= N; L *= 2) {
+                long[] answer = LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                for (int k = 0; k < L; ++k) {
+                    if (answer[k] == data[k]) {
+                        continue;
+                    }
+                    // The round trip is repeated before failing; presumably a debugging aid.
+                    LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                    System.out.println(Arrays.toString(Arrays.copyOf(answer, L)));
+                    System.out.println(Arrays.toString(Arrays.copyOf(data, L)));
+                    throw new RuntimeException("bug");
+                }
+            }
+        }
+    }
+
+    /**
+     * Round-trips every prefix (lengths 1 to 128) of an array holding 127 zeros followed by -1.
+     */
+    @Test
+    public void varyingLengthTest2() {
+        final int N = 128;
+        final long[] data = new long[N];
+        data[127] = -1;
+
+        // CODECs limited to "small" integers (Simple9, Simple16, GroupSimple9) are skipped.
+        final String[] skippedClassNames = {
+                "me.lemire.integercompression.Simple9",
+                "me.lemire.integercompression.Simple16",
+                "me.lemire.integercompression.GroupSimple9",
+        };
+        for (LongCODEC c : codecs) {
+            System.out.println("[BasicTest.varyingLengthTest2] codec = " + c);
+            boolean skipped = false;
+            for (String className : skippedClassNames) {
+                try {
+                    if (c.getClass().equals(Class.forName(className))) {
+                        skipped = true;
+                        break;
+                    }
+                } catch (ClassNotFoundException e) {
+                    e.printStackTrace();
+                }
+            }
+            if (skipped) {
+                continue;
+            }
+
+            for (int L = 1; L <= 128; L++) {
+                long[] answer = LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                for (int k = 0; k < L; ++k) {
+                    if (answer[k] != data[k]) {
+                        throw new RuntimeException("bug");
+                    }
+                }
+            }
+
+            // With N == 128 this loop body runs exactly once, for L == 128.
+            for (int L = 128; L <= N; L *= 2) {
+                long[] answer = LongTestUtils.uncompress(c, LongTestUtils.compress(c, Arrays.copyOf(data, L)), L);
+                for (int k = 0; k < L; ++k) {
+                    if (answer[k] != data[k]) {
+                        throw new RuntimeException("bug");
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Runs the empty-input, clustered-data and unsorted-data checks against every codec.
+     */
+    @Test
+    public void checkVariousCases() {
+        for (LongCODEC c : codecs) {
+            testZeroInZeroOut(c);
+            test(c, c, 5, 10);
+            test(c, c, 5, 14);
+            test(c, c, 2, 18);
+            // TODO It is unclear which codecs should cope with an empty output array.
+            // Some integer codecs do not output anything if the input is smaller than some block size.
+            // testSpurious(c);
+            testUnsorted(c);
+            testUnsorted2(c);
+            testUnsorted3(c);
+        }
+    }
+
+    /**
+     * Checks that every codec can be inverted on generated clustered data.
+     */
+    @Test
+    public void basictest() {
+        final int[][] cases = { { 5, 10 }, { 5, 14 }, { 2, 18 } }; // { N, nbr } for each call
+        for (LongCODEC codec : codecs) {
+            for (int[] sizes : cases)
+                test(codec, sizes[0], sizes[1]);
+        }
+    }
+
+    private static void testSpurious(LongCODEC c) {
+        // Compressing fewer than 32 values into an empty array is expected to write nothing.
+        final long[] input = new long[1024];
+        final long[] noRoom = new long[0];
+        final IntWrapper readPos = new IntWrapper(0), writePos = new IntWrapper(0);
+        for (int inlength = 0; inlength < 32; ++inlength) {
+            c.compress(input, readPos, inlength, noRoom, writePos);
+            assertEquals(0, writePos.intValue());
+        }
+    }
+
+    private static void testZeroInZeroOut(LongCODEC c) {
+        // Compressing zero values must not advance the output position.
+        final long[] noInput = new long[0];
+        final long[] noCompressed = new long[0];
+        final IntWrapper readPos = new IntWrapper(0), compressedPos = new IntWrapper(0);
+        c.compress(noInput, readPos, 0, noCompressed, compressedPos);
+        assertEquals(0, compressedPos.intValue());
+        // Decompressing zero values must not advance the output position either.
+        final long[] noOutput = new long[0];
+        final IntWrapper writePos = new IntWrapper(0);
+        c.uncompress(noCompressed, compressedPos, 0, noOutput, writePos);
+        assertEquals(0, writePos.intValue());
+    }
+
+    private static void test(LongCODEC c, LongCODEC co, int N, int nbr) {
+        // For each sparsity step, generates N clustered arrays and hands them to testCodec with c and co.
+        final LongClusteredDataGenerator generator = new LongClusteredDataGenerator();
+        for (int sparsity = 1; sparsity < 31 - nbr; sparsity += 4) {
+            final long[][] arrays = new long[N][];
+            final int max = 1 << (nbr + sparsity);
+            for (int i = 0; i < N; ++i)
+                arrays[i] = generator.generateClustered(1 << nbr, max);
+            testCodec(c, co, arrays, max);
+        }
+    }
+
+    private static void test(LongCODEC codec, int N, int nbr) {
+        final LongClusteredDataGenerator generator = new LongClusteredDataGenerator();
+        System.out.println("[BasicTest.test] N = " + N + " " + nbr);
+        // The upper bound is a 64-bit value here, so nbr + sparsity may grow up to 62.
+        for (int sparsity = 1; sparsity < 63 - nbr; sparsity += 4) {
+            final long[][] arrays = new long[N][];
+            final long max = 1L << (nbr + sparsity);
+            for (int i = 0; i < N; ++i)
+                arrays[i] = generator.generateClustered(1 << nbr, max);
+            // The same codec is passed for both the compressing and the uncompressing role.
+            testCodec(codec, codec, arrays, max);
+        }
+    }
+
+    private static void testCodec(LongCODEC c, LongCODEC co, long[][] data, long max) {
+        // Round-trips each array: compress with c from index 1, uncompress with co, compare with the input.
+        int N = data.length;
+        int maxlength = 0;
+        for (int k = 0; k < N; ++k) {
+            if (data[k].length > maxlength)
+                maxlength = data[k].length;
+        }
+        long[] buffer = new long[maxlength + 1024];
+        long[] dataout = new long[4 * maxlength + 1024];
+        // 4x + 1024 to account for the possibility of some negative
+        // compression.
+        IntWrapper inpos = new IntWrapper();
+        IntWrapper outpos = new IntWrapper();
+        for (int k = 0; k < N; ++k) {
+            long[] backupdata = Arrays.copyOf(data[k], data[k].length);
+            // Element 0 is not compressed; it is copied into buffer[0] before decoding.
+            inpos.set(1);
+            outpos.set(0);
+            if (!(c instanceof IntegratedLongCODEC)) {
+                LongDelta.delta(backupdata);
+            }
+            c.compress(backupdata, inpos, backupdata.length - inpos.get(),
+                    dataout, outpos);
+            final int thiscompsize = outpos.get() + 1;
+            inpos.set(0);
+            outpos.set(1);
+            buffer[0] = backupdata[0];
+            co.uncompress(dataout, inpos, thiscompsize - 1, buffer, outpos);
+            if (!(c instanceof IntegratedLongCODEC))
+                LongDelta.fastinverseDelta(buffer);
+
+            // Expected value first, so that a failure reports expected and actual the right way round.
+            assertEquals("length is not match", data[k].length, outpos.get());
+            long[] bufferCutout = Arrays.copyOf(buffer, outpos.get());
+            assertArrayEquals("failed to reconstruct original data", data[k],
+                    bufferCutout);
+        }
+    }
+
+    /**
+     * Round-trips mostly-small values mixed with some larger ones and one negative value.
+     *
+     * @param codec
+     *            provided codec
+     */
+    public void testUnsorted(LongCODEC codec) {
+        final int[] lengths = { 133, 1026, 1333333 };
+        for (int N : lengths) {
+            final long[] data = new long[N];
+            // most values are small ...
+            Arrays.fill(data, 3);
+            // ... with some larger values thrown in
+            for (int k = 0; k < N; k += 5)
+                data[k] = 100;
+            for (int k = 0; k < N; k += 533)
+                data[k] = 10000;
+            data[5] = -311;
+
+            // headroom in case the compressed form is larger than the input
+            final long[] scratch = new long[(int) Math.ceil(N * 1.01) + 1024];
+            final IntWrapper readPos = new IntWrapper(0);
+            final IntWrapper written = new IntWrapper(0);
+            codec.compress(data, readPos, data.length, scratch, written);
+            // we can repack the data: (optional)
+            final long[] compressed = Arrays.copyOf(scratch, written.intValue());
+
+            final long[] recovered = new long[N];
+            final IntWrapper recoveredPos = new IntWrapper(0);
+            codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoveredPos);
+            assertArrayEquals(data, recovered);
+        }
+    }
+
+    private void testUnsorted2(LongCODEC codec) {
+        // 128 zeros except for a single -1 at index 5.
+        final long[] data = new long[128];
+        data[5] = -1;
+
+        final long[] scratch = new long[1024];
+        final IntWrapper written = new IntWrapper(0);
+        codec.compress(data, new IntWrapper(0), data.length, scratch, written);
+        // Trim the scratch buffer down to the part that was written.
+        final long[] compressed = Arrays.copyOf(scratch, written.intValue());
+
+        final long[] recovered = new long[128];
+        final IntWrapper recoveredPos = new IntWrapper(0);
+        codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoveredPos);
+        assertArrayEquals(data, recovered);
+    }
+
+    private void testUnsorted3(LongCODEC codec) {
+        // 128 zeros except for a single -1 in the last position.
+        final long[] data = new long[128];
+        data[127] = -1;
+
+        final long[] scratch = new long[1024];
+        final IntWrapper written = new IntWrapper(0);
+        codec.compress(data, new IntWrapper(0), data.length, scratch, written);
+        // Trim the scratch buffer down to the part that was written.
+        final long[] compressed = Arrays.copyOf(scratch, written.intValue());
+
+        final long[] recovered = new long[128];
+        final IntWrapper recoveredPos = new IntWrapper(0);
+        codec.uncompress(compressed, new IntWrapper(0), compressed.length, recovered, recoveredPos);
+        assertArrayEquals(data, recovered);
+    }
+
+    /**
+     * Round-trips an array of FastPFOR.BLOCK_SIZE zeros with a single -1 at index 126.
+     */
+    @Test
+    public void fastPforTest() {
+        // proposed by Stefan Ackermann (https://github.com/Stivo)
+        for (LongCODEC codec : codecs) {
+            final int N = FastPFOR.BLOCK_SIZE;
+            // Java zero-initialises the array, so only the outlier needs to be set.
+            final long[] data = new long[N];
+            data[126] = -1;
+            long[] comp = LongTestUtils.compress(codec, Arrays.copyOf(data, N));
+            long[] answer = LongTestUtils.uncompress(codec, comp, N);
+            for (int k = 0; k < N; ++k) {
+                if (answer[k] == data[k]) {
+                    continue;
+                }
+                // The round trip is repeated before failing; presumably a debugging aid.
+                LongTestUtils.uncompress(codec, LongTestUtils.compress(codec, Arrays.copyOf(data, N)), N);
+                throw new RuntimeException("bug " + k + " " + answer[k] + " != " + data[k]);
+            }
+        }
+    }
+
+    /**
+     * Round-trips an array of FastPFOR128.BLOCK_SIZE zeros with a single -1 at index 126.
+     */
+    @Test
+    public void fastPfor128Test() {
+        // proposed by Stefan Ackermann (https://github.com/Stivo)
+        for (LongCODEC codec : codecs) {
+            final int N = FastPFOR128.BLOCK_SIZE;
+            // Java zero-initialises the array, so only the outlier needs to be set.
+            final long[] data = new long[N];
+            data[126] = -1;
+
+            long[] answer = LongTestUtils.uncompress(codec, LongTestUtils.compress(codec, Arrays.copyOf(data, N)), N);
+            for (int k = 0; k < N; ++k) {
+                if (answer[k] != data[k]) {
+                    throw new RuntimeException("bug " + k + " " + answer[k] + " != " + data[k]);
+                }
+            }
+        }
+    }
+
+}
diff --git a/src/test/java/me/lemire/longcompression/LongDeltaTest.java b/src/test/java/me/lemire/longcompression/LongDeltaTest.java
new file mode 100644
index 0000000..bfa1e6f
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/LongDeltaTest.java
@@ -0,0 +1,23 @@
+package me.lemire.longcompression;
+
+import me.lemire.longcompression.differential.LongDelta;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertNotNull;
+
+public class LongDeltaTest {
+    @Test
+    public void testEmptyArrayFastInverseDelta() {
+        // An empty array must survive delta -> compress -> uncompress -> inverse delta unchanged.
+        final LongCompressor longCompressor = new LongCompressor();
+        final long[] empty = new long[0];
+
+        LongDelta.delta(empty);
+        final long[] roundTripped = longCompressor.uncompress(longCompressor.compress(empty));
+        LongDelta.fastinverseDelta(roundTripped);
+
+        assertNotNull(roundTripped);
+        assertArrayEquals(empty, roundTripped);
+    }
+}
diff --git a/src/test/java/me/lemire/longcompression/LongTestUtils.java b/src/test/java/me/lemire/longcompression/LongTestUtils.java
new file mode 100644
index 0000000..b7d9c63
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/LongTestUtils.java
@@ -0,0 +1,133 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Shared static helpers for the long-compression tests: array dumps and codec round-trip wrappers.
+ */
+public class LongTestUtils {
+
+    /** Prints {@code label}, then every value in decimal, starting a new line before each group of six. */
+    protected static void dumpIntArray(long[] data, String label) {
+        System.out.print(label);
+        int column = 0;
+        for (long value : data) {
+            if (column++ % 6 == 0)
+                System.out.println();
+            System.out.format(" %1$11d", value);
+        }
+        System.out.println();
+    }
+
+    /** Prints {@code label}, then every value in upper-case hex, starting a new line before each group of eight. */
+    protected static void dumpIntArrayAsHex(long[] data, String label) {
+        System.out.print(label);
+        int column = 0;
+        for (long value : data) {
+            if (column++ % 8 == 0)
+                System.out.println();
+            System.out.format(" %1$08X", value);
+        }
+        System.out.println();
+    }
+
+    /**
+     * Asserts that compressing and then uncompressing {@code orig} reproduces it exactly.
+     *
+     * @param codec CODEC under test
+     * @param orig  values to round-trip
+     */
+    public static void assertSymmetry(LongCODEC codec, long... orig) {
+        // The compressed form can be longer than the input, so the buffer
+        // handed to compress gets one spare slot.
+        //
+        // Example:
+        //  - VariableByte compresses an array like [ -1 ].
+        //  - Composition compresses a short array.
+        final int capacity = orig.length + 1;
+
+        final long[] packed = new long[capacity];
+        final IntWrapper packedLength = new IntWrapper(0);
+        codec.compress(orig, new IntWrapper(0), orig.length, packed, packedLength);
+
+        // The reported output length has to fit the buffer.
+        assertTrue(packedLength.get() <= capacity);
+
+        // Decode into a buffer of exactly the original length.
+        final long[] restored = new long[orig.length];
+        final IntWrapper restoredLength = new IntWrapper(0);
+        codec.uncompress(packed, new IntWrapper(0), packedLength.get(),
+                restored, restoredLength);
+
+        // Only the slots reported as written take part in the comparison.
+        assertArrayEquals(orig, Arrays.copyOf(restored, restoredLength.get()));
+    }
+
+    protected static long[] compress(LongCODEC codec, long[] data) {
+        // The scratch buffer offers eight output longs per input value.
+        final long[] scratch = new long[data.length * 8];
+        final IntWrapper written = new IntWrapper();
+        codec.compress(data, new IntWrapper(), data.length, scratch, written);
+        return Arrays.copyOf(scratch, written.get());
+    }
+
+    protected static long[] uncompress(LongCODEC codec, long[] data, int len) {
+        // The scratch buffer leaves 1024 slots of slack beyond the expected length.
+        final long[] scratch = new long[len + 1024];
+        final IntWrapper produced = new IntWrapper();
+        codec.uncompress(data, new IntWrapper(), data.length, scratch, produced);
+        return Arrays.copyOf(scratch, produced.get());
+    }
+
+    protected static byte[] compress(ByteLongCODEC codec, long[] data) {
+        // The scratch buffer offers sixteen output bytes per input value.
+        final byte[] scratch = new byte[data.length * 4 * 4];
+        final IntWrapper written = new IntWrapper();
+        codec.compress(data, new IntWrapper(), data.length, scratch, written);
+        return Arrays.copyOf(scratch, written.get());
+    }
+
+    protected static long[] uncompress(ByteLongCODEC codec, byte[] data, int len) {
+        // The scratch buffer leaves 1024 slots of slack beyond the expected length.
+        final long[] scratch = new long[len + 1024];
+        final IntWrapper produced = new IntWrapper();
+        codec.uncompress(data, new IntWrapper(), data.length, scratch, produced);
+        return Arrays.copyOf(scratch, produced.get());
+    }
+
+    protected static long[] compressHeadless(SkippableLongCODEC codec, long[] data) {
+        final int capacity = codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length);
+        final long[] scratch = new long[capacity];
+        final IntWrapper written = new IntWrapper();
+        codec.headlessCompress(data, new IntWrapper(), data.length, scratch, written);
+        return Arrays.copyOf(scratch, written.get());
+    }
+
+    protected static long[] uncompressHeadless(SkippableLongCODEC codec, long[] data, int len) {
+        final long[] scratch = new long[len + 1024];
+        final IntWrapper produced = new IntWrapper();
+        codec.headlessUncompress(data, new IntWrapper(), data.length, scratch, produced, len);
+        if (produced.get() < len) {
+            throw new RuntimeException("Insufficient output.");
+        }
+        return Arrays.copyOf(scratch, produced.get());
+    }
+
+    /** Renders {@code l} as exactly 64 binary digits, zero-padded on the left. */
+    public static String longToBinaryWithLeading(long l) {
+        final String unpadded = Long.toBinaryString(l);
+        return String.format("%64s", unpadded).replace(' ', '0');
+    }
+}
diff --git a/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java b/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java
new file mode 100644
index 0000000..c4b7e01
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/SkippableLongBasicTest.java
@@ -0,0 +1,194 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.integercompression.TestUtils;
+import me.lemire.integercompression.VariableByte;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Basic sanity tests for the headless API of the skippable long codecs.
+ *
+ * @author Benoit Lacelle
+ */
+@SuppressWarnings({ "static-method" })
+public class SkippableLongBasicTest {
+    /** Codecs exercised by the tests that iterate over all codecs. */
+    final SkippableLongCODEC[] codecs = {
+            new LongJustCopy(),
+            new LongVariableByte(),
+            new SkippableLongComposition(new LongBinaryPacking(), new LongVariableByte()), };
+
+    /**
+     * For every prefix length n from 0 to 4096, headless-compresses the first n values and checks that
+     * decoding produces n values, consumes exactly the compressed words, and restores the data.
+     */
+    @Test
+    public void consistentTest() {
+        int N = 4096;
+        long[] data = new long[N];
+        long[] rev = new long[N];
+        for (int k = 0; k < N; ++k)
+            data[k] = k % 128;
+        for (SkippableLongCODEC c : codecs) {
+            System.out.println("[SkippeableBasicTest.consistentTest] codec = "
+                    + c);
+            for (int n = 0; n <= N; ++n) {
+                IntWrapper inPos = new IntWrapper();
+                IntWrapper outPos = new IntWrapper();
+                long[] outBuf = new long[c.maxHeadlessCompressedLength(new IntWrapper(0), n)];
+
+                c.headlessCompress(data, inPos, n, outBuf, outPos);
+
+                IntWrapper inPoso = new IntWrapper();
+                IntWrapper outPoso = new IntWrapper();
+                c.headlessUncompress(outBuf, inPoso, outPos.get(), rev,
+                        outPoso, n);
+                if (outPoso.get() != n) {
+                    throw new RuntimeException("bug "+n);
+                }
+                if (inPoso.get() != outPos.get()) {
+                    throw new RuntimeException("bug "+n+" "+inPoso.get()+" "+outPos.get());
+                }
+                for (int j = 0; j < n; ++j)
+                    if (data[j] != rev[j]) {
+                        throw new RuntimeException("bug");
+                    }
+            }
+        }
+    }
+
+    /**
+     * Round-trips prefixes of the sequence 0, 1, 2, ... through the headless helpers: every length
+     * from 1 to 128, then every power of two from 128 up to 4096.
+     */
+    @Test
+    public void varyingLengthTest() {
+        int N = 4096;
+        long[] data = new long[N];
+        for (int k = 0; k < N; ++k)
+            data[k] = k;
+        for (SkippableLongCODEC c : codecs) {
+            System.out.println("[SkippeableBasicTest.varyingLengthTest] codec = "+c);
+            for (int L = 1; L <= 128; L++) {
+                long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L));
+                long[] answer = LongTestUtils.uncompressHeadless(c, comp, L);
+                for (int k = 0; k < L; ++k)
+                    if (answer[k] != data[k])
+                        throw new RuntimeException("bug "+c.toString()+" "+k+" "+answer[k]+" "+data[k]);
+            }
+            for (int L = 128; L <= N; L *= 2) {
+                long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L));
+                long[] answer = LongTestUtils.uncompressHeadless(c, comp, L);
+                for (int k = 0; k < L; ++k)
+                    if (answer[k] != data[k])
+                        throw new RuntimeException("bug");
+            }
+
+        }
+    }
+
+    /**
+     * Round-trips every prefix length from 1 to 128 of an all-zero array whose last element
+     * (index 127) is -1; only the full-length prefix contains that outlier.
+     */
+    @Test
+    public void varyingLengthTest2() {
+        int N = 128;
+        long[] data = new long[N];
+        data[127] = -1;
+        for (SkippableLongCODEC c : codecs) {
+            System.out.println("[SkippeableBasicTest.varyingLengthTest2] codec = "+c);
+
+            for (int L = 1; L <= 128; L++) {
+                long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L));
+                long[] answer = LongTestUtils.uncompressHeadless(c, comp, L);
+                for (int k = 0; k < L; ++k)
+                    if (answer[k] != data[k]) {
+                        throw new RuntimeException("L=" + L + ": bug at k = "+k+" "+answer[k]+" "+data[k]+" for "+c.toString());
+                    }
+            }
+            for (int L = 128; L <= N; L *= 2) {
+                long[] comp = LongTestUtils.compressHeadless(c, Arrays.copyOf(data, L));
+                long[] answer = LongTestUtils.uncompressHeadless(c, comp, L);
+                for (int k = 0; k < L; ++k)
+                    if (answer[k] != data[k])
+                        throw new RuntimeException("bug");
+            }
+
+        }
+    }
+
+    /**
+     * Checks {@code maxHeadlessCompressedLength} for each codec, including {@link LongBinaryPacking}
+     * on its own, over input lengths from 0 up to (but excluding) the bound passed for that codec.
+     */
+    @Test
+    public void testMaxHeadlessCompressedLength() {
+        testMaxHeadlessCompressedLength(new LongJustCopy(), 128);
+        testMaxHeadlessCompressedLength(new LongBinaryPacking(), 16 * LongBinaryPacking.BLOCK_SIZE);
+        testMaxHeadlessCompressedLength(new LongVariableByte(), 128);
+        testMaxHeadlessCompressedLength(new SkippableLongComposition(new LongBinaryPacking(), new LongVariableByte()), 16 * LongBinaryPacking.BLOCK_SIZE + 10);
+    }
+
+    /**
+     * For every input length below {@code inlengthTo}, compresses an array filled with -1 into a buffer
+     * of exactly the advertised maximum size and checks that the bound exceeds the real size by at most one.
+     *
+     * @param codec      codec whose bound is verified
+     * @param inlengthTo exclusive upper limit on the input lengths tried
+     */
+    private static void testMaxHeadlessCompressedLength(SkippableLongCODEC codec, int inlengthTo) {
+        for (int inlength = 0; inlength < inlengthTo; ++inlength) {
+            long[] input = new long[inlength];
+            Arrays.fill(input, -1L);
+
+            int maxOutputLength = codec.maxHeadlessCompressedLength(new IntWrapper(), inlength);
+            long[] output = new long[maxOutputLength];
+            IntWrapper outPos = new IntWrapper();
+
+            codec.headlessCompress(input, new IntWrapper(), inlength, output, outPos);
+            // If we reach this point, no exception was thrown, which means the calculated output length was sufficient.
+
+            assertTrue(maxOutputLength <= outPos.get() + 1); // +1 because SkippableLongComposition always adds one extra integer for the potential header
+        }
+    }
+
+    /**
+     * Headless-uncompresses into the output array starting at offsets 0, 1 and 6 and checks the values land there.
+     */
+    @Test
+    public void testUncompressOutputOffset_SkippableLongComposition() {
+        for (int offset : new int[] {0, 1, 6}) {
+            SkippableLongComposition codec = new SkippableLongComposition(new LongBinaryPacking(), new LongVariableByte());
+
+            long[] input = { 2, 3, 4, 5 };
+            long[] compressed = new long[codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length)];
+            long[] uncompressed = new long[offset + input.length];
+
+            IntWrapper inputOffset = new IntWrapper(0);
+            IntWrapper compressedOffset = new IntWrapper(0);
+
+            codec.headlessCompress(input, inputOffset, input.length, compressed, compressedOffset);
+
+            int compressedLength = compressedOffset.get();
+            IntWrapper uncompressedOffset = new IntWrapper(offset);
+            compressedOffset = new IntWrapper(0);
+            codec.headlessUncompress(compressed, compressedOffset, compressedLength, uncompressed, uncompressedOffset, input.length);
+
+            assertArrayEquals(input, Arrays.copyOfRange(uncompressed, offset, offset + input.length));
+        }
+    }
+}
diff --git a/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java b/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java
new file mode 100644
index 0000000..bddff2a
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/TestLongAs2IntsCodec.java
@@ -0,0 +1,31 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Runs the tests inherited from {@link ATestLongCODEC} against {@link LongAs2IntsCodec}, plus one edge case: 2^42 must compress to three longs.
+ *
+ * @author Benoit Lacelle
+ */
+public class TestLongAs2IntsCodec extends ATestLongCODEC {
+	final LongAs2IntsCodec codec = new LongAs2IntsCodec();
+
+	@Override
+	public LongCODEC getCodec() {
+		return this.codec;
+	}
+
+    @Test
+    public void testCodec_intermediateHighPowerOfTwo() {
+        final long[] packed = LongTestUtils.compress((LongCODEC) this.codec, new long[] { 1L << 42 });
+        Assert.assertEquals(3, packed.length);
+    }
+}
diff --git a/src/test/java/me/lemire/longcompression/TestLongBinaryPacking.java b/src/test/java/me/lemire/longcompression/TestLongBinaryPacking.java
new file mode 100644
index 0000000..ecc3f2e
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/TestLongBinaryPacking.java
@@ -0,0 +1,26 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import org.junit.Ignore;
+
+/**
+ * Binds {@link LongBinaryPacking} to the tests inherited from {@link ATestLongCODEC}; disabled, see the {@code @Ignore} reason.
+ *
+ * @author Benoit Lacelle
+ */
+@Ignore("Parent class tests are not valid as LongBinaryPacking process by chunks of 64 longs")
+public class TestLongBinaryPacking extends ATestLongCODEC {
+	final LongBinaryPacking codec = new LongBinaryPacking();
+
+	/** Supplies this class's codec instance to the parent class. */
+	@Override
+	public LongCODEC getCodec() {
+		return this.codec;
+	}
+}
diff --git a/src/test/java/me/lemire/longcompression/TestLongVariableByte.java b/src/test/java/me/lemire/longcompression/TestLongVariableByte.java
new file mode 100644
index 0000000..3cb2a49
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/TestLongVariableByte.java
@@ -0,0 +1,40 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+package me.lemire.longcompression;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Edge cases that have caused issues specifically with {@link LongVariableByte}.
+ *
+ * @author Benoit Lacelle
+ */
+public class TestLongVariableByte extends ATestLongCODEC {
+	final LongVariableByte codec = new LongVariableByte();
+
+	@Override
+	public LongCODEC getCodec() {
+		return this.codec;
+	}
+
+	/** Checks the compressed size of a single value for every bit width from 0 to 64. */
+	@Test
+	public void testCodec_allBitWidths() {
+		for (int bitWidth = 0; bitWidth <= 64; bitWidth++) {
+			// Only bit (bitWidth - 1) is set, so the value needs exactly bitWidth bits; width 0 maps to 0.
+			final long value = (bitWidth > 0) ? 1L << (bitWidth - 1) : 0L;
+			final int byteCount = Math.max(1, (bitWidth + 6) / 7);
+			final int longCount = (byteCount <= 8) ? 1 : 2;
+
+			Assert.assertEquals(longCount, LongTestUtils.compress((LongCODEC) this.codec, new long[] { value }).length);
+			Assert.assertEquals(byteCount, LongTestUtils.compress((ByteLongCODEC) this.codec, new long[] { value }).length);
+			Assert.assertEquals(longCount, LongTestUtils.compressHeadless((SkippableLongCODEC) this.codec, new long[] { value }).length);
+		}
+	}
+}
diff --git a/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java b/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java
new file mode 100644
index 0000000..c964f6f
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/synth/LongClusteredDataGenerator.java
@@ -0,0 +1,91 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+package me.lemire.longcompression.synth;
+
+import me.lemire.integercompression.synth.ClusteredDataGenerator;
+
+/**
+ * This class will generate lists of random longs based on the clustered
+ * model:
+ * 
+ * Reference: Vo Ngoc Anh and Alistair Moffat. 2010. Index compression using
+ * 64-bit words. Softw. Pract. Exper.40, 2 (February 2010), 131-147.
+ * 
+ * @author Benoit Lacelle
+ * @see ClusteredDataGenerator
+ */
+public class LongClusteredDataGenerator {
+        final LongUniformDataGenerator unidg = new LongUniformDataGenerator();
+
+        /**
+         * Creating random array generator.
+         */
+        public LongClusteredDataGenerator() {
+        }
+
+        /** Writes the values of {@code generateUniform(length, Max - Min)}, each shifted by {@code Min}, into {@code array} from {@code offset}. */
+        void fillUniform(long[] array, int offset, int length, long Min, long Max) {
+                long[] v = this.unidg.generateUniform(length, Max - Min);
+                for (int k = 0; k < v.length; ++k)
+                        array[k + offset] = Min + v[k];
+        }
+
+        /**
+         * Fills {@code length} slots of {@code array}, starting at {@code offset}, by cutting the value range
+         * at a random point and filling each half of the slots uniformly or, recursively, clustered.
+         */
+        void fillClustered(long[] array, int offset, int length, long Min, long Max) {
+                final long range = Max - Min;
+                if ((range == length) || (length <= 10)) {
+                        fillUniform(array, offset, length, Min, Max);
+                        return;
+                }
+                final long slack = range - length - 1;
+                // The product is parenthesized before the cast: casting nextDouble() on its own
+                // (a value in [0, 1)) would always give 0 and pin the cut at length / 2.
+                final long cut = length / 2
+                        + ((slack > 0) ? (long) (this.unidg.rand.nextDouble() * slack) : 0);
+                final double p = this.unidg.rand.nextDouble();
+                if (p < 0.25) {
+                        fillUniform(array, offset, length / 2, Min, Min + cut);
+                        fillClustered(array, offset + length / 2, length - length / 2, Min + cut, Max);
+                } else if (p < 0.5) {
+                        fillClustered(array, offset, length / 2, Min, Min + cut);
+                        fillUniform(array, offset + length / 2, length - length / 2, Min + cut, Max);
+                } else {
+                        fillClustered(array, offset, length / 2, Min, Min + cut);
+                        fillClustered(array, offset + length / 2, length - length / 2, Min + cut, Max);
+                }
+        }
+
+        /**
+         * generates randomly N distinct longs from 0 to Max.
+         * 
+         * @param N
+         *                number of longs to generate
+         * @param Max
+         *                maximal value of the longs
+         * @return array containing the longs
+         */
+        public long[] generateClustered(int N, long Max) {
+                long[] array = new long[N];
+                fillClustered(array, 0, N, 0, Max);
+                return array;
+        }
+
+        /**
+         * Little test program.
+         * 
+         * @param args
+         *                arguments are ignored
+         */
+        public static void main(final String[] args) {
+                long[] example = new LongClusteredDataGenerator().generateClustered(20, 1000);
+                for (int k = 0; k < example.length; ++k)
+                        System.out.println(example[k]);
+        }
+}
diff --git a/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java b/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java
new file mode 100644
index 0000000..4aa797b
--- /dev/null
+++ b/src/test/java/me/lemire/longcompression/synth/LongUniformDataGenerator.java
@@ -0,0 +1,125 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+package me.lemire.longcompression.synth;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Random;
+import java.util.Set;
+
+import org.roaringbitmap.longlong.Roaring64Bitmap;
+
+import me.lemire.integercompression.synth.UniformDataGenerator;
+
+/**
+ * This class will generate "uniform" lists of random longs.
+ * 
+ * @author Benoit Lacelle
+ * @see UniformDataGenerator
+ */
+public class LongUniformDataGenerator {
+        /** Constructs a generator of random arrays backed by an unseeded {@link Random}. */
+        public LongUniformDataGenerator() {
+                this.rand = new Random();
+        }
+
+        /**
+         * @param seed
+         *                random seed
+         */
+        public LongUniformDataGenerator(final int seed) {
+                this.rand = new Random(seed);
+        }
+
+        /**
+         * generates randomly N distinct longs from 0 to Max by collecting draws in a hash set; the result is sorted.
+         */
+        long[] generateUniformHash(int N, long Max) {
+                if (N > Max)
+                        throw new RuntimeException("not possible");
+                long[] ans = new long[N];
+                Set<Long> s = new HashSet<>();
+                while (s.size() < N)
+                        s.add((long) (this.rand.nextDouble() * Max));
+                Iterator<Long> i = s.iterator();
+                for (int k = 0; k < N; ++k)
+                        ans[k] = i.next().longValue();
+                Arrays.sort(ans);
+                return ans;
+        }
+
+        /**
+         * output all longs from the range [0,Max) that are not in the array (which is scanned as if sorted in increasing order)
+         */
+        static long[] negate(long[] x, long Max) {
+                int newLength = saturatedCast(Max - x.length);
+                long[] ans = new long[newLength];
+                int i = 0;
+                int c = 0;
+                for (int j = 0; j < x.length; ++j) {
+                        long v = x[j];
+                        for (; i < v; ++i)
+                                ans[c++] = i;
+                        ++i;
+                }
+                while (c < ans.length)
+                        ans[c++] = i++;
+                return ans;
+        }
+
+        /** Narrows {@code toInt} to an int, clamping values above {@code Integer.MAX_VALUE} to that maximum. */
+        private static int saturatedCast(long toInt) {
+                if (toInt > Integer.MAX_VALUE) {
+                        return Integer.MAX_VALUE;
+                } else {
+                        return (int) toInt;
+                }
+        }
+
+        /**
+         * generates randomly N distinct longs from 0 to Max.
+         * 
+         * @param N
+         *                number of longs to generate
+         * @param Max
+         *                bound on the value of longs
+         * @return an array containing randomly selected longs
+         */
+        public long[] generateUniform(int N, long Max) {
+                assert N >= 0;
+                assert Max >= 0;
+                // Multiply in long arithmetic: as int products, N * 2 and 2048 * N overflow for large N.
+                if (2L * N > Max) {
+                        return negate(generateUniform(saturatedCast(Max - N), Max), Max);
+                }
+                if (2048L * N > Max)
+                        return generateUniformBitmap(N, Max);
+                return generateUniformHash(N, Max);
+        }
+
+        /**
+         * generates randomly N distinct longs from 0 to Max, using a bitmap to reject values that were already drawn.
+         */
+        long[] generateUniformBitmap(int N, long Max) {
+                if (N > Max)
+                        throw new RuntimeException("not possible");
+                Roaring64Bitmap bs = new Roaring64Bitmap();
+                int cardinality = 0;
+                while (cardinality < N) {
+                        long v = (long) (rand.nextDouble() * Max);
+                        if (!bs.contains(v)) {
+                                bs.add(v);
+                                cardinality++;
+                        }
+                }
+                return bs.toArray();
+        }
+
+        /** Random source shared by all generation methods; assigned by both constructors. */
+        Random rand;
+}
\ No newline at end of file