diff --git a/.github/release-settings.xml b/.github/release-settings.xml
new file mode 100644
index 0000000..be56a53
--- /dev/null
+++ b/.github/release-settings.xml
@@ -0,0 +1,20 @@
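+<?xml version="1.0" encoding="UTF-8"?>
+<settings>
+  <pluginGroups>
+    <pluginGroup>eu.maveniverse.maven.plugins</pluginGroup>
+  </pluginGroups>
+
+  <servers>
+    <server>
+      <id>sonatype-central-portal</id>
+      <username>${env.MAVEN_USER}</username>
+      <password>${env.MAVEN_PASSWORD}</password>
+      <configuration>
+        <njord.publisher>sonatype-cp</njord.publisher>
+        <njord.releaseUrl>njord:template:release-sca</njord.releaseUrl>
+      </configuration>
+    </server>
+  </servers>
+</settings>
+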
diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml
new file mode 100644
index 0000000..7f12ed7
--- /dev/null
+++ b/.github/workflows/basic.yml
@@ -0,0 +1,26 @@
+# Builds and tests the library with Maven on every push and pull request,
+# then compiles and runs example.java against the freshly built classes.
+name: Java CI
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        java: [21]
+    steps:
+      - uses: actions/checkout@v4.1.1
+      - name: Set up JDK
+        uses: actions/setup-java@v4.1.0
+        with:
+          java-version: ${{ matrix.java }}
+          distribution: 'temurin' # 'adopt' is a legacy name; setup-java recommends temurin
+      - name: Build and test with Maven
+        run: mvn package
+      - name: Build example
+        run: javac -cp target/classes/:. example.java
+      - name: Run example
+        run: java -cp target/classes/:. example
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..d6ad167
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,102 @@
+# Manually triggered release workflow: runs maven-release-plugin prepare and
+# perform using .github/release-settings.xml, then creates a GitHub release
+# for the tag recorded in release.properties.
+name: Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      releaseVersion:
+        description: "Release version, e.g. 0.3.6 (optional — auto-detected from the current POM)"
+        required: false
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # to automatically create tags
+
+    steps:
+      # Reject a malformed version early. The input is handed over through env so
+      # it is never expanded into the shell source (script-injection hardening).
+      - name: Validate release version
+        if: ${{ github.event.inputs.releaseVersion != '' }}
+        env:
+          RELEASE_VERSION: ${{ github.event.inputs.releaseVersion }}
+        run: |
+          if [[ ! "$RELEASE_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?$ ]]; then
+            echo "Error: releaseVersion '$RELEASE_VERSION' is not in the correct format x.y.z or x.y.z-SNAPSHOT"
+            exit 1
+          fi
+
+      # fetch-depth: 0 checks out the complete history of master.
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: master
+
+      - name: Set up Java
+        uses: actions/setup-java@v4
+        with:
+          java-version: '21'
+          distribution: 'temurin' # 'adopt' is a legacy name; setup-java recommends temurin
+          gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
+          gpg-passphrase: MAVEN_GPG_PASSPHRASE # name of the env var that carries the passphrase
+
+      - name: Configure git
+        run: |
+          git config user.email "actions@github.com"
+          git config user.name "GitHub Actions"
+
+      - name: Prepare Release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          RELEASE_VERSION: ${{ github.event.inputs.releaseVersion }}
+        run: |
+          # Pass -DreleaseVersion only when a version was supplied as workflow input.
+          MVN_ARGS=()
+          if [ -n "$RELEASE_VERSION" ]; then
+            MVN_ARGS+=("-DreleaseVersion=$RELEASE_VERSION")
+          fi
+          mvn -B release:prepare "${MVN_ARGS[@]}"
+
+      - name: Check release.properties
+        run: |
+          if [ ! -f release.properties ]; then
+            echo "release.properties not found"
+            exit 1
+          fi
+          echo "Contents of release.properties:"
+          cat release.properties
+
+      - name: Determine release version
+        id: version
+        run: |
+          # scm.tag holds the tag name; stripping the "JavaFastPFOR-" prefix yields the version.
+          TAG=$(grep '^scm\.tag=' release.properties | cut -d'=' -f2)
+          VERSION=${TAG#JavaFastPFOR-}
+
+          echo "released_tag=${TAG}" >> "$GITHUB_OUTPUT"
+          echo "released_version=${VERSION}" >> "$GITHUB_OUTPUT"
+
+          echo "Releasing tag: ${TAG}"
+          echo "Releasing version: ${VERSION}"
+
+      - name: Release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          MAVEN_GPG_KEY: ${{ secrets.GPG_PRIVATE_KEY }}
+          MAVEN_USER: ${{ secrets.MAVEN_USER }}
+          MAVEN_PASSWORD: ${{ secrets.MAVEN_PASSWORD }}
+        run: |
+          mvn -B release:perform -Darguments="-DskipTests -DaltDeploymentRepository=id::default::njord: -Dnjord.autoPublish=true -Dnjord.publishingType=automatic" -s .github/release-settings.xml
+
+      - name: Create GitHub Release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASED_TAG: ${{ steps.version.outputs.released_tag }}
+          RELEASED_VERSION: ${{ steps.version.outputs.released_version }}
+        run: gh release create "$RELEASED_TAG" --generate-notes --title "Version $RELEASED_VERSION"
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 53960d2..5a78c84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
.classpath
+.settings
.project
*.class
*.csv
@@ -6,3 +7,5 @@ tags
target/
tmp/
/bin
+.idea
+*.iml
diff --git a/.travis.yml b/.travis.yml
index a159cc4..427b879 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,10 +1,12 @@
language: java
jdk:
- - oraclejdk7
- - openjdk7
- - openjdk6
+ - oraclejdk8
+ - oraclejdk9
install: true
-script: mvn test
+script: mvn clean test jacoco:report
+
+after_success:
+ - mvn coveralls:report
diff --git a/CHANGELOG b/CHANGELOG
index 75c5404..1377346 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,18 @@
+0.1.10 (October 7th 2016)
+ - Fix for issue 36 https://github.com/lemire/JavaFastPFOR/issues/36
+
+0.1.9 (September 14th 2016)
+ - Tuning GroupSimple9
+
+0.1.8 (September 13th 2016)
+ - Added GroupSimple9
+
+0.1.7 (April 22nd 2016)
+ - Documentation fix.
+
+0.1.4, 0.1.5, 0.1.6 (November 25th 2015)
+ - Added IntCompressor and IntegratedIntCompressor for users looking for a simpler API
+
0.1.3 (June 19th 2015)
- Fixed issue #29: Composition not working properly when output offset != 0 (Saúl Vargas)
diff --git a/README.md b/README.md
index b29cd58..0246789 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,9 @@
JavaFastPFOR: A simple integer compression library in Java
==========================================================
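+[![](https://jitpack.io/v/fast-pack/JavaFastPFor.svg)](https://jitpack.io/#fast-pack/JavaFastPFor) [![][license img]][license] [![docs-badge][]][docs]
+[![Java CI](https://github.com/lemire/JavaFastPFOR/actions/workflows/basic.yml/badge.svg)](https://github.com/lemire/JavaFastPFOR/actions/workflows/basic.yml)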
-[](https://travis-ci.org/lemire/JavaFastPFOR) [![][maven img]][maven] [![][license img]][license] [![docs-badge][]][docs]
-License
--------
-
-This code is released under the
-Apache License Version 2.0 http://www.apache.org/licenses/.
What does this do?
@@ -15,20 +11,27 @@ What does this do?
It is a library to compress and uncompress arrays of integers
very fast. The assumption is that most (but not all) values in
-your array use less than 32 bits. These sort of arrays often come up
+your array use much less than 32 bits, or that the gaps between
+the integers use much less than 32 bits. Such arrays often come up
when using differential coding in databases and information
retrieval (e.g., in inverted indexes or column stores).
-It can decompress integers at a rate of over 1.2 billions per second
+Please note that random integers are not compressible, by this
+library or by any other means. If you ever had the means of
+systematically compressing random integers, you could compress
+any data source to nothing, by recursive application of your technique.
+
+This library can decompress integers at a rate of over 1.2 billion per second
(4.5 GB/s). It is significantly faster than generic codecs (such
as Snappy, LZ4 and so on) when compressing arrays of integers.
+The library is used in [LinkedIn Pinot](https://github.com/linkedin/pinot), a realtime distributed OLAP datastore.
Part of this library has been integrated in Parquet (http://parquet.io/).
A modified version of the library is included in the search engine
Terrier (http://terrier.org/). This libary is used by ClueWeb
-Tools (https://github.com/lintool/clueweb).
+Tools (https://github.com/lintool/clueweb). It is also used by [Apache NiFi](https://nifi.apache.org).
-This library inspired a compression scheme used by Apache Lucene (e.g., see
+This library inspired a compression scheme used by Apache Lucene and Apache Lucene.NET (e.g., see
http://lucene.apache.org/core/4_6_1/core/org/apache/lucene/util/PForDeltaDocIdSet.html ).
It is a java port of the fastpfor C++ library (https://github.com/lemire/FastPFor).
@@ -40,58 +43,149 @@ as well as in GMAP and GSNAP (http://research-pub.gene.com/gmap/).
Usage
------
-See example.java.
+
+```java
+package org.example;
+
+import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.IntWrapper;
+
+import java.util.Arrays;
+
+public class Main { // round-trips an int array through FastPFOR128 and reports mismatches
+    public static void main(String[] args) {
+        FastPFOR128 codec = new FastPFOR128();
+
+        int N = 9984;
+        int[] data = new int[N]; // zero-filled; only every 150th slot is set below
+        for (int i = 0; i < N; i += 150) {
+            data[i] = i;
+        }
+
+        int[] compressed = new int[N + 1024]; // destination buffer with 1024 ints of slack
+
+        IntWrapper dataPos = new IntWrapper(0); // cursor into data
+        IntWrapper compressedPos = new IntWrapper(0); // cursor into compressed
+
+        codec.compress(data, dataPos, N, compressed, compressedPos);
+        int compressedSize = compressedPos.get(); // compressed length, in 32-bit words
+
+        int[] recovered = new int[N];
+        compressedPos = new IntWrapper(0); // rewind: compressed is now the input
+        IntWrapper recoveredPos = new IntWrapper(0); // cursor into recovered
+        codec.uncompress(compressed, compressedPos, compressedSize, recovered, recoveredPos);
+
+        // quick verification: count mismatches
+        int mismatches = 0;
+        for (int i = 0; i < N; i++) {
+            if (data[i] != recovered[i]) mismatches++;
+        }
+
+        System.out.println("N=" + N + " compressedSizeWords=" + compressedSize + " mismatches=" + mismatches);
+        System.out.println("first 20 original: " + Arrays.toString(Arrays.copyOf(data, 20)));
+        System.out.println("first 20 recovered: " + Arrays.toString(Arrays.copyOf(recovered, 20)));
+    }
+}
+
+```
+
+For more examples, see example.java or the examples folder.
+
+JavaFastPFOR supports compressing and uncompressing data in chunks (e.g., see ``advancedExample`` in [example.java](https://github.com/lemire/JavaFastPFOR/blob/master/example.java)).
Some CODECs ("integrated codecs") assume that the integers are
in sorted orders and use differential coding (they compress deltas).
-They can be found in the package me.lemire.integercopression.differential.
+They can be found in the package me.lemire.integercompression.differential.
Most others do not.
+The Java Team at Intel (R) introduced the vector implementation for FastPFOR
+based on the Java Vector API that showed significant gains over the
+non-vectorized implementation. For an example usage, see
+examples/vector/Example.java. The feature requires JDK 19+ and is currently for
+advanced users.
-Maven central repository
+JavaFastPFOR as a dependency
------------------------
-Using this code in your own project is easy with maven, just add
-the following code in your pom.xml file:
+JavaFastPFOR is available both on Maven Central and JitPack, so you can easily
+include it in your project using either source.
+We have a demo project using JavaFastPFOR as a dependency (both Maven and Gradle). See...
-
-
- me.lemire.integercompression
- JavaFastPFOR
- 0.1.3
-
-
+https://github.com/fast-pack/JavaFastPFORDemo
-Naturally, you should replace "version" by the version
-you desire.
+### Maven Central
+You can add JavaFastPFOR directly from Maven Central — no extra repository configuration needed:
+**Maven**
-You can also download JavaFastPFOR from the Maven central repository:
-http://repo1.maven.org/maven2/me/lemire/integercompression/JavaFastPFOR/
+```xml
+<dependency>
+    <groupId>me.lemire.integercompression</groupId>
+    <artifactId>JavaFastPFOR</artifactId>
+    <version>0.3.8</version>
+</dependency>
+```
+**Gradle (Groovy)**
-Why?
-----
+```groovy
+dependencies {
+ implementation 'me.lemire.integercompression:JavaFastPFOR:0.3.8'
+}
+```
+
+### JitPack
+
+If you prefer or need to use JitPack, you can include the dependency like this:
-We found no library that implemented state-of-the-art integer coding techniques
-such as Binary Packing, NewPFD, OptPFD, Variable Byte, Simple 9 and so on in Java.
-We wrote one.
+**Maven**
+```xml
+<repositories>
+    <repository>
+        <id>jitpack.io</id>
+        <url>https://jitpack.io</url>
+    </repository>
+</repositories>
-Authors
--------
+<dependency>
+    <groupId>com.github.fast-pack</groupId>
+    <artifactId>JavaFastPFOR</artifactId>
+    <version>JavaFastPFOR-0.3.8</version>
+</dependency>
+```
-Main contributors
-* Daniel Lemire, http://lemire.me/en/
-* Muraoka Taro, https://github.com/koron
+**Gradle (Groovy)**
-with contributions by
-* the Terrier team (Matteo Catena, Craig Macdonald, Saúl Vargas and Iadh Ounis)
-* Di Wu, http://www.facebook.com/diwu1989
-* Stefan Ackermann, https://github.com/Stivo
-* Samit Roy, https://github.com/roysamit
+```groovy
+repositories {
+ mavenCentral()
+ maven {
+ url 'https://jitpack.io'
+ }
+}
+
+dependencies {
+ implementation 'com.github.fast-pack:JavaFastPFOR:JavaFastPFOR-0.3.8'
+}
+```
+
+Naturally, you should replace "0.3.8" with the version
+you desire.
+
+
+Thread safety
+----
+
+Some codecs are thread-safe while others are not.
+For this reason, it is best to use one codec per thread.
+The memory usage of a codec instance is small in any case.
+
+Nevertheless, if you want to reuse codec instances,
+note that by convention, unless the documentation of a codec specifies
+that it is not thread-safe, then it can be assumed to be thread-safe.
How does it compare to the Kamikaze PForDelta library?
------------------------------------------------------
@@ -106,22 +200,20 @@ Reference:
http://sna-projects.com/kamikaze/
+
Requirements
------------
-A recent Java compiler. Java 7 or better is recommended.
+Releases up to 0.1.12 require Java 7 or better.
-Good instructions on installing Java 7 on Linux:
+The current development versions assume JDK 21 or better.
-http://forums.linuxmint.com/viewtopic.php?f=42&t=93052
How fast is it?
---------------
-Compile the code and execute me.lemire.integercompression.benchmarktools.Benchmark.
-
-I recommend running all the benchmarks with the "-server" flag on a desktop machine.
+Compile the code and execute `me.lemire.integercompression.benchmarktools.Benchmark`.
Speed is always reported in millions of integers per second.
@@ -129,11 +221,21 @@ Speed is always reported in millions of integers per second.
For Maven users
---------------
-mvn compile
+```
+mvn compile
mvn exec:java
+```
+
+You may run our examples as follows:
-For ant users
+```
+mvn package
+javac -cp target/classes/:. example.java
+java -cp target/classes/:. example
+```
+
+For ant users (legacy, currently untested)
-------------
If you use Apache ant, please try this:
@@ -148,7 +250,22 @@ or:
API Documentation
-----------------
-http://lemire.me/docs/javafastpfor/
+http://www.javadoc.io/doc/me.lemire.integercompression/JavaFastPFOR/
+
+
+Citing this work
+-----------------
+
+If you use JavaFastPFOR in your work, please consider citing the project. A recommended BibTeX entry is:
+
+```bibtex
+@misc{lemire2025_javafastpfor,
+ author = {Daniel Lemire},
+ title = {{JavaFastPFOR: A simple integer compression library in Java}},
+ year = {2025},
+ howpublished = {\url{https://github.com/fast-pack/JavaFastPFOR}},
+}
+```
Want to read more?
------------------
@@ -160,7 +277,8 @@ http://dx.doi.org/10.1007/978-3-319-06028-6_30
We wrote several research papers documenting many of the CODECs implemented here:
-* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience (to appear) http://arxiv.org/abs/1401.6399
+* Daniel Lemire, Nathan Kurz, Christoph Rupp, Stream VByte: Faster Byte-Oriented Integer Compression, Information Processing Letters (to appear) https://arxiv.org/abs/1709.08990
+* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience Volume 46, Issue 6, pages 723-749, June 2016 http://arxiv.org/abs/1401.6399
* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
@@ -169,10 +287,17 @@ We wrote several research papers documenting many of the CODECs implemented here
Ikhtear Sharif wrote his M.Sc. thesis on this library:
Ikhtear Sharif, Performance Evaluation of Fast Integer Compression Techniques Over Tables, M.Sc. thesis, UNB 2013.
-http://lemire.me/fr/documents/thesis/IkhtearThesis.pdf
+https://unbscholar.lib.unb.ca/islandora/object/unbscholar%3A9399/datastream/PDF/view
He also posted his slides online: http://www.slideshare.net/ikhtearSharif/ikhtear-defense
+Other recommended libraries
+-----------------------------
+
+* Fast integer compression in Go: https://github.com/ronanh/intcomp
+* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
+* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR
+* TurboPFor is a C library that offers lots of interesting optimizations and Java wrappers. Well worth checking! (Uses a GPL license.) https://github.com/powturbo/TurboPFor
Funding
-----------
@@ -180,11 +305,9 @@ Funding
This work was supported by NSERC grant number 26143.
-[maven img]:https://maven-badges.herokuapp.com/maven-central/me.lemire.integercompression/JavaFastPFOR/badge.svg
-[maven]:http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22me.lemire.integercompression%22%20
[license]:LICENSE
[license img]:https://img.shields.io/badge/License-Apache%202-blue.svg
[docs-badge]:https://img.shields.io/badge/API-docs-blue.svg?style=flat-square
-[docs]:http://lemire.me/docs/javafastpfor/
+[docs]:http://www.javadoc.io/doc/me.lemire.integercompression/JavaFastPFOR/
diff --git a/benchmarkresults/benchmarkresults_haswell_18sept2014.txt b/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
index 43fa98b..a501d5d 100644
--- a/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
+++ b/benchmarkresults/benchmarkresults_haswell_18sept2014.txt
@@ -1,7 +1,7 @@
# benchmark based on the ClusterData model from:
-# Vo Ngoc Anh and Alistair Moffat.
-# Index compression using 64-bit words.
-# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
+# Vo Ngoc Anh and Alistair Moffat.
+# Index compression using 64-bit words.
+# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
# Results will be written into a CSV file: benchmark-20140918T011257.csv
@@ -10,852 +10,852 @@
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.56 246 1061
+ 2.56 246 1061
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 3.21 66 275
+ 3.21 66 275
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.96 838 1679
+ 2.96 838 1679
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1276 1805
+ 32.00 1276 1805
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 490 509
+ 8.00 490 509
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 582 774
+ 8.00 582 774
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.96 765 1193
+ 2.96 765 1193
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.88 139 896
+ 2.88 139 896
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.90 166 905
+ 2.90 166 905
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.88 139 898
+ 2.88 139 898
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.87 25 938
+ 2.87 25 938
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.90 29 960
+ 2.90 29 960
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.87 25 882
+ 2.87 25 882
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.63 274 1015
+ 2.63 274 1015
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 2.83 280 771
+ 2.83 280 771
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.84 444 837
+ 2.84 444 837
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.27 498 652
+ 3.27 498 652
# sparsity 2
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.51 244 1048
+ 3.51 244 1048
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 4.18 55 247
+ 4.18 55 247
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.93 862 1611
+ 3.93 862 1611
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1286 1816
+ 32.00 1286 1816
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.01 486 508
+ 8.01 486 508
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.01 575 763
+ 8.01 575 763
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.93 774 1159
+ 3.93 774 1159
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.83 118 865
+ 3.83 118 865
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.86 141 875
+ 3.86 141 875
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.83 118 867
+ 3.83 118 867
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 18 881
+ 3.82 18 881
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.85 22 887
+ 3.85 22 887
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 18 838
+ 3.82 18 838
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.58 273 990
+ 3.58 273 990
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 201 656
+ 3.82 201 656
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.90 442 819
+ 3.90 442 819
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.50 494 640
+ 4.50 494 640
# sparsity 3
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.28 244 1030
+ 4.28 244 1030
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 4.95 51 247
+ 4.95 51 247
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.71 850 1577
+ 4.71 850 1577
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1252 1769
+ 32.00 1252 1769
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 478 504
+ 8.02 478 504
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 573 762
+ 8.02 573 762
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.71 770 1139
+ 4.71 770 1139
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.60 107 850
+ 4.60 107 850
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.63 127 863
+ 4.63 127 863
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.60 107 853
+ 4.60 107 853
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.59 14 865
+ 4.59 14 865
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.62 18 882
+ 4.62 18 882
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.59 14 844
+ 4.59 14 844
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.34 268 969
+ 4.34 268 969
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 4.72 170 610
+ 4.72 170 610
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.68 434 783
+ 4.68 434 783
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.33 472 624
+ 5.33 472 624
# sparsity 4
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.03 239 1004
+ 5.03 239 1004
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 5.73 47 251
+ 5.73 47 251
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.48 846 1556
+ 5.48 846 1556
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1274 1799
+ 32.00 1274 1799
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.12 439 486
+ 8.12 439 486
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.12 537 715
+ 8.12 537 715
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.48 769 1134
+ 5.48 769 1134
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.36 95 817
+ 5.36 95 817
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.39 115 838
+ 5.39 115 838
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.36 96 827
+ 5.36 96 827
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.34 12 842
+ 5.34 12 842
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.37 16 871
+ 5.37 16 871
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.34 12 803
+ 5.34 12 803
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.09 268 963
+ 5.09 268 963
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 5.57 150 587
+ 5.57 150 587
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.47 432 800
+ 5.47 432 800
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.16 491 635
+ 6.16 491 635
# sparsity 5
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.00 236 999
+ 6.00 236 999
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 6.70 43 242
+ 6.70 43 242
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.45 863 1584
+ 6.45 863 1584
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1236 1792
+ 32.00 1236 1792
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.40 369 452
+ 8.40 369 452
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.40 486 617
+ 8.40 486 617
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.45 777 1132
+ 6.45 777 1132
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.33 86 808
+ 6.33 86 808
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.36 103 828
+ 6.36 103 828
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.33 86 813
+ 6.33 86 813
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.31 9 825
+ 6.31 9 825
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.34 13 858
+ 6.34 13 858
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.31 9 819
+ 6.31 9 819
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.06 265 945
+ 6.06 265 945
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 6.65 139 546
+ 6.65 139 546
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.45 442 804
+ 6.45 442 804
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.18 493 635
+ 7.18 493 635
# sparsity 6
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.76 238 998
+ 6.76 238 998
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 7.45 42 251
+ 7.45 42 251
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.20 854 1525
+ 7.20 854 1525
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1177 1663
+ 32.00 1177 1663
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.10 259 362
+ 9.10 259 362
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.10 380 450
+ 9.10 380 450
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.20 718 1098
+ 7.20 718 1098
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.08 79 786
+ 7.08 79 786
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.11 95 821
+ 7.11 95 821
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.08 81 814
+ 7.08 81 814
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.06 8 836
+ 7.06 8 836
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.09 11 860
+ 7.09 11 860
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.06 8 822
+ 7.06 8 822
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.81 268 962
+ 6.81 268 962
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 7.56 129 509
+ 7.56 129 509
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.19 433 789
+ 7.19 433 789
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.93 491 632
+ 7.93 491 632
# sparsity 7
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.05 236 985
+ 8.05 236 985
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 8.75 39 247
+ 8.75 39 247
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.50 861 1526
+ 8.50 861 1526
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1279 1788
+ 32.00 1279 1788
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.11 190 305
+ 10.11 190 305
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.11 311 355
+ 10.11 311 355
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.50 753 1092
+ 8.50 753 1092
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.37 71 792
+ 8.37 71 792
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.40 83 804
+ 8.40 83 804
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.37 72 805
+ 8.37 72 805
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.35 7 808
+ 8.35 7 808
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.38 10 835
+ 8.38 10 835
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.35 7 796
+ 8.35 7 796
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.10 259 920
+ 8.10 259 920
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 9.16 111 447
+ 9.16 111 447
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.52 435 784
+ 8.52 435 784
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.32 485 622
+ 9.32 485 622
# sparsity 8
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.73 234 972
+ 8.73 234 972
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 9.44 37 250
+ 9.44 37 250
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.19 848 1493
+ 9.19 848 1493
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1279 1858
+ 32.00 1279 1858
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.04 167 307
+ 11.04 167 307
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.04 309 353
+ 11.04 309 353
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.19 751 1095
+ 9.19 751 1095
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.06 67 770
+ 9.06 67 770
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.09 78 781
+ 9.09 78 781
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.06 68 792
+ 9.06 68 792
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.03 6 795
+ 9.03 6 795
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.07 9 824
+ 9.07 9 824
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.03 6 787
+ 9.03 6 787
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.78 266 936
+ 8.78 266 936
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 10.34 101 427
+ 10.34 101 427
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.21 437 794
+ 9.21 437 794
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.01 488 626
+ 10.01 488 626
# sparsity 9
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.75 234 980
+ 9.75 234 980
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 10.48 36 242
+ 10.48 36 242
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.21 844 1474
+ 10.21 844 1474
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1262 1795
+ 32.00 1262 1795
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.18 145 300
+ 12.18 145 300
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.18 302 340
+ 12.18 302 340
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.21 761 1096
+ 10.21 761 1096
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.08 63 786
+ 10.08 63 786
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.12 72 752
+ 10.12 72 752
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.08 63 783
+ 10.08 63 783
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.05 6 787
+ 10.05 6 787
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.09 8 798
+ 10.09 8 798
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.05 6 779
+ 10.05 6 779
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.80 264 930
+ 9.80 264 930
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 11.77 92 410
+ 11.77 92 410
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.23 438 789
+ 10.23 438 789
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.05 486 624
+ 11.05 486 624
# sparsity 10
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.73 235 979
+ 10.73 235 979
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 11.46 35 239
+ 11.46 35 239
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.18 840 1456
+ 11.18 840 1456
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1248 1746
+ 32.00 1248 1746
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.14 135 312
+ 13.14 135 312
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.14 309 354
+ 13.14 309 354
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.18 761 1097
+ 11.18 761 1097
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.09 59 802
+ 11.09 59 802
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.13 69 814
+ 11.13 69 814
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.09 59 771
+ 11.09 59 771
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.04 5 783
+ 11.04 5 783
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.08 8 816
+ 11.08 8 816
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.04 5 776
+ 11.04 5 776
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.78 265 934
+ 10.78 265 934
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 12.98 89 415
+ 12.98 89 415
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.20 436 787
+ 11.20 436 787
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.02 483 620
+ 12.02 483 620
# sparsity 11
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.79 232 950
+ 11.79 232 950
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 12.68 34 256
+ 12.68 34 256
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.23 842 1450
+ 12.23 842 1450
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1292 1826
+ 32.00 1292 1826
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.00 127 324
+ 14.00 127 324
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.00 308 369
+ 14.00 308 369
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.23 760 1092
+ 12.23 760 1092
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.35 56 795
+ 12.35 56 795
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.38 65 829
+ 12.38 65 829
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.35 57 822
+ 12.35 57 822
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.13 5 706
+ 12.13 5 706
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.17 7 750
+ 12.17 7 750
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.13 5 712
+ 12.13 5 712
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.83 261 919
+ 11.83 261 919
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 14.17 85 401
+ 14.17 85 401
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.25 436 781
+ 12.25 436 781
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.08 489 623
+ 13.08 489 623
# sparsity 12
# generating random data...
# generating random data... ok.
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.70 226 932
+ 12.70 226 932
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 13.76 34 261
+ 13.76 34 261
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.16 849 1453
+ 13.16 849 1453
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1208 1804
+ 32.00 1208 1804
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.84 117 307
+ 14.84 117 307
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.84 260 352
+ 14.84 260 352
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.16 762 1095
+ 13.16 762 1095
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.46 56 899
+ 13.46 56 899
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.48 63 915
+ 13.48 63 915
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.46 56 897
+ 13.46 56 897
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.20 5 681
+ 13.20 5 681
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.24 7 735
+ 13.24 7 735
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.20 5 699
+ 13.20 5 699
# IntegratedFastPFOR + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.75 260 914
+ 12.75 260 914
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 15.51 80 359
+ 15.51 80 359
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.18 435 781
+ 13.18 435 781
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.00 489 626
+ 14.00 489 626
Results were written into a CSV file: benchmark-20140918T011257.csv
diff --git a/benchmarkresults/benchmarkresults_icore7_10may2013.txt b/benchmarkresults/benchmarkresults_icore7_10may2013.txt
index 5b776fb..d10579e 100644
--- a/benchmarkresults/benchmarkresults_icore7_10may2013.txt
+++ b/benchmarkresults/benchmarkresults_icore7_10may2013.txt
@@ -3,610 +3,610 @@
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 3.34 51 262
+ 3.34 51 262
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.09 639 1183
+ 3.09 639 1183
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1151 1468
+ 32.00 1151 1468
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 199 299
+ 8.00 199 299
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 148 339
+ 8.00 148 339
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.09 613 920
+ 3.09 613 920
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.02 143 721
+ 3.02 143 721
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.05 164 705
+ 3.05 164 705
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.01 26 790
+ 3.01 26 790
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.03 30 816
+ 3.03 30 816
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.78 226 811
+ 2.78 226 811
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 2.97 236 604
+ 2.97 236 604
# sparsity 2
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 4.17 47 266
+ 4.17 47 266
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.92 672 1261
+ 3.92 672 1261
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1218 1562
+ 32.00 1218 1562
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 204 290
+ 8.00 204 290
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 236 343
+ 8.00 236 343
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.92 505 917
+ 3.92 505 917
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 127 698
+ 3.82 127 698
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.85 151 726
+ 3.85 151 726
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.81 18 752
+ 3.81 18 752
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.84 23 779
+ 3.84 23 779
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.56 228 828
+ 3.56 228 828
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 182 562
+ 3.82 182 562
# sparsity 3
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 4.96 43 276
+ 4.96 43 276
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.72 662 1187
+ 4.72 662 1187
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1215 1566
+ 32.00 1215 1566
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 198 286
+ 8.02 198 286
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 254 340
+ 8.02 254 340
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.72 576 848
+ 4.72 576 848
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.61 111 654
+ 4.61 111 654
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.64 129 699
+ 4.64 129 699
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.60 14 732
+ 4.60 14 732
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.63 18 761
+ 4.63 18 761
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.36 226 813
+ 4.36 226 813
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 4.76 148 511
+ 4.76 148 511
# sparsity 4
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 5.97 39 270
+ 5.97 39 270
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.72 656 1148
+ 5.72 656 1148
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1212 1555
+ 32.00 1212 1555
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.09 206 287
+ 8.09 206 287
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.09 213 334
+ 8.09 213 334
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.72 626 891
+ 5.72 626 891
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.60 105 672
+ 5.60 105 672
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.63 121 701
+ 5.63 121 701
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.58 10 667
+ 5.58 10 667
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.62 14 736
+ 5.62 14 736
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.33 226 805
+ 5.33 226 805
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 5.86 123 464
+ 5.86 123 464
# sparsity 5
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 6.49 39 262
+ 6.49 39 262
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.25 659 1121
+ 6.25 659 1121
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1154 1168
+ 32.00 1154 1168
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.44 192 265
+ 8.44 192 265
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.44 240 297
+ 8.44 240 297
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.25 631 907
+ 6.25 631 907
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.13 101 685
+ 6.13 101 685
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.16 116 714
+ 6.16 116 714
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.11 9 708
+ 6.11 9 708
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.14 13 741
+ 6.14 13 741
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.86 225 806
+ 5.86 225 806
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 6.44 120 442
+ 6.44 120 442
# sparsity 6
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 7.64 35 269
+ 7.64 35 269
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.39 654 1111
+ 7.39 654 1111
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1207 1553
+ 32.00 1207 1553
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.06 185 225
+ 9.06 185 225
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.06 166 248
+ 9.06 166 248
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.39 620 888
+ 7.39 620 888
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.26 91 679
+ 7.26 91 679
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.30 104 704
+ 7.30 104 704
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.24 7 704
+ 7.24 7 704
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.28 10 735
+ 7.28 10 735
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.00 221 792
+ 7.00 221 792
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 7.76 106 393
+ 7.76 106 393
# sparsity 7
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 8.66 33 266
+ 8.66 33 266
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.41 675 1165
+ 8.41 675 1165
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1210 1553
+ 32.00 1210 1553
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.10 154 194
+ 10.10 154 194
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.10 176 207
+ 10.10 176 207
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.41 628 896
+ 8.41 628 896
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.27 84 643
+ 8.27 84 643
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.31 95 685
+ 8.31 95 685
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.25 6 693
+ 8.25 6 693
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.29 9 723
+ 8.29 9 723
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 215 773
+ 8.00 215 773
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 9.10 94 357
+ 9.10 94 357
# sparsity 8
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 9.52 32 241
+ 9.52 32 241
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.26 692 1194
+ 9.26 692 1194
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1208 1525
+ 32.00 1208 1525
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.14 138 178
+ 11.14 138 178
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.14 187 190
+ 11.14 187 190
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.26 647 893
+ 9.26 647 893
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.14 79 655
+ 9.14 79 655
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.18 88 684
+ 9.18 88 684
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.11 6 680
+ 9.11 6 680
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.15 8 712
+ 9.15 8 712
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.86 220 778
+ 8.86 220 778
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 10.39 86 330
+ 10.39 86 330
# sparsity 9
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 10.46 31 253
+ 10.46 31 253
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.19 661 1122
+ 10.19 661 1122
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1210 1546
+ 32.00 1210 1546
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.27 126 173
+ 12.27 126 173
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.27 155 181
+ 12.27 155 181
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.19 617 886
+ 10.19 617 886
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.07 73 634
+ 10.07 73 634
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.11 82 669
+ 10.11 82 669
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.04 5 663
+ 10.04 5 663
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.08 7 700
+ 10.08 7 700
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.79 215 757
+ 9.79 215 757
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 11.79 78 325
+ 11.79 78 325
# sparsity 10
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 11.13 30 243
+ 11.13 30 243
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.83 628 1028
+ 10.83 628 1028
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1167 1498
+ 32.00 1167 1498
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.71 152 179
+ 12.71 152 179
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.71 151 187
+ 12.71 151 187
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.83 389 820
+ 10.83 389 820
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.76 72 638
+ 10.76 72 638
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.79 79 683
+ 10.79 79 683
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.69 5 655
+ 10.69 5 655
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.73 7 682
+ 10.73 7 682
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.42 219 767
+ 10.42 219 767
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 12.49 77 332
+ 12.49 77 332
# sparsity 11
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 12.38 29 254
+ 12.38 29 254
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.01 660 1112
+ 12.01 660 1112
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1142 1445
+ 32.00 1142 1445
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.87 143 172
+ 13.87 143 172
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.87 144 178
+ 13.87 144 178
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.01 582 830
+ 12.01 582 830
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.04 64 647
+ 12.04 64 647
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.08 69 649
+ 12.08 69 649
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.91 4 637
+ 11.91 4 637
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.95 7 660
+ 11.95 7 660
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.61 217 766
+ 11.61 217 766
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 13.96 73 313
+ 13.96 73 313
# sparsity 12
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 13.57 29 265
+ 13.57 29 265
# me.lemire.integercompression.IntegratedBinaryPacking+me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.01 611 1012
+ 13.01 611 1012
# me.lemire.integercompression.JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1215 1565
+ 32.00 1215 1565
# me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.73 121 160
+ 14.73 121 160
# me.lemire.integercompression.IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.73 131 166
+ 14.73 131 166
# me.lemire.integercompression.BinaryPacking+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.01 603 832
+ 13.01 603 832
# me.lemire.integercompression.NewPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.26 68 737
+ 13.26 68 737
# me.lemire.integercompression.NewPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.29 72 761
+ 13.29 72 761
# me.lemire.integercompression.OptPFD+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.06 4 591
+ 13.06 4 591
# me.lemire.integercompression.OptPFDS9+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.10 6 625
+ 13.10 6 625
# me.lemire.integercompression.FastPFOR+me.lemire.integercompression.VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.61 213 755
+ 12.61 213 755
# me.lemire.integercompression.Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 15.38 69 281
+ 15.38 69 281
diff --git a/benchmarkresults/benchmarkresults_icore7_12november2013.txt b/benchmarkresults/benchmarkresults_icore7_12november2013.txt
index 07b11b3..795650e 100644
--- a/benchmarkresults/benchmarkresults_icore7_12november2013.txt
+++ b/benchmarkresults/benchmarkresults_icore7_12november2013.txt
@@ -10,9 +10,9 @@ Its dependencies (if any) will NOT be available to the current build.
[INFO] [enforcer:enforce {execution: enforce-maven}]
[INFO] [exec:java {execution: default-cli}]
# benchmark based on the ClusterData model from:
-# Vo Ngoc Anh and Alistair Moffat.
-# Index compression using 64-bit words.
-# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
+# Vo Ngoc Anh and Alistair Moffat.
+# Index compression using 64-bit words.
+# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
# Results will be written into a CSV file: benchmark-20131112T105209.csv
@@ -21,852 +21,852 @@ Its dependencies (if any) will NOT be available to the current build.
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 3.28 48 218
+ 3.28 48 218
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 3.03 623 1205
+ 3.03 623 1205
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1154 1331
+ 32.00 1154 1331
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 508 554
+ 8.00 508 554
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 592 709
+ 8.00 592 709
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.03 596 900
+ 3.03 596 900
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.95 115 701
+ 2.95 115 701
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.98 135 726
+ 2.98 135 726
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.95 116 726
+ 2.95 116 726
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.94 19 761
+ 2.94 19 761
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.97 22 767
+ 2.97 22 767
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.94 19 765
+ 2.94 19 765
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 2.71 219 797
+ 2.71 219 797
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.71 217 813
+ 2.71 217 813
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 2.90 254 599
+ 2.90 254 599
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.92 375 669
+ 2.92 375 669
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.36 394 503
+ 3.36 394 503
# sparsity 2
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 4.09 47 254
+ 4.09 47 254
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 3.84 636 1160
+ 3.84 636 1160
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1152 1264
+ 32.00 1152 1264
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.01 510 551
+ 8.01 510 551
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.01 594 704
+ 8.01 594 704
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.84 602 875
+ 3.84 602 875
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.74 101 673
+ 3.74 101 673
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.77 117 695
+ 3.77 117 695
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.74 101 694
+ 3.74 101 694
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.73 15 725
+ 3.73 15 725
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.76 18 741
+ 3.76 18 741
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.73 15 731
+ 3.73 15 731
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 3.48 215 782
+ 3.48 215 782
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.48 212 789
+ 3.48 212 789
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 3.72 190 530
+ 3.72 190 530
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.76 375 657
+ 3.76 375 657
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.32 392 499
+ 4.32 392 499
# sparsity 3
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 5.03 42 250
+ 5.03 42 250
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 4.77 643 1141
+ 4.77 643 1141
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1149 1337
+ 32.00 1149 1337
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 506 547
+ 8.02 506 547
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 590 698
+ 8.02 590 698
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.77 619 904
+ 4.77 619 904
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.66 89 640
+ 4.66 89 640
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.69 103 672
+ 4.69 103 672
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.66 89 668
+ 4.66 89 668
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.64 12 700
+ 4.64 12 700
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.68 14 712
+ 4.68 14 712
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.64 12 704
+ 4.64 12 704
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 4.39 212 762
+ 4.39 212 762
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.39 209 763
+ 4.39 209 763
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 4.81 146 480
+ 4.81 146 480
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.75 373 646
+ 4.75 373 646
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.40 386 496
+ 5.40 386 496
# sparsity 4
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 5.77 39 245
+ 5.77 39 245
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 5.53 629 1095
+ 5.53 629 1095
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1148 1332
+ 32.00 1148 1332
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.11 482 522
+ 8.11 482 522
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.11 557 655
+ 8.11 557 655
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.53 617 889
+ 5.53 617 889
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.42 82 659
+ 5.42 82 659
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.45 94 684
+ 5.45 94 684
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.42 82 686
+ 5.42 82 686
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.40 10 695
+ 5.40 10 695
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.43 12 715
+ 5.43 12 715
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.40 10 705
+ 5.40 10 705
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 5.16 214 776
+ 5.16 214 776
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.16 211 780
+ 5.16 211 780
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 5.66 128 457
+ 5.66 128 457
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.53 370 645
+ 5.53 370 645
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.23 389 493
+ 6.23 389 493
# sparsity 5
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 6.57 37 248
+ 6.57 37 248
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 6.32 640 1113
+ 6.32 640 1113
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1150 1349
+ 32.00 1150 1349
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.41 416 456
+ 8.41 416 456
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.41 473 548
+ 8.41 473 548
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.32 622 898
+ 6.32 622 898
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.20 75 643
+ 6.20 75 643
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.23 86 668
+ 6.23 86 668
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.20 75 666
+ 6.20 75 666
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.18 8 690
+ 6.18 8 690
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.21 11 705
+ 6.21 11 705
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.18 8 697
+ 6.18 8 697
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 5.93 211 741
+ 5.93 211 741
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.93 208 772
+ 5.93 208 772
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 6.51 118 426
+ 6.51 118 426
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.32 374 639
+ 6.32 374 639
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.05 390 492
+ 7.05 390 492
# sparsity 6
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 7.73 35 242
+ 7.73 35 242
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 7.48 630 1071
+ 7.48 630 1071
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1143 1350
+ 32.00 1143 1350
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.04 328 365
+ 9.04 328 365
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.04 365 415
+ 9.04 365 415
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.48 620 882
+ 7.48 620 882
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.36 67 641
+ 7.36 67 641
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.39 76 668
+ 7.39 76 668
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.36 67 667
+ 7.36 67 667
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.33 7 679
+ 7.33 7 679
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.37 9 695
+ 7.37 9 695
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.33 7 686
+ 7.33 7 686
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 7.09 211 749
+ 7.09 211 749
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.09 208 764
+ 7.09 208 764
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 7.88 101 383
+ 7.88 101 383
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.49 372 630
+ 7.49 372 630
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.27 389 489
+ 8.27 389 489
# sparsity 7
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 8.46 33 244
+ 8.46 33 244
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 8.21 628 1052
+ 8.21 628 1052
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1148 1334
+ 32.00 1148 1334
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.01 257 290
+ 10.01 257 290
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.01 284 315
+ 10.01 284 315
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.21 612 859
+ 8.21 612 859
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.08 63 626
+ 8.08 63 626
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.11 71 665
+ 8.11 71 665
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.08 63 663
+ 8.08 63 663
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.06 6 675
+ 8.06 6 675
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.09 8 687
+ 8.09 8 687
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.06 6 682
+ 8.06 6 682
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 7.81 210 756
+ 7.81 210 756
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 7.81 207 759
+ 7.81 207 759
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 8.85 92 353
+ 8.85 92 353
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.22 369 622
+ 8.22 369 622
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.00 389 486
+ 9.00 389 486
# sparsity 8
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 9.41 32 234
+ 9.41 32 234
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 9.16 636 1062
+ 9.16 636 1062
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1151 1326
+ 32.00 1151 1326
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.08 231 269
+ 11.08 231 269
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.08 259 288
+ 11.08 259 288
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.16 616 873
+ 9.16 616 873
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.04 59 638
+ 9.04 59 638
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.07 65 664
+ 9.07 65 664
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.04 59 664
+ 9.04 59 664
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.01 5 665
+ 9.01 5 665
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.05 7 680
+ 9.05 7 680
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.01 5 671
+ 9.01 5 671
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 8.77 209 746
+ 8.77 209 746
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.77 207 738
+ 8.77 207 738
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 10.25 81 324
+ 10.25 81 324
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.18 372 625
+ 9.18 372 625
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.98 387 479
+ 9.98 387 479
# sparsity 9
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 10.41 31 238
+ 10.41 31 238
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 10.15 637 1070
+ 10.15 637 1070
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1145 1413
+ 32.00 1145 1413
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.15 221 267
+ 12.15 221 267
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.15 252 284
+ 12.15 252 284
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.15 609 849
+ 10.15 609 849
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.03 54 624
+ 10.03 54 624
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.06 60 650
+ 10.06 60 650
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.03 54 649
+ 10.03 54 649
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.00 5 653
+ 10.00 5 653
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.03 7 666
+ 10.03 7 666
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.00 5 659
+ 10.00 5 659
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 9.75 207 739
+ 9.75 207 739
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.75 206 743
+ 9.75 206 743
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 11.72 73 313
+ 11.72 73 313
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.17 369 611
+ 10.17 369 611
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.98 381 474
+ 10.98 381 474
# sparsity 10
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 11.57 29 236
+ 11.57 29 236
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 11.28 626 1033
+ 11.28 626 1033
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1141 1328
+ 32.00 1141 1328
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.18 219 276
+ 13.18 219 276
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.18 254 294
+ 13.18 254 294
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.28 610 848
+ 11.28 610 848
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.19 50 617
+ 11.19 50 617
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.23 56 638
+ 11.23 56 638
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.19 50 640
+ 11.19 50 640
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.13 4 640
+ 11.13 4 640
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.17 6 655
+ 11.17 6 655
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.13 4 647
+ 11.13 4 647
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 10.87 207 736
+ 10.87 207 736
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.87 204 734
+ 10.87 204 734
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 13.19 68 311
+ 13.19 68 311
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.30 369 612
+ 11.30 369 612
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.13 386 477
+ 12.13 386 477
# sparsity 11
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 12.41 29 229
+ 12.41 29 229
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 12.01 634 1046
+ 12.01 634 1046
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1148 1365
+ 32.00 1148 1365
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.84 208 261
+ 13.84 208 261
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.84 241 277
+ 13.84 241 277
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.01 605 832
+ 12.01 605 832
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.07 49 650
+ 12.07 49 650
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.10 54 674
+ 12.10 54 674
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.07 49 675
+ 12.07 49 675
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.93 4 604
+ 11.93 4 604
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.97 6 618
+ 11.97 6 618
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.93 4 611
+ 11.93 4 611
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 11.60 206 724
+ 11.60 206 724
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.60 203 724
+ 11.60 203 724
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 13.98 66 291
+ 13.98 66 291
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.04 367 603
+ 12.04 367 603
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.86 385 478
+ 12.86 385 478
# sparsity 12
# generating random data...
# generating random data... ok.
# kamikaze PForDelta
# bits per int, compress speed (mis), decompression speed (mis)
- 13.48 28 236
+ 13.48 28 236
# IntegratedBinaryPacking + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 12.96 634 1051
+ 12.96 634 1051
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1150 1307
+ 32.00 1150 1307
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.69 202 258
+ 14.69 202 258
# IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.69 235 272
+ 14.69 235 272
# BinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.96 610 849
+ 12.96 610 849
# NewPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.17 48 698
+ 13.17 48 698
# NewPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.20 52 714
+ 13.20 52 714
# NewPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.17 48 720
+ 13.17 48 720
# OptPFD + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.96 4 588
+ 12.96 4 588
# OptPFDS9 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.00 6 602
+ 13.00 6 602
# OptPFDS16 + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.96 4 597
+ 12.96 4 597
# IntegratedFastPFOR + IntegratedVariableByte (Integrated)
# bits per int, compress speed (mis), decompression speed (mis)
- 12.55 206 726
+ 12.55 206 726
# FastPFOR + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.55 203 725
+ 12.55 203 725
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 15.40 63 269
+ 15.40 63 269
# XorBinaryPacking + VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.99 368 613
+ 12.99 368 613
# DeltaZigzagBinaryPacking + DeltaZigzagVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.81 384 476
+ 13.81 384 476
Results were written into a CSV file: benchmark-20131112T105209.csv
diff --git a/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt b/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
index 4159637..7e35696 100644
--- a/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
+++ b/benchmarkresults/benchmarkresults_skippable_haswell_18sept2014.txt
@@ -1,7 +1,7 @@
# benchmark based on the ClusterData model from:
-# Vo Ngoc Anh and Alistair Moffat.
-# Index compression using 64-bit words.
-# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
+# Vo Ngoc Anh and Alistair Moffat.
+# Index compression using 64-bit words.
+# Softw. Pract. Exper.40, 2 (February 2010), 131-147.
# Results will be written into a CSV file: benchmark-20140918T011322.csv
@@ -10,504 +10,504 @@
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.04 840 1619
+ 3.04 840 1619
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1169 1698
+ 32.00 1169 1698
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 195 369
+ 8.00 195 369
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.04 651 1148
+ 3.04 651 1148
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.96 129 865
+ 2.96 129 865
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.98 158 877
+ 2.98 158 877
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.96 130 879
+ 2.96 130 879
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.95 25 930
+ 2.95 25 930
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.97 27 951
+ 2.97 27 951
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.95 25 935
+ 2.95 25 935
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 2.82 235 928
+ 2.82 235 928
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 2.93 255 740
+ 2.93 255 740
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 2.77 147 395
+ 2.77 147 395
# sparsity 2
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 831 1555
+ 3.82 831 1555
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1183 1800
+ 32.00 1183 1800
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.00 220 372
+ 8.00 220 372
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.82 659 1139
+ 3.82 659 1139
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.72 116 855
+ 3.72 116 855
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.75 136 851
+ 3.75 136 851
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.72 115 853
+ 3.72 115 853
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.71 19 895
+ 3.71 19 895
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.74 22 917
+ 3.74 22 917
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.71 19 900
+ 3.71 19 900
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 3.59 230 908
+ 3.59 230 908
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 3.74 195 654
+ 3.74 195 654
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 3.49 111 366
+ 3.49 111 366
# sparsity 3
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.78 817 1519
+ 4.78 817 1519
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1019 1759
+ 32.00 1019 1759
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.02 238 370
+ 8.02 238 370
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.78 680 1121
+ 4.78 680 1121
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.67 98 825
+ 4.67 98 825
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.70 123 840
+ 4.70 123 840
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.67 102 834
+ 4.67 102 834
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.66 15 861
+ 4.66 15 861
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.69 18 895
+ 4.69 18 895
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.66 14 871
+ 4.66 14 871
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 4.54 231 904
+ 4.54 231 904
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 4.84 153 589
+ 4.84 153 589
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 4.40 83 339
+ 4.40 83 339
# sparsity 4
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.65 788 1505
+ 5.65 788 1505
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1142 1757
+ 32.00 1142 1757
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.09 242 363
+ 8.09 242 363
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.65 636 1113
+ 5.65 636 1113
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.52 92 828
+ 5.52 92 828
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.56 112 826
+ 5.56 112 826
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.52 94 826
+ 5.52 94 826
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.51 12 854
+ 5.51 12 854
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.54 15 883
+ 5.54 15 883
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.51 12 858
+ 5.51 12 858
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 5.39 218 886
+ 5.39 218 886
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 5.80 136 566
+ 5.80 136 566
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 5.32 68 319
+ 5.32 68 319
# sparsity 5
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.31 804 1490
+ 6.31 804 1490
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1105 1860
+ 32.00 1105 1860
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.40 245 330
+ 8.40 245 330
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.31 673 1121
+ 6.31 673 1121
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.19 87 832
+ 6.19 87 832
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.22 107 844
+ 6.22 107 844
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.19 88 830
+ 6.19 88 830
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.17 10 851
+ 6.17 10 851
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.20 14 883
+ 6.20 14 883
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.17 10 852
+ 6.17 10 852
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.07 217 875
+ 6.07 217 875
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 6.51 130 513
+ 6.51 130 513
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 6.13 60 307
+ 6.13 60 307
# sparsity 6
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.99 742 1431
+ 6.99 742 1431
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1163 1660
+ 32.00 1163 1660
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.99 205 290
+ 8.99 205 290
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.99 637 1107
+ 6.99 637 1107
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.87 82 821
+ 6.87 82 821
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.90 100 830
+ 6.90 100 830
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.87 82 818
+ 6.87 82 818
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.85 9 834
+ 6.85 9 834
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.88 12 865
+ 6.88 12 865
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.85 9 836
+ 6.85 9 836
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 6.75 224 877
+ 6.75 224 877
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 7.33 118 485
+ 7.33 118 485
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 6.98 54 296
+ 6.98 54 296
# sparsity 7
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.31 770 1463
+ 8.31 770 1463
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1074 1832
+ 32.00 1074 1832
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.01 203 240
+ 10.01 203 240
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.31 664 1105
+ 8.31 664 1105
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.18 73 796
+ 8.18 73 796
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.22 88 808
+ 8.22 88 808
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.18 73 792
+ 8.18 73 792
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.16 7 819
+ 8.16 7 819
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.20 10 849
+ 8.20 10 849
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.16 7 810
+ 8.16 7 810
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.05 217 851
+ 8.05 217 851
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 9.01 103 430
+ 9.01 103 430
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 8.61 47 277
+ 8.61 47 277
# sparsity 8
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.88 800 1414
+ 8.88 800 1414
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1078 1718
+ 32.00 1078 1718
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.91 211 227
+ 10.91 211 227
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.88 671 1083
+ 8.88 671 1083
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.76 70 804
+ 8.76 70 804
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.80 84 814
+ 8.80 84 814
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.76 70 800
+ 8.76 70 800
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.73 7 807
+ 8.73 7 807
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.77 9 792
+ 8.77 9 792
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.73 7 801
+ 8.73 7 801
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 8.64 211 837
+ 8.64 211 837
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 9.94 96 417
+ 9.94 96 417
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 9.51 44 268
+ 9.51 44 268
# sparsity 9
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.19 834 1442
+ 10.19 834 1442
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1200 1632
+ 32.00 1200 1632
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.16 206 212
+ 12.16 206 212
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.19 675 1092
+ 10.19 675 1092
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.07 64 804
+ 10.07 64 804
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.10 76 814
+ 10.10 76 814
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.07 63 802
+ 10.07 63 802
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.04 6 810
+ 10.04 6 810
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.08 9 841
+ 10.08 9 841
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.04 6 808
+ 10.04 6 808
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 9.94 222 858
+ 9.94 222 858
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 11.79 88 397
+ 11.79 88 397
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 11.40 38 253
+ 11.40 38 253
# sparsity 10
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.10 814 1406
+ 11.10 814 1406
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1215 1820
+ 32.00 1215 1820
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.07 207 208
+ 13.07 207 208
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.10 681 1073
+ 11.10 681 1073
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.00 60 800
+ 11.00 60 800
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.04 72 809
+ 11.04 72 809
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 11.00 60 796
+ 11.00 60 796
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.95 6 785
+ 10.95 6 785
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.99 8 815
+ 10.99 8 815
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.95 6 782
+ 10.95 6 782
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 10.84 226 860
+ 10.84 226 860
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 12.93 84 389
+ 12.93 84 389
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 12.58 36 245
+ 12.58 36 245
# sparsity 11
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.31 814 1392
+ 12.31 814 1392
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1171 1846
+ 32.00 1171 1846
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.14 172 201
+ 14.14 172 201
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.31 668 1071
+ 12.31 668 1071
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.42 56 825
+ 12.42 56 825
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.45 67 832
+ 12.45 67 832
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.42 56 821
+ 12.42 56 821
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.22 5 729
+ 12.22 5 729
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.27 8 758
+ 12.27 8 758
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.22 5 731
+ 12.22 5 731
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.07 222 836
+ 12.07 222 836
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 14.31 81 377
+ 14.31 81 377
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 14.05 33 238
+ 14.05 33 238
# sparsity 12
# generating random data...
# generating random data... ok.
# IntegratedBinaryPacking + IntegratedVariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.97 805 1375
+ 12.97 805 1375
# JustCopy
# bits per int, compress speed (mis), decompression speed (mis)
- 32.00 1160 1737
+ 32.00 1160 1737
# VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 14.72 186 193
+ 14.72 186 193
# BinaryPacking+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.97 656 1037
+ 12.97 656 1037
# NewPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.22 56 886
+ 13.22 56 886
# NewPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.24 67 891
+ 13.24 67 891
# NewPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.22 56 883
+ 13.22 56 883
# OptPFD+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.98 5 704
+ 12.98 5 704
# OptPFDS9+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 13.02 8 740
+ 13.02 8 740
# OptPFDS16+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.98 5 704
+ 12.98 5 704
# FastPFOR+VariableByte
# bits per int, compress speed (mis), decompression speed (mis)
- 12.73 223 845
+ 12.73 223 845
# Simple9
# bits per int, compress speed (mis), decompression speed (mis)
- 15.35 78 347
+ 15.35 78 347
# Simple16
# bits per int, compress speed (mis), decompression speed (mis)
- 15.15 31 225
+ 15.15 31 225
Results were written into a CSV file: benchmark-20140918T011322.csv
diff --git a/build.xml b/build.xml
index 974a14c..d02cddd 100644
--- a/build.xml
+++ b/build.xml
@@ -8,6 +8,16 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/example.java b/example.java
index b8f039d..75dfb05 100644
--- a/example.java
+++ b/example.java
@@ -4,19 +4,33 @@
public class example {
public static void main(String[] args) {
+ superSimpleExample();
unsortedExample();
basicExample();
+ basicExampleHeadless();
advancedExample();
headlessDemo();
}
+ public static void superSimpleExample() {
+ IntegratedIntCompressor iic = new IntegratedIntCompressor();
+ int[] data = new int[2342351];
+ for(int k = 0; k < data.length; ++k)
+ data[k] = k;
+ System.out.println("Compressing "+data.length+" integers using friendly interface");
+ int[] compressed = iic.compress(data);
+ int[] recov = iic.uncompress(compressed);
+ System.out.println("compressed from "+data.length*4/1024+"KB to "+compressed.length*4/1024+"KB");
+ if(!Arrays.equals(recov,data)) throw new RuntimeException("bug");
+ }
+
public static void basicExample() {
int[] data = new int[2342351];
System.out.println("Compressing "+data.length+" integers in one go");
// data should be sorted for best
//results
for(int k = 0; k < data.length; ++k)
- data[k] = k;
+ data[k] = k;
// Very important: the data is in sorted order!!! If not, you
// will get very poor compression with IntegratedBinaryPacking,
// you should use another CODEC.
@@ -25,9 +39,9 @@ public static void basicExample() {
// will be done with binary packing, and leftovers will
// be processed using variable byte
IntegratedIntegerCODEC codec = new
- IntegratedComposition(
- new IntegratedBinaryPacking(),
- new IntegratedVariableByte());
+ IntegratedComposition(
+ new IntegratedBinaryPacking(),
+ new IntegratedVariableByte());
// output vector should be large enough...
int [] compressed = new int[data.length+1024];
// compressed might not be large enough in some cases
@@ -53,17 +67,80 @@ public static void basicExample() {
*
* now uncompressing
*
+ * This assumes that we otherwise know how many integers
+ * have been compressed, or we can bound it (e.g., you know that you
+ * will never need to decode more than 2000 integers).
+ * See basicExampleHeadless for a
+ * more general case where you can manually manage the compressed
+ * array size.
*/
int[] recovered = new int[data.length];
IntWrapper recoffset = new IntWrapper(0);
codec.uncompress(compressed,new IntWrapper(0),compressed.length,recovered,recoffset);
if(Arrays.equals(data,recovered))
- System.out.println("data is recovered without loss");
+ System.out.println("data is recovered without loss");
else
- throw new RuntimeException("bug"); // could use assert
+ throw new RuntimeException("bug"); // could use assert
System.out.println();
}
+
+ /**
+ * Like the basicExample, but we store the input array size manually.
+ */
+ public static void basicExampleHeadless() {
+ int[] data = new int[2342351];
+ System.out.println("Compressing " + data.length + " integers in one go using the headless approach");
+ // data should be sorted for best
+ // results
+ for (int k = 0; k < data.length; ++k)
+ data[k] = k;
+ // Very important: the data is in sorted order!!! If not, you
+ // will get very poor compression with IntegratedBinaryPacking,
+ // you should use another CODEC.
+
+ // next we compose a CODEC. Most of the processing
+ // will be done with binary packing, and leftovers will
+ // be processed using variable byte
+ SkippableIntegratedComposition codec = new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
+ new IntegratedVariableByte());
+ int[] compressed = new int[codec.maxHeadlessCompressedLength(new IntWrapper(0), data.length)];
+
+ /**
+ *
+ * compressing
+ *
+ */
+ IntWrapper inputoffset = new IntWrapper(0);
+ IntWrapper outputoffset = new IntWrapper(1);
+ compressed[0] = data.length; // we manually store how many integers
+ codec.headlessCompress(data, inputoffset, data.length, compressed, outputoffset, new IntWrapper(0));
+ // got it!
+ // inputoffset should be at data.length but outputoffset tells
+ // us where we are...
+ System.out.println(
+ "compressed from " + data.length * 4 / 1024 + "KB to " + outputoffset.intValue() * 4 / 1024 + "KB");
+ // we can repack the data: (optional)
+ compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+ /**
+ *
+ * now uncompressing
+ *
+ */
+ int howmany = compressed[0];// we manually stored the number of
+ // compressed integers
+ int[] recovered = new int[howmany];
+ IntWrapper recoffset = new IntWrapper(0);
+ codec.headlessUncompress(compressed, new IntWrapper(1), compressed.length, recovered, recoffset, howmany, new IntWrapper(0));
+ if (Arrays.equals(data, recovered))
+ System.out.println("data is recovered without loss");
+ else
+ throw new RuntimeException("bug"); // could use assert
+ System.out.println();
+ }
+
+
/**
* This is an example to show you can compress unsorted integers
* as long as most are small.
@@ -78,9 +155,9 @@ public static void unsortedExample() {
for(int k = 0; k < N; k+=533) data[k] = 10000;
int[] compressed = new int [N+1024];// could need more
IntegerCODEC codec = new
- Composition(
- new FastPFOR(),
- new VariableByte());
+ Composition(
+ new FastPFOR(),
+ new VariableByte());
// compressing
IntWrapper inputoffset = new IntWrapper(0);
IntWrapper outputoffset = new IntWrapper(0);
@@ -93,9 +170,9 @@ public static void unsortedExample() {
IntWrapper recoffset = new IntWrapper(0);
codec.uncompress(compressed,new IntWrapper(0),compressed.length,recovered,recoffset);
if(Arrays.equals(data,recovered))
- System.out.println("data is recovered without loss");
+ System.out.println("data is recovered without loss");
else
- throw new RuntimeException("bug"); // could use assert
+ throw new RuntimeException("bug"); // could use assert
System.out.println();
}
@@ -115,16 +192,16 @@ public static void advancedExample() {
// data should be sorted for best
//results
for(int k = 0; k < data.length; ++k)
- data[k] = k;
+ data[k] = k;
// next we compose a CODEC. Most of the processing
// will be done with binary packing, and leftovers will
// be processed using variable byte, using variable byte
// only for the last chunk!
IntegratedIntegerCODEC regularcodec = new
- IntegratedBinaryPacking();
+ IntegratedBinaryPacking();
IntegratedVariableByte ivb = new IntegratedVariableByte();
IntegratedIntegerCODEC lastcodec = new
- IntegratedComposition(regularcodec,ivb);
+ IntegratedComposition(regularcodec,ivb);
// output vector should be large enough...
int [] compressed = new int[TotalSize+1024];
@@ -138,7 +215,7 @@ public static void advancedExample() {
IntWrapper outputoffset = new IntWrapper(0);
for(int k = 0; k < TotalSize / ChunkSize; ++k)
regularcodec.compress(data,inputoffset,ChunkSize,compressed,outputoffset);
- lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset);
+ lastcodec.compress(data, inputoffset, TotalSize % ChunkSize, compressed, outputoffset);
// got it!
// inputoffset should be at data.length but outputoffset tells
// us where we are...
@@ -178,46 +255,48 @@ public static void advancedExample() {
}
- /*
- * Demo of the headless approach where we must supply the array length
- */
- public static void headlessDemo() {
- System.out.println("Compressing arrays with minimal header...");
- int[] uncompressed1 = {1,2,1,3,1};
- int[] uncompressed2 = {3,2,4,6,1};
+ /*
+ * Demo of the headless approach where we must supply the array length
+ */
+ public static void headlessDemo() {
+ System.out.println("Compressing arrays with minimal header...");
+ int[] uncompressed1 = {1,2,1,3,1};
+ int[] uncompressed2 = {3,2,4,6,1};
- int[] compressed = new int[uncompressed1.length+uncompressed2.length+1024];
+ SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
- SkippableIntegerCODEC codec = new SkippableComposition(new BinaryPacking(), new VariableByte());
+ int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed1.length)
+ + codec.maxHeadlessCompressedLength(new IntWrapper(0), uncompressed2.length);
+ int[] compressed = new int[maxCompressedLength];
- // compressing
- IntWrapper outPos = new IntWrapper();
+ // compressing
+ IntWrapper outPos = new IntWrapper();
- IntWrapper previous = new IntWrapper();
+ IntWrapper previous = new IntWrapper();
- codec.headlessCompress(uncompressed1,new IntWrapper(),uncompressed1.length,compressed,outPos);
- int length1 = outPos.get() - previous.get();
- previous = new IntWrapper(outPos.get());
- codec.headlessCompress(uncompressed2,new IntWrapper(),uncompressed2.length,compressed,outPos);
- int length2 = outPos.get() - previous.get();
+ codec.headlessCompress(uncompressed1,new IntWrapper(),uncompressed1.length,compressed,outPos);
+ int length1 = outPos.get() - previous.get();
+ previous = new IntWrapper(outPos.get());
+ codec.headlessCompress(uncompressed2,new IntWrapper(),uncompressed2.length,compressed,outPos);
+ int length2 = outPos.get() - previous.get();
- compressed = Arrays.copyOf(compressed,length1 + length2);
- System.out.println("compressed unsorted integers from "+uncompressed1.length*4+"B to "+length1*4+"B");
- System.out.println("compressed unsorted integers from "+uncompressed2.length*4+"B to "+length2*4+"B");
- System.out.println("Total compressed output "+compressed.length);
+ compressed = Arrays.copyOf(compressed,length1 + length2);
+ System.out.println("compressed unsorted integers from "+uncompressed1.length*4+"B to "+length1*4+"B");
+ System.out.println("compressed unsorted integers from "+uncompressed2.length*4+"B to "+length2*4+"B");
+ System.out.println("Total compressed output "+compressed.length);
- int[] recovered1 = new int[uncompressed1.length];
- int[] recovered2 = new int[uncompressed1.length];
- IntWrapper inPos = new IntWrapper();
- System.out.println("Decoding first array starting at pos = "+inPos);
- codec.headlessUncompress(compressed,inPos, compressed.length, recovered1, new IntWrapper(0), uncompressed1.length);
- System.out.println("Decoding second array starting at pos = "+inPos);
- codec.headlessUncompress(compressed,inPos, compressed.length, recovered2, new IntWrapper(0), uncompressed2.length);
- if(!Arrays.equals(uncompressed1,recovered1)) throw new RuntimeException("First array does not match.");
- if(!Arrays.equals(uncompressed2,recovered2)) throw new RuntimeException("Second array does not match.");
- System.out.println("The arrays match, your code is probably ok.");
+ int[] recovered1 = new int[uncompressed1.length];
+ int[] recovered2 = new int[uncompressed1.length];
+ IntWrapper inPos = new IntWrapper();
+ System.out.println("Decoding first array starting at pos = "+inPos);
+ codec.headlessUncompress(compressed,inPos, compressed.length, recovered1, new IntWrapper(0), uncompressed1.length);
+ System.out.println("Decoding second array starting at pos = "+inPos);
+ codec.headlessUncompress(compressed,inPos, compressed.length, recovered2, new IntWrapper(0), uncompressed2.length);
+ if(!Arrays.equals(uncompressed1,recovered1)) throw new RuntimeException("First array does not match.");
+ if(!Arrays.equals(uncompressed2,recovered2)) throw new RuntimeException("Second array does not match.");
+ System.out.println("The arrays match, your code is probably ok.");
- }
+ }
}
diff --git a/examples/Axelbrooke/CompressBitmap.java b/examples/Axelbrooke/CompressBitmap.java
new file mode 100644
index 0000000..c641c65
--- /dev/null
+++ b/examples/Axelbrooke/CompressBitmap.java
@@ -0,0 +1,97 @@
+import java.io.*;
+import java.nio.file.*;
+import java.util.*;
+import java.util.zip.*;
+
+import me.lemire.integercompression.differential.*;
+import me.lemire.integercompression.*;
+
+
+/**
+ * Example: loads a bitmap file, compresses the positions of its set bits with
+ * an IntegratedIntCompressor, verifies the round trip, and compares the result
+ * with plain zip compression of the raw file.
+ */
+public class CompressBitmap {
+
+    /** Compressor used by the example: IntegratedBinaryPacking composed with IntegratedVariableByte. */
+    static IntegratedIntCompressor iic = new IntegratedIntCompressor(
+            new SkippableIntegratedComposition(
+                    new IntegratedBinaryPacking(),
+                    new IntegratedVariableByte()));
+
+    /**
+     * Compresses the bitmap file named by args[0], checks that decompression
+     * gives back the same integers, and prints size, speed and zip statistics.
+     *
+     * @param args args[0] is the path of a bitmap binary file
+     * @throws IOException if the file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length == 0) {
+            System.out.println("usage: please provide the file name of a bitmap binary file.");
+            return;
+        }
+        System.out.println("loading file "+args[0]+" as a bitmap");
+        int[] data = fromBitsetFileToArray(args[0]);
+        System.out.println("Compressing "+data.length+" integers");
+        int[] compressed = iic.compress(data);
+        int[] recov = iic.uncompress(compressed);
+        // widen to long before multiplying: length * 4 overflows int beyond 2^29 integers
+        System.out.println("compressed from "+data.length*4L/1024+"KB to "+compressed.length*4L/1024+"KB");
+        System.out.println("ratio: "+Math.round(data.length*1.0/compressed.length));
+
+        if (!Arrays.equals(recov, data)) throw new RuntimeException("bug");
+
+        long bef = System.nanoTime();
+        recov = iic.uncompress(compressed);
+        long aft = System.nanoTime();
+        System.out.println("decoding speed:"+Math.round(data.length*1000.0/(aft-bef))+" millions of integers per second");
+
+        bef = System.nanoTime();
+        compressed = iic.compress(data);
+        aft = System.nanoTime();
+        System.out.println("encoding speed:"+Math.round(data.length*1000.0/(aft-bef))+" millions of integers per second");
+
+        System.out.println("note: with a bit of effort, speed can be much higher.");
+
+        System.out.println();
+        zipStats(args[0]);
+    }
+
+    /**
+     * Reads a file as a bitset (bit b of byte k, counted from the least
+     * significant bit, is position 8*k+b) and returns the positions of the
+     * set bits in increasing order.
+     *
+     * @param filename path of the bitmap file
+     * @return positions of the bits set to 1, in increasing order
+     * @throws IOException if the file cannot be read
+     */
+    public static int[] fromBitsetFileToArray(String filename) throws IOException {
+        byte[] data = Files.readAllBytes(Paths.get(filename));
+        // first pass: count the set bits so the output array can be sized exactly
+        int card = 0;
+        for (byte octet : data) {
+            card += Integer.bitCount(octet & 0xFF);
+        }
+        int[] answer = new int[card];
+        int pos = 0;
+        for (int k = 0; k < data.length; ++k) {
+            int bv = data[k] & 0xFF;
+            for (int b = 0; b < 8; ++b) {
+                if (((bv >> b) & 1) == 1) {
+                    answer[pos++] = b + k * 8;
+                }
+            }
+        }
+        if (pos != card) throw new RuntimeException("bug");
+        return answer;
+    }
+
+    /**
+     * Compresses the raw file with zip at level 9 and prints the encoding
+     * speed and the compression ratio, for comparison with the integer codec.
+     *
+     * @param filename path of the bitmap file
+     * @throws IOException if the file cannot be read or compressed
+     */
+    public static void zipStats(String filename) throws IOException {
+        byte[] input = Files.readAllBytes(Paths.get(filename));
+        System.out.println("I will try to compress the original bitmap using zip.");
+
+        long bef = System.nanoTime();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        // try-with-resources closes the zip stream even when a write fails
+        try (ZipOutputStream zos = new ZipOutputStream(baos)) {
+            zos.setLevel(9);
+            ZipEntry entry = new ZipEntry(filename);
+            entry.setSize(input.length);
+            zos.putNextEntry(entry);
+            zos.write(input);
+            zos.closeEntry();
+        }
+        // the stream is closed at this point, so the archive in baos is complete
+        byte[] result = baos.toByteArray();
+        long aft = System.nanoTime();
+        System.out.println("zip encoding speed:"+input.length*1000.0/(aft-bef)+" million of bytes per second");
+        System.out.println("zip compression ratio at best level : "+input.length * 1.0 / result.length);
+    }
+}
\ No newline at end of file
diff --git a/examples/Axelbrooke/README.md b/examples/Axelbrooke/README.md
new file mode 100644
index 0000000..fad997e
--- /dev/null
+++ b/examples/Axelbrooke/README.md
@@ -0,0 +1,25 @@
+# Bitset compression example
+
+A bitset can be considered as an array of integers.
+Suppose you want to compress it quickly (at a rate of
+ millions of integers per second or better).
+
+ The ``CompressBitmap.java`` file shows how it can be done.
+
+## Usage
+
+```bash
+./run.sh
+
+loading file example_bitmap.bin as a bitmap
+Compressing 1784073 integers
+compressed from 6969KB to 1348KB
+ratio: 5
+decoding speed:199 millions of integers per second
+encoding speed:124 millions of integers per second
+note: with a bit of effort, speed can be much higher.
+
+I will try to compress the original bitmap using zip.
+zip encoding speed:0.4458317816544745 million of bytes per second
+zip compression ratio at best level : 2.255323798747234
+```
diff --git a/examples/Axelbrooke/example_bitmap.bin b/examples/Axelbrooke/example_bitmap.bin
new file mode 100644
index 0000000..7a00138
Binary files /dev/null and b/examples/Axelbrooke/example_bitmap.bin differ
diff --git a/examples/Axelbrooke/run.sh b/examples/Axelbrooke/run.sh
new file mode 100755
index 0000000..59bcd94
--- /dev/null
+++ b/examples/Axelbrooke/run.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+echo "please be patient as I build the library and the example"
+ mvn -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -f ../../pom.xml package > /dev/null && javac -cp "../../target/*" CompressBitmap.java && java -cp "../../target/*":. CompressBitmap example_bitmap.bin
diff --git a/examples/vector/Example.java b/examples/vector/Example.java
new file mode 100644
index 0000000..e8d2455
--- /dev/null
+++ b/examples/vector/Example.java
@@ -0,0 +1,67 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+import java.util.Arrays;
+import me.lemire.integercompression.FastPFOR;
+import me.lemire.integercompression.IntWrapper;
+import me.lemire.integercompression.Composition;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.VariableByte;
+import me.lemire.integercompression.vector.VectorFastPFOR;
+
+/**
+ * Round-trip example: compresses a synthetic array with VectorFastPFOR or
+ * FastPFOR (each composed with VariableByte), decompresses it and checks that
+ * the original data is recovered.
+ */
+public class Example {
+    /**
+     * Runs the round trip with the compressor selected on the command line.
+     *
+     * @param args args[0] is an integer: 0 selects VectorFastPFOR, any other
+     *             value selects FastPFOR
+     * @throws IllegalArgumentException if no argument is given
+     */
+    public static void main(String[] args) {
+        if (args.length == 0)
+            throw new IllegalArgumentException(
+                    "usage: Example N (N = 0 for VectorFastPFOR, non-zero for FastPFOR)");
+
+        // pass 0 for Vector compressor , non-zero for default compressor
+        int compressorToUse = Integer.parseInt(args[0]);
+
+        final int N = 1310720;
+        int[] data = new int[N];
+
+        // 2-bit data
+        Arrays.fill(data, 3);
+
+        // a few large values
+        for (int k = 0; k < N; k += 5)
+            data[k] = 100;
+        for (int k = 0; k < N; k += 533)
+            data[k] = 10000;
+
+        // output buffer: input size plus some slack
+        int[] compressed = new int[N + 1024];
+
+        IntegerCODEC codec = new Composition(
+                compressorToUse == 0 ? new VectorFastPFOR() : new FastPFOR(),
+                new VariableByte());
+
+        IntWrapper inputoffset = new IntWrapper(0);
+        IntWrapper outputoffset = new IntWrapper(0);
+
+        codec.compress(data, inputoffset, data.length, compressed, outputoffset);
+
+        System.out.println("compressed unsorted integers from " +
+                data.length * 4 / 1024 + "KB to " +
+                outputoffset.intValue() * 4 / 1024 + "KB");
+
+        // keep only the words that were actually written
+        compressed = Arrays.copyOf(compressed, outputoffset.intValue());
+
+        int[] recovered = new int[N];
+        IntWrapper recoffset = new IntWrapper(0);
+
+        codec.uncompress(compressed, new IntWrapper(0), compressed.length,
+                recovered, recoffset);
+
+        System.out.println("compressed length = " + compressed.length +
+                ", uncompressed length = " + recoffset.intValue());
+
+        if (Arrays.equals(data, recovered))
+            System.out.println("data is recovered without loss");
+        else
+            throw new RuntimeException("bug"); // could use assert
+
+        System.out.println();
+    }
+}
diff --git a/examples/vector/README.md b/examples/vector/README.md
new file mode 100644
index 0000000..cbcbfeb
--- /dev/null
+++ b/examples/vector/README.md
@@ -0,0 +1,12 @@
+Compile
+-------
+```
+javac -cp /path/to/JavaFastPFOR.jar Example.java
+```
+
+Run
+---
+```
+java --add-modules jdk.incubator.vector -cp /path/to/JavaFastPFOR.jar:. Example 0
+```
+
diff --git a/jitpack.yml b/jitpack.yml
new file mode 100644
index 0000000..255e0f4
--- /dev/null
+++ b/jitpack.yml
@@ -0,0 +1,5 @@
+jdk:
+ - openjdk21
+before_install:
+ - sdk install java 21-open
+ - sdk use java 21-open
diff --git a/pom.xml b/pom.xml
index c985d64..33db8e6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,12 +2,14 @@
4.0.0me.lemire.integercompressionJavaFastPFOR
- 0.1.4-SNAPSHOT
+ 0.3.11-SNAPSHOTjar
- 1.5
- 1.5
+ 21
+ 21
+ 21UTF-8
+ 0.8.5
@@ -18,18 +20,31 @@
- scm:git:git@github.com:lemire/JavaFastPFOR.git
- scm:git:git@github.com:lemire/JavaFastPFOR.git
- scm:git:git@github.com:lemire/JavaFastPFOR.git
+ scm:git:https://github.com/fast-pack/JavaFastPFOR.git
+ scm:git:https://github.com/fast-pack/JavaFastPFOR.git
+ scm:git:https://github.com/fast-pack/JavaFastPFOR.git
+ HEAD
+
+
+
+ sonatype-central-portal
+ Sonatype Central Portal
+ https://central.sonatype.com/repository/maven-snapshots/
+
+
+ sonatype-central-portal
+ Sonatype Central Portal
+ https://repo.maven.apache.org/maven2/
+
+
+
lemireDaniel Lemire
- lemire@gmail.com
+ daniel@lemire.mehttp://lemire.me/en/
- LICEF Research Center
- http://licef.caarchitectdeveloper
@@ -45,21 +60,64 @@
junitjunit
- 4.10
+ 4.13.1
+ test
+
+
+ org.roaringbitmap
+ RoaringBitmap
+ 0.9.35testGitHub Issue Tracking
- https://github.com/lemire/JavaFastPFOR/issues
+ https://github.com/fast-pack/JavaFastPFOR/issues
-
- org.sonatype.oss
- oss-parent
- 5
-
+
+
+ eu.maveniverse.maven.njord
+ extension3
+ ${njord.version}
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.12.1
+
+ 21
+ 21
+
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ me/lemire/integercompression/vector/*
+ module-info.java
+
+
+
+
+
+
+ org.apache.felixmaven-bundle-plugin
@@ -78,57 +136,131 @@
1.1me.lemire.integercompression.benchmarktools.Benchmark
-
org.apache.maven.plugins
- maven-gpg-plugin
- 1.4
+ maven-javadoc-plugin
+ 3.6.3
+
+ me.lemire.integercompression.vector;com.kamikaze.pfordelta:me.lemire.integercompression.benchmarktools
+
- sign-artifacts
- verify
+ attach-javadocs
- sign
+ jarorg.apache.maven.plugins
- maven-javadoc-plugin
- 2.8
+ maven-source-plugin
+ 2.1.2
- attach-javadocs
+ attach-sourcesjar
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.13
+
+
+ me/lemire/integercompression/Kamikaze
+ com/kamikaze/pfordelta/*
+ me/lemire/integercompression/benchmarktools/*
+
+
+
+
+ prepare-agent
+
+ prepare-agent
+
+
+
+ org.apache.maven.plugins
- maven-source-plugin
- 2.1.2
+ maven-release-plugin
+ 3.0.1
+
+ deploy
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.8
- attach-sources
+ sign-artifacts
+ verify
- jar
+ sign
+
+
+
+
+ eu.maveniverse.maven.plugins
+ njord
+ ${njord.version}
+
+
+ maven-clean-plugin
+ 2.5
+
+
+ maven-deploy-plugin
+ 2.8.1
+
+
+ maven-install-plugin
+ 2.5.1
+
+
+ maven-jar-plugin
+ 2.4
+
+
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ maven-resources-plugin
+ 2.6
+
+
+ maven-site-plugin
+ 3.3
+
+
+ maven-source-plugin
+ 2.2.1
+
+
+ maven-surefire-plugin
+ 2.17
+
+
+ JavaFastPFOR
- https://github.com/lemire/JavaFastPFOR/
+ https://github.com/fast-pack/JavaFastPFOR/
-It is a library to compress and uncompress arrays of integers
-very fast. The assumption is that most (but not all) values in
-your array use less than 32 bits.
+A library to compress and uncompress arrays of integers
+very quickly.
diff --git a/src/main/java/me/lemire/integercompression/BinaryPacking.java b/src/main/java/me/lemire/integercompression/BinaryPacking.java
index 94b4534..ce37ff0 100644
--- a/src/main/java/me/lemire/integercompression/BinaryPacking.java
+++ b/src/main/java/me/lemire/integercompression/BinaryPacking.java
@@ -37,8 +37,9 @@
* @author Daniel Lemire
*/
public final class BinaryPacking implements IntegerCODEC, SkippableIntegerCODEC {
- final static int BLOCK_SIZE = 32;
-
+ public final static int BLOCK_SIZE = 32;
+ private static final int MAX_BIT_WIDTH = Integer.SIZE;
+
@Override
public void compress(int[] in, IntWrapper inpos, int inlength,
int[] out, IntWrapper outpos) {
@@ -57,27 +58,27 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
int tmpoutpos = outpos.get();
int s = inpos.get();
for (; s + BLOCK_SIZE * 4 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 4) {
- final int mbits1 = Util.maxbits(in, s, 32);
- final int mbits2 = Util.maxbits(in, s + 32, 32);
- final int mbits3 = Util.maxbits(in, s + 2 * 32, 32);
- final int mbits4 = Util.maxbits(in, s + 3 * 32, 32);
+ final int mbits1 = Util.maxbits(in, s, BLOCK_SIZE);
+ final int mbits2 = Util.maxbits(in, s + BLOCK_SIZE, BLOCK_SIZE);
+ final int mbits3 = Util.maxbits(in, s + 2 * BLOCK_SIZE, BLOCK_SIZE);
+ final int mbits4 = Util.maxbits(in, s + 3 * BLOCK_SIZE, BLOCK_SIZE);
out[tmpoutpos++] = (mbits1 << 24) | (mbits2 << 16)
| (mbits3 << 8) | (mbits4);
BitPacking.fastpackwithoutmask(in, s, out, tmpoutpos,
mbits1);
tmpoutpos += mbits1;
- BitPacking.fastpackwithoutmask(in, s + 32, out,
+ BitPacking.fastpackwithoutmask(in, s + BLOCK_SIZE, out,
tmpoutpos, mbits2);
tmpoutpos += mbits2;
- BitPacking.fastpackwithoutmask(in, s + 2 * 32, out,
+ BitPacking.fastpackwithoutmask(in, s + 2 * BLOCK_SIZE, out,
tmpoutpos, mbits3);
tmpoutpos += mbits3;
- BitPacking.fastpackwithoutmask(in, s + 3 * 32, out,
+ BitPacking.fastpackwithoutmask(in, s + 3 * BLOCK_SIZE, out,
tmpoutpos, mbits4);
tmpoutpos += mbits4;
}
for (; s < inpos.get() + inlength; s += BLOCK_SIZE ) {
- final int mbits = Util.maxbits(in, s, 32);
+ final int mbits = Util.maxbits(in, s, BLOCK_SIZE);
out[tmpoutpos++] = mbits;
BitPacking.fastpackwithoutmask(in, s, out, tmpoutpos,
mbits);
@@ -113,12 +114,12 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
BitPacking.fastunpack(in, tmpinpos, out, s, mbits1);
tmpinpos += mbits1;
BitPacking
- .fastunpack(in, tmpinpos, out, s + 32, mbits2);
+ .fastunpack(in, tmpinpos, out, s + BLOCK_SIZE, mbits2);
tmpinpos += mbits2;
- BitPacking.fastunpack(in, tmpinpos, out, s + 2 * 32,
+ BitPacking.fastunpack(in, tmpinpos, out, s + 2 * BLOCK_SIZE,
mbits3);
tmpinpos += mbits3;
- BitPacking.fastunpack(in, tmpinpos, out, s + 3 * 32,
+ BitPacking.fastunpack(in, tmpinpos, out, s + 3 * BLOCK_SIZE,
mbits4);
tmpinpos += mbits4;
}
@@ -131,7 +132,16 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
outpos.add(outlength);
inpos.set(tmpinpos);
}
-
+
+    /**
+     * Upper bound, in 32-bit words, on the headless-compressed size of the
+     * whole blocks contained in {@code inlength} integers.
+     * {@code compressedPositions} is advanced by the number of integers those
+     * whole blocks cover.
+     */
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int wholeBlocks = inlength / BLOCK_SIZE;
+        compressedPositions.add(wholeBlocks * BLOCK_SIZE);
+        // headers: one word per group of four blocks, then one word per leftover block
+        final int headerWords = (wholeBlocks / Integer.BYTES) + (wholeBlocks % Integer.BYTES);
+        // payload: a block packs into at most MAX_BIT_WIDTH words
+        final int payloadWords = wholeBlocks * MAX_BIT_WIDTH;
+        return headerWords + payloadWords;
+    }
+
@Override
public String toString() {
return this.getClass().getSimpleName();
diff --git a/src/main/java/me/lemire/integercompression/BitPacking.java b/src/main/java/me/lemire/integercompression/BitPacking.java
index e83c9e0..8652be4 100644
--- a/src/main/java/me/lemire/integercompression/BitPacking.java
+++ b/src/main/java/me/lemire/integercompression/BitPacking.java
@@ -1690,7 +1690,7 @@ protected static void fastpack9(final int[] in, int inpos,
}
/**
- * Unpack 32 integers
+ * Pack without mask 32 integers
*
* @param in
* source array
@@ -3005,7 +3005,7 @@ protected static void fastpackwithoutmask9(final int[] in, int inpos,
}
/**
- * Pack the 32 integers
+ * Unpack the 32 integers
*
* @param in
* source array
diff --git a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
index 47d4f57..6e8f903 100644
--- a/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java
@@ -18,9 +18,9 @@ public interface ByteIntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was
- * read and written to if 12 ints (inlength = 12) are compressed to 3
+ * read and written to. If 12 ints (inlength = 12) are compressed to 3
* bytes, then inpos will be incremented by 12 while outpos will be
- * incremented by 3 we use IntWrapper to pass the values by reference.
+ * incremented by 3. We use IntWrapper to pass the values by reference.
*
* @param in
* input array
diff --git a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
index 1988d47..2f8c709 100644
--- a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
+++ b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java
@@ -13,7 +13,7 @@
*
* @author MURAOKA Taro http://github.com/koron
*/
-public final class DeltaZigzagVariableByte implements IntegerCODEC {
+public class DeltaZigzagVariableByte implements IntegerCODEC {
@Override
public String toString() {
@@ -27,7 +27,7 @@ public void compress(int[] inBuf, IntWrapper inPos, int inLen,
return;
}
- ByteBuffer byteBuf = ByteBuffer.allocateDirect(inLen * 5 + 3);
+ ByteBuffer byteBuf = makeBuffer(inLen * 5 + 3);
DeltaZigzagEncoding.Encoder ctx = new DeltaZigzagEncoding.Encoder(0);
// Delta+Zigzag+VariableByte encoding.
@@ -105,7 +105,7 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,
int ip = inPos.get();
int op = outPos.get();
- int vbcNum = 0, vbcShift = 24; // Varialbe Byte Context.
+ int vbcNum = 0, vbcShift = 24; // Variable Byte Context.
final int inPosLast = ip + inLen;
while (ip < inPosLast) {
// Fetch a byte value.
@@ -127,4 +127,18 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,
outPos.set(op);
inPos.set(inPosLast);
}
+
+ /**
+ * Creates a new buffer of the requested size.
+ *
+ * In case you need a different way to allocate buffers, you can override this method
+ * with a custom behavior. The default implementation allocates a new Java direct
+ * {@link ByteBuffer} on each invocation.
+ *
+ * @param sizeInBytes capacity of the buffer to create, in bytes
+ * @return the newly created buffer, with a capacity of {@code sizeInBytes} bytes
+ */
+ protected ByteBuffer makeBuffer(int sizeInBytes) {
+ return ByteBuffer.allocateDirect(sizeInBytes);
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/FastPFOR.java b/src/main/java/me/lemire/integercompression/FastPFOR.java
index 314c9f9..5475496 100644
--- a/src/main/java/me/lemire/integercompression/FastPFOR.java
+++ b/src/main/java/me/lemire/integercompression/FastPFOR.java
@@ -14,13 +14,13 @@
* This is a patching scheme designed for speed.
* It encodes integers in blocks of integers within pages of
* up to 65536 integers. Note that it is important, to get good
- * compression and good performance, to use sizeable blocks (greater than 1024 integers).
+ * compression and good performance, to use sizeable arrays (greater than 1024 integers).
* For arrays containing a number of integers that is not divisible by BLOCK_SIZE, you should use
- * it in conjunction with another CODEC:
- *
+ * it in conjunction with another CODEC:
+ *
* IntegerCODEC ic = new Composition(new FastPFOR(), new VariableByte()).
*
- * For details, please see
+ * For details, please see:
*
* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second
* through vectorization Software: Practice & Experience
@@ -29,23 +29,30 @@
*
*
For sufficiently compressible and long arrays, it is faster and better than other PFOR
* schemes.
* Adapted by D. Lemire from the Apache Lucene project.
*
+ *
* @author Daniel Lemire
*/
public final class S9 {
- /**
- * Estimate size of the compressed output.
- *
- * @param in
- * array to compress
- * @param currentPos
- * where to start reading
- * @param inlength
- * how many integers to read
- * @return estimated size of the output (in 32-bit integers)
- */
- public static int estimatecompress(int[] in, int currentPos,
- int inlength) {
- int tmpoutpos = 0;
- int finalpos = currentPos + inlength;
- outer: while (currentPos < finalpos) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int compressedNum = codeNum[selector];
- if (finalpos <= currentPos + compressedNum - 1)
- compressedNum = finalpos - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++)
- if (max <= in[currentPos + i])
- continue mainloop;
- currentPos += compressedNum;
- ++tmpoutpos;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- tmpoutpos++;
- currentPos++;
- }
- return tmpoutpos;
+ /**
+ * Estimate size of the compressed output.
+ *
+ * @param in
+ * array to compress
+ * @param currentPos
+ * where to start reading
+ * @param inlength
+ * how many integers to read
+ * @return estimated size of the output (in 32-bit integers)
+ * @throws RuntimeException
+ * if a value is too big for the widest (28-bit) selector
+ */
+ public static int estimatecompress(int[] in, int currentPos, int inlength) {
+ int tmpoutpos = 0;
+ int finalpos = currentPos + inlength;
+ outer: while (currentPos < finalpos) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+
+ int compressedNum = codeNum[selector];
+ if (finalpos <= currentPos + compressedNum - 1)
+ compressedNum = finalpos - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++)
+ if (Util.smallerorequalthan(max , in[currentPos + i]))
+ continue mainloop;
+ currentPos += compressedNum;
+ ++tmpoutpos;
+ continue outer;
+ }
+ final int selector = 8;
+ // Use the same comparison helper as the selector loop above, so that a value
+ // that loop rejected for every width cannot slip past this last check.
+ if (Util.smallerorequalthan(1 << bitLength[selector], in[currentPos]))
+ throw new RuntimeException("Too big a number");
+ tmpoutpos++;
+ currentPos++;
+
}
+ return tmpoutpos;
+ }
- /**
- * Compress an integer array using Simple9
- *
- *
- * @param in
- * array to compress
- * @param currentPos
- * where to start reading
- * @param inlength
- * how many integers to read
- * @param out output array
- * @param tmpoutpos location in the output array
- * @return the number of 32-bit words written (in compressed form)
- */
- public static int compress(int[] in, int currentPos, int inlength,
- int out[], int tmpoutpos) {
- int origtmpoutpos = tmpoutpos;
- int finalpos = currentPos + inlength;
- outer: while (currentPos < finalpos) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- if (finalpos <= currentPos + compressedNum - 1)
- compressedNum = finalpos - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- if (compressedNum != codeNum[selector])
- res <<= (codeNum[selector] - compressedNum)
- * b;
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+ /**
+ * Compress an integer array using Simple9
+ *
+ *
+ * @param in
+ * array to compress
+ * @param currentPos
+ * where to start reading
+ * @param inlength
+ * how many integers to read
+ * @param out
+ * output array
+ * @param tmpoutpos
+ * location in the output array
+ * @return the number of 32-bit words written (in compressed form)
+ * @throws RuntimeException
+ * if a value is too big for the widest (28-bit) selector
+ */
+ public static int compress(int[] in, int currentPos, int inlength, int out[], int tmpoutpos) {
+ int origtmpoutpos = tmpoutpos;
+ int finalpos = currentPos + inlength;
+ outer: while (currentPos < finalpos) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ if (finalpos <= currentPos + compressedNum - 1)
+ compressedNum = finalpos - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (Util.smallerorequalthan(max, in[currentPos + i]))
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- return tmpoutpos - origtmpoutpos;
+ if (compressedNum != codeNum[selector])
+ res <<= (codeNum[selector] - compressedNum) * b;
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ // Use the same comparison helper as the selector loop above, so that a value
+ // that loop rejected for every width is refused here instead of being
+ // OR-ed with the selector bits into a corrupted word.
+ if (Util.smallerorequalthan(1 << bitLength[selector], in[currentPos]))
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
}
+ return tmpoutpos - origtmpoutpos;
+ }
- /**
- * Uncompressed data from an input array into an output array
- *
- * @param in input array (in compressed form)
- * @param tmpinpos starting location in the compressed input array
- * @param inlength how much data we wish the read (in 32-bit words)
- * @param out output array (in decompressed form)
- * @param currentPos current position in the output array
- * @param outlength available data in the output array
- */
- public static void uncompress(int[] in, int tmpinpos, int inlength,
- int[] out, int currentPos, int outlength) {
- int finallength = currentPos + outlength;
+ /**
+ * Uncompressed data from an input array into an output array
+ *
+ * @param in
+ * input array (in compressed form)
+ * @param tmpinpos
+ * starting location in the compressed input array
+ * @param inlength
+ * how much data we wish the read (in 32-bit words)
+ * @param out
+ * output array (in decompressed form)
+ * @param currentPos
+ * current position in the output array
+ * @param outlength
+ * available data in the output array
+ */
+ public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
+ int finallength = currentPos + outlength;
- while (currentPos < finallength) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- final int howmany = finallength - currentPos < 28 ? finallength
- - currentPos
- : 28;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (k + 4)) >>> 31;
- }
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- final int howmany = finallength - currentPos < 14 ? finallength
- - currentPos
- : 14;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (2 * k + 4)) >>> 30;
- }
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- final int howmany = finallength - currentPos < 9 ? finallength
- - currentPos
- : 9;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (3 * k + 5)) >>> 29;
- }
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- final int howmany = finallength - currentPos < 7 ? finallength
- - currentPos
- : 7;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (4 * k + 4)) >>> 28;
- }
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- final int howmany = finallength - currentPos < 5 ? finallength
- - currentPos
- : 5;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (5 * k + 7)) >>> 27;
- }
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- final int howmany = finallength - currentPos < 4 ? finallength
- - currentPos
- : 4;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (7 * k + 4)) >>> 25;
- }
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- final int howmany = finallength - currentPos < 3 ? finallength
- - currentPos
- : 3;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (9 * k + 5)) >>> 23;
- }
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- final int howmany = finallength - currentPos < 2 ? finallength
- - currentPos
- : 2;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (14 * k + 4)) >>> 18;
- }
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen");
- }
- }
+ while (currentPos < finallength) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ final int howmany = finallength - currentPos < 28 ? finallength - currentPos : 28;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (k + 4)) >>> 31;
}
-
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ final int howmany = finallength - currentPos < 14 ? finallength - currentPos : 14;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+ }
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ final int howmany = finallength - currentPos < 9 ? finallength - currentPos : 9;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+ }
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ final int howmany = finallength - currentPos < 7 ? finallength - currentPos : 7;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+ }
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ final int howmany = finallength - currentPos < 5 ? finallength - currentPos : 5;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+ }
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ final int howmany = finallength - currentPos < 4 ? finallength - currentPos : 4;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+ }
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ final int howmany = finallength - currentPos < 3 ? finallength - currentPos : 3;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+ }
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ final int howmany = finallength - currentPos < 2 ? finallength - currentPos : 2;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+ }
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen");
+ }
+ }
}
- private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+ }
+
+ private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
- private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+ private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
}
diff --git a/src/main/java/me/lemire/integercompression/Simple16.java b/src/main/java/me/lemire/integercompression/Simple16.java
index 9562c3a..2b7f27f 100644
--- a/src/main/java/me/lemire/integercompression/Simple16.java
+++ b/src/main/java/me/lemire/integercompression/Simple16.java
@@ -1,8 +1,5 @@
package me.lemire.integercompression;
-
-
-
/**
* This is an implementation of the popular Simple16 scheme. It is limited to
* 28-bit integers (between 0 and 2^28-1).
@@ -14,10 +11,9 @@
* Adapted by D. Lemire from the Apache Lucene project.
*
*/
-public final class Simple16 implements IntegerCODEC,SkippableIntegerCODEC {
+public final class Simple16 implements IntegerCODEC, SkippableIntegerCODEC {
- public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[],
- IntWrapper outpos) {
+ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
int i_inpos = inpos.get();
int i_outpos = outpos.get();
final int finalin = i_inpos + inlength;
@@ -31,7 +27,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[]
inpos.set(i_inpos);
outpos.set(i_outpos);
}
-
+
/**
* Compress an integer array using Simple16
*
@@ -47,15 +43,13 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[]
* the number of elements to be compressed
* @return the number of compressed integers
*/
- public static final int compressblock(int[] out, int outOffset, int[] in,
- int inOffset, int n) {
+ public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
int numIdx, j, num, bits;
for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
out[outOffset] = numIdx << S16_BITSSIZE;
num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
- for (j = 0, bits = 0; (j < num)
- && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
+ for (j = 0, bits = 0; (j < num) && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
out[outOffset] |= (in[inOffset + j] << bits);
bits += S16_BITS[numIdx][j];
j++;
@@ -69,7 +63,6 @@ public static final int compressblock(int[] out, int outOffset, int[] in,
return -1;
}
-
/**
* Decompress an integer array using Simple16
*
@@ -85,23 +78,19 @@ public static final int compressblock(int[] out, int outOffset, int[] in,
* the number of elements to be compressed
* @return the number of processed integers
*/
- public static final int decompressblock(int[] out, int outOffset, int[] in,
- int inOffset, int n) {
+ public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
int numIdx, j = 0, bits = 0;
numIdx = in[inOffset] >>> S16_BITSSIZE;
int num = S16_NUM[numIdx] < n ? S16_NUM[numIdx] : n;
for (j = 0, bits = 0; j < num; j++) {
- out[outOffset + j] = (in[inOffset] >>> bits)
- & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
+ out[outOffset + j] = (in[inOffset] >>> bits) & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
bits += S16_BITS[numIdx][j];
}
return num;
}
-
@Override
- public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos,int num) {
+ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
int i_inpos = inpos.get();
int i_outpos = outpos.get();
while (num > 0) {
@@ -114,6 +103,12 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
outpos.set(i_outpos);
}
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // upper bound, in ints, on the headless output size
+        compressedPositions.add(inlength); // report that this codec takes all inlength input integers
+        return inlength; // one output int per input int: the smallest S16_NUM entry is 1 value per word
+    }
+
/**
* Uncompress data from an array to another array.
*
@@ -133,12 +128,10 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
* @param outlength
* number of integers we want to decode
*/
- public static void uncompress(int[] in, int tmpinpos, int inlength,
- int[] out, int currentPos, int outlength) {
+ public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
final int finalpos = tmpinpos + inlength;
while (tmpinpos < finalpos) {
- final int howmany = decompressblock(out, currentPos, in, tmpinpos,
- outlength);
+ final int howmany = decompressblock(out, currentPos, in, tmpinpos, outlength);
outlength -= howmany;
currentPos += howmany;
tmpinpos += 1;
@@ -155,20 +148,18 @@ private static int[][] shiftme(int[][] x) {
}
return answer;
}
-
+
@Override
- public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
+ public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
if (inlength == 0)
- return;
+ return;
out[outpos.get()] = inlength;
outpos.increment();
- headlessCompress(in, inpos, inlength, out, outpos);
+ headlessCompress(in, inpos, inlength, out, outpos);
}
@Override
- public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
+ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
if (inlength == 0)
return;
final int outlength = in[inpos.get()];
@@ -176,28 +167,25 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
headlessUncompress(in, inpos, inlength, out, outpos, outlength);
}
+
@Override
public String toString() {
- return this.getClass().getSimpleName();
+ return this.getClass().getSimpleName();
}
private static final int S16_NUMSIZE = 16;
private static final int S16_BITSSIZE = 28;
// the possible number of bits used to represent one integer
- private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6,
- 5, 5, 4, 3, 2, 1 };
+ private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
// the corresponding number of elements for each value of the number of bits
private static final int[][] S16_BITS = {
- { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
{ 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
{ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
- { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
- { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
- { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 },
- { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
+ { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
{ 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS);
-}
\ No newline at end of file
+}
diff --git a/src/main/java/me/lemire/integercompression/Simple9.java b/src/main/java/me/lemire/integercompression/Simple9.java
index 5703b04..fd5194d 100644
--- a/src/main/java/me/lemire/integercompression/Simple9.java
+++ b/src/main/java/me/lemire/integercompression/Simple9.java
@@ -7,10 +7,9 @@
package me.lemire.integercompression;
-
/**
- * This is an implementation of the popular Simple9 scheme.
- * It is limited to 28-bit integers (between 0 and 2^28-1).
+ * This is an implementation of the popular Simple9 scheme. It is limited to
+ * 28-bit integers (between 0 and 2^28-1).
*
* Note that this does not use differential coding: if you are working on sorted
* lists, you must compute the deltas separately.
@@ -19,296 +18,288 @@
*
*/
public final class Simple9 implements IntegerCODEC, SkippableIntegerCODEC {
- @Override
- public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
- int out[], IntWrapper outpos) {
- int tmpoutpos = outpos.get();
- int currentPos = inpos.get();
- final int finalin = currentPos + inlength;
- outer: while (currentPos < finalin - 28) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
- }
- outer: while (currentPos < finalin) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- if (finalin <= currentPos + compressedNum - 1)
- compressedNum = finalin - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- if (compressedNum != codeNum[selector])
- res <<= (codeNum[selector] - compressedNum)
- * b;
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
- }
- inpos.set(currentPos);
- outpos.set(tmpoutpos);
- }
+ @Override
+ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
+ int tmpoutpos = outpos.get();
+ int currentPos = inpos.get();
+ final int finalin = currentPos + inlength;
+ outer: while (currentPos < finalin - 28) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
- @Override
- public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
- int[] out, IntWrapper outpos, int outlength) {
- int currentPos = outpos.get();
- int tmpinpos = inpos.get();
- final int finalout = currentPos + outlength;
- while (currentPos < finalout - 28) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- out[currentPos++] = (val << 4) >>> 31;
- out[currentPos++] = (val << 5) >>> 31;
- out[currentPos++] = (val << 6) >>> 31;
- out[currentPos++] = (val << 7) >>> 31;
- out[currentPos++] = (val << 8) >>> 31;
- out[currentPos++] = (val << 9) >>> 31;
- out[currentPos++] = (val << 10) >>> 31;
- out[currentPos++] = (val << 11) >>> 31;
- out[currentPos++] = (val << 12) >>> 31;
- out[currentPos++] = (val << 13) >>> 31; // 10
- out[currentPos++] = (val << 14) >>> 31;
- out[currentPos++] = (val << 15) >>> 31;
- out[currentPos++] = (val << 16) >>> 31;
- out[currentPos++] = (val << 17) >>> 31;
- out[currentPos++] = (val << 18) >>> 31;
- out[currentPos++] = (val << 19) >>> 31;
- out[currentPos++] = (val << 20) >>> 31;
- out[currentPos++] = (val << 21) >>> 31;
- out[currentPos++] = (val << 22) >>> 31;
- out[currentPos++] = (val << 23) >>> 31; // 20
- out[currentPos++] = (val << 24) >>> 31;
- out[currentPos++] = (val << 25) >>> 31;
- out[currentPos++] = (val << 26) >>> 31;
- out[currentPos++] = (val << 27) >>> 31;
- out[currentPos++] = (val << 28) >>> 31;
- out[currentPos++] = (val << 29) >>> 31;
- out[currentPos++] = (val << 30) >>> 31;
- out[currentPos++] = (val << 31) >>> 31;
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- out[currentPos++] = (val << 4) >>> 30;
- out[currentPos++] = (val << 6) >>> 30;
- out[currentPos++] = (val << 8) >>> 30;
- out[currentPos++] = (val << 10) >>> 30;
- out[currentPos++] = (val << 12) >>> 30;
- out[currentPos++] = (val << 14) >>> 30;
- out[currentPos++] = (val << 16) >>> 30;
- out[currentPos++] = (val << 18) >>> 30;
- out[currentPos++] = (val << 20) >>> 30;
- out[currentPos++] = (val << 22) >>> 30; // 10
- out[currentPos++] = (val << 24) >>> 30;
- out[currentPos++] = (val << 26) >>> 30;
- out[currentPos++] = (val << 28) >>> 30;
- out[currentPos++] = (val << 30) >>> 30;
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- out[currentPos++] = (val << 5) >>> 29;
- out[currentPos++] = (val << 8) >>> 29;
- out[currentPos++] = (val << 11) >>> 29;
- out[currentPos++] = (val << 14) >>> 29;
- out[currentPos++] = (val << 17) >>> 29;
- out[currentPos++] = (val << 20) >>> 29;
- out[currentPos++] = (val << 23) >>> 29;
- out[currentPos++] = (val << 26) >>> 29;
- out[currentPos++] = (val << 29) >>> 29;
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- out[currentPos++] = (val << 4) >>> 28;
- out[currentPos++] = (val << 8) >>> 28;
- out[currentPos++] = (val << 12) >>> 28;
- out[currentPos++] = (val << 16) >>> 28;
- out[currentPos++] = (val << 20) >>> 28;
- out[currentPos++] = (val << 24) >>> 28;
- out[currentPos++] = (val << 28) >>> 28;
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- out[currentPos++] = (val << 7) >>> 27;
- out[currentPos++] = (val << 12) >>> 27;
- out[currentPos++] = (val << 17) >>> 27;
- out[currentPos++] = (val << 22) >>> 27;
- out[currentPos++] = (val << 27) >>> 27;
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- out[currentPos++] = (val << 4) >>> 25;
- out[currentPos++] = (val << 11) >>> 25;
- out[currentPos++] = (val << 18) >>> 25;
- out[currentPos++] = (val << 25) >>> 25;
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- out[currentPos++] = (val << 5) >>> 23;
- out[currentPos++] = (val << 14) >>> 23;
- out[currentPos++] = (val << 23) >>> 23;
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- out[currentPos++] = (val << 4) >>> 18;
- out[currentPos++] = (val << 18) >>> 18;
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
- }
- }
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (max <= in[currentPos + i])
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- while (currentPos < finalout) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- final int howmany = finalout - currentPos;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (k + 4)) >>> 31;
- }
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- final int howmany = finalout - currentPos < 14 ? finalout
- - currentPos
- : 14;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (2 * k + 4)) >>> 30;
- }
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- final int howmany = finalout - currentPos < 9 ? finalout
- - currentPos
- : 9;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (3 * k + 5)) >>> 29;
- }
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- final int howmany = finalout - currentPos < 7 ? finalout
- - currentPos
- : 7;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (4 * k + 4)) >>> 28;
- }
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- final int howmany = finalout - currentPos < 5 ? finalout
- - currentPos
- : 5;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (5 * k + 7)) >>> 27;
- }
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- final int howmany = finalout - currentPos < 4 ? finalout
- - currentPos
- : 4;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (7 * k + 4)) >>> 25;
- }
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- final int howmany = finalout - currentPos < 3 ? finalout
- - currentPos
- : 3;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (9 * k + 5)) >>> 23;
- }
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- final int howmany = finalout - currentPos < 2 ? finalout
- - currentPos
- : 2;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (14 * k + 4)) >>> 18;
- }
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen");
- }
- }
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+ }
+ outer: while (currentPos < finalin) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ if (finalin <= currentPos + compressedNum - 1)
+ compressedNum = finalin - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (max <= in[currentPos + i])
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- outpos.set(currentPos);
- inpos.set(tmpinpos);
+ if (compressedNum != codeNum[selector])
+ res <<= (codeNum[selector] - compressedNum) * b;
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
}
- @Override
- public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
- if (inlength == 0)
- return;
- out[outpos.get()] = inlength;
- outpos.increment();
- headlessCompress(in, inpos, inlength, out, outpos);
+ inpos.set(currentPos);
+ outpos.set(tmpoutpos);
+ }
+
+ @Override
+ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos,
+ int outlength) {
+ int currentPos = outpos.get();
+ int tmpinpos = inpos.get();
+ final int finalout = currentPos + outlength;
+ while (currentPos < finalout - 28) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ out[currentPos++] = (val << 4) >>> 31;
+ out[currentPos++] = (val << 5) >>> 31;
+ out[currentPos++] = (val << 6) >>> 31;
+ out[currentPos++] = (val << 7) >>> 31;
+ out[currentPos++] = (val << 8) >>> 31;
+ out[currentPos++] = (val << 9) >>> 31;
+ out[currentPos++] = (val << 10) >>> 31;
+ out[currentPos++] = (val << 11) >>> 31;
+ out[currentPos++] = (val << 12) >>> 31;
+ out[currentPos++] = (val << 13) >>> 31; // 10
+ out[currentPos++] = (val << 14) >>> 31;
+ out[currentPos++] = (val << 15) >>> 31;
+ out[currentPos++] = (val << 16) >>> 31;
+ out[currentPos++] = (val << 17) >>> 31;
+ out[currentPos++] = (val << 18) >>> 31;
+ out[currentPos++] = (val << 19) >>> 31;
+ out[currentPos++] = (val << 20) >>> 31;
+ out[currentPos++] = (val << 21) >>> 31;
+ out[currentPos++] = (val << 22) >>> 31;
+ out[currentPos++] = (val << 23) >>> 31; // 20
+ out[currentPos++] = (val << 24) >>> 31;
+ out[currentPos++] = (val << 25) >>> 31;
+ out[currentPos++] = (val << 26) >>> 31;
+ out[currentPos++] = (val << 27) >>> 31;
+ out[currentPos++] = (val << 28) >>> 31;
+ out[currentPos++] = (val << 29) >>> 31;
+ out[currentPos++] = (val << 30) >>> 31;
+ out[currentPos++] = (val << 31) >>> 31;
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ out[currentPos++] = (val << 4) >>> 30;
+ out[currentPos++] = (val << 6) >>> 30;
+ out[currentPos++] = (val << 8) >>> 30;
+ out[currentPos++] = (val << 10) >>> 30;
+ out[currentPos++] = (val << 12) >>> 30;
+ out[currentPos++] = (val << 14) >>> 30;
+ out[currentPos++] = (val << 16) >>> 30;
+ out[currentPos++] = (val << 18) >>> 30;
+ out[currentPos++] = (val << 20) >>> 30;
+ out[currentPos++] = (val << 22) >>> 30; // 10
+ out[currentPos++] = (val << 24) >>> 30;
+ out[currentPos++] = (val << 26) >>> 30;
+ out[currentPos++] = (val << 28) >>> 30;
+ out[currentPos++] = (val << 30) >>> 30;
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ out[currentPos++] = (val << 5) >>> 29;
+ out[currentPos++] = (val << 8) >>> 29;
+ out[currentPos++] = (val << 11) >>> 29;
+ out[currentPos++] = (val << 14) >>> 29;
+ out[currentPos++] = (val << 17) >>> 29;
+ out[currentPos++] = (val << 20) >>> 29;
+ out[currentPos++] = (val << 23) >>> 29;
+ out[currentPos++] = (val << 26) >>> 29;
+ out[currentPos++] = (val << 29) >>> 29;
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ out[currentPos++] = (val << 4) >>> 28;
+ out[currentPos++] = (val << 8) >>> 28;
+ out[currentPos++] = (val << 12) >>> 28;
+ out[currentPos++] = (val << 16) >>> 28;
+ out[currentPos++] = (val << 20) >>> 28;
+ out[currentPos++] = (val << 24) >>> 28;
+ out[currentPos++] = (val << 28) >>> 28;
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ out[currentPos++] = (val << 7) >>> 27;
+ out[currentPos++] = (val << 12) >>> 27;
+ out[currentPos++] = (val << 17) >>> 27;
+ out[currentPos++] = (val << 22) >>> 27;
+ out[currentPos++] = (val << 27) >>> 27;
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ out[currentPos++] = (val << 4) >>> 25;
+ out[currentPos++] = (val << 11) >>> 25;
+ out[currentPos++] = (val << 18) >>> 25;
+ out[currentPos++] = (val << 25) >>> 25;
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ out[currentPos++] = (val << 5) >>> 23;
+ out[currentPos++] = (val << 14) >>> 23;
+ out[currentPos++] = (val << 23) >>> 23;
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ out[currentPos++] = (val << 4) >>> 18;
+ out[currentPos++] = (val << 18) >>> 18;
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
+ }
+ }
}
+ while (currentPos < finalout) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ final int howmany = finalout - currentPos;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (k + 4)) >>> 31;
+ }
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+ }
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+ }
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+ }
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+ }
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+ }
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+ }
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+ }
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen");
+ }
+ }
+ }
+ outpos.set(currentPos);
+ inpos.set(tmpinpos);
- @Override
- public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
- if (inlength == 0)
- return;
- final int outlength = in[inpos.get()];
- inpos.increment();
- headlessUncompress(in, inpos, inlength, out, outpos, outlength);
+ }
- }
- private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+  @Override
+  public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // upper bound, in ints, on the headless output size
+    compressedPositions.add(inlength); // report that this codec takes all inlength input integers
+    return inlength; // one output int per input int: the smallest codeNum entry is 1 value per word
+  }
- private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+  @Override
+  public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) { // headed variant: length header + headless payload
+    if (inlength == 0) // empty input: write nothing, not even the header
+      return;
+    out[outpos.get()] = inlength; // one-int header holding the number of encoded integers
+    outpos.increment(); // step past the header before the payload is written
+    headlessCompress(in, inpos, inlength, out, outpos); // payload; advances inpos and outpos
+  }
- @Override
- public String toString() {
- return this.getClass().getSimpleName();
- }
+  @Override
+  public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) { // inverse of compress: reads the header, then the payload
+    if (inlength == 0) // nothing was written for empty input, so nothing to read
+      return;
+    final int outlength = in[inpos.get()]; // number of integers to decode, as stored by compress
+    inpos.increment(); // step past the header
+    headlessUncompress(in, inpos, inlength, out, outpos, outlength); // decode outlength integers; advances inpos and outpos
+
+  }
+
+ private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+
+ private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+
+  @Override
+  public String toString() { // codec label: the simple (unqualified) class name
+    return this.getClass().getSimpleName();
+  }
}
diff --git a/src/main/java/me/lemire/integercompression/SkippableComposition.java b/src/main/java/me/lemire/integercompression/SkippableComposition.java
index ed0f0de..fc3c18e 100644
--- a/src/main/java/me/lemire/integercompression/SkippableComposition.java
+++ b/src/main/java/me/lemire/integercompression/SkippableComposition.java
@@ -38,7 +38,12 @@ public SkippableComposition(SkippableIntegerCODEC f1,
public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos) {
int init = inpos.get();
+ int outposInit = outpos.get();
F1.headlessCompress(in, inpos, inlength, out, outpos);
+ if (outpos.get() == outposInit) {
+ out[outposInit] = 0;
+ outpos.increment();
+ }
inlength -= inpos.get() - init;
F2.headlessCompress(in, inpos, inlength, out, outpos);
}
@@ -47,12 +52,27 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num) {
int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessUncompress(in, inpos, inlength, out, outpos, num);
+ if (inpos.get() == init) {
+ inpos.increment();
+ }
inlength -= inpos.get() - init;
- num -= outpos.get();
+ num -= outpos.get() - outposInit;
F2.headlessUncompress(in, inpos, inlength, out, outpos, num);
}
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // bound for F1 followed by F2 on the remaining input
+        int init = compressedPositions.get(); // position before F1 reports how much input it takes
+        int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+        maxLength += 1; // Reserve one extra int (the author calls it the potential F2 header). NOTE(review): headlessCompress writes a single 0 placeholder when F1 emits nothing, which may be what this covers - confirm. Open question from the author: is this int actually needed in the headless version?
+        inlength -= compressedPositions.get() - init; // input left for F2 after F1's reported share
+        maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength);
+        return maxLength; // F1 bound + 1 + F2 bound
+    }
+
@Override
public String toString() {
return F1.toString() + "+" + F2.toString();
diff --git a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
index c10d2f0..b9bdc04 100644
--- a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
@@ -10,10 +10,11 @@
/**
* Interface describing a standard CODEC to compress integers. This is a
- * variation on the IntegerCODEC interface meant to be used for random access.
+ * variation on the IntegerCODEC interface meant to be used for random access
+ * (i.e., given a large array, you can segment it and decode just the subarray you need).
*
- * The main difference is that we must specify the number of integers we wish to
- * decode. This information should be stored elsewhere.
+ * The main difference is that you must specify the number of integers you wish to
+ * uncompress. This information should be stored elsewhere.
*
* This interface was designed by the Terrier team for their search engine.
*
@@ -25,14 +26,17 @@ public interface SkippableIntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was read
- * and written to if 12 ints (inlength = 12) are compressed to 3 ints, then
- * inpos will be incremented by 12 while outpos will be incremented by 3 we
+ * and written to. If 12 ints (inlength = 12) are compressed to 3 ints, then
+ * inpos will be incremented by 12 while outpos will be incremented by 3. We
* use IntWrapper to pass the values by reference.
*
+ * Implementation note: contrary to {@link IntegerCODEC#compress},
+ * this may skip writing information about the number of encoded integers.
+ *
* @param in
* input array
* @param inpos
- * location in the input array
+ * where to start reading in the array
* @param inlength
* how many integers to compress
* @param out
@@ -56,13 +60,30 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
* @param inlength
* length of the compressed data (ignored by some schemes)
* @param out
- * array where to write the compressed output
+ * array where to write the uncompressed output
* @param outpos
- * where to write the compressed output in out
+ * where to start writing the uncompressed output in out
* @param num
- * number of integers we want to decode, the actual number of integers decoded can be less
+ * number of integers we want to decode. May be less than the actual number of compressed integers
*/
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num);
+    /**
+     * Compute the maximum number of integers that might be required to store
+     * the compressed form of a given input array segment, without headers.
+     *
+     * This is useful to pre-allocate the output buffer before calling
+     * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper)}.
+     *
+     * Implementations advance compressedPositions rather than overwrite it.
+     * @param compressedPositions
+     *            in/out counter: since not all schemes compress every input integer,
+     *            it is advanced by how many input integers will actually be compressed.
+     *            This is useful when composing multiple schemes.
+     * @param inlength
+     *            number of integers to be compressed
+     * @return the maximum number of integers needed in the output array
+     */
+    int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
}
diff --git a/src/main/java/me/lemire/integercompression/Util.java b/src/main/java/me/lemire/integercompression/Util.java
index 70e46b7..63fc918 100644
--- a/src/main/java/me/lemire/integercompression/Util.java
+++ b/src/main/java/me/lemire/integercompression/Util.java
@@ -13,6 +13,14 @@
*
*/
public final class Util {
+
+
+
+    // Checks whether x <= y when both are read as unsigned 32-bit ints (Java 8 supports this natively via Integer.compareUnsigned).
+    protected static final boolean smallerorequalthan(int x, int y) {
+        return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE); // adding MIN_VALUE flips the sign bit, so the signed compare yields the unsigned order
+    }
+
/**
* Compute the maximum of the integer logarithms (ceil(log(x+1)) of a range
* of value
diff --git a/src/main/java/me/lemire/integercompression/VariableByte.java b/src/main/java/me/lemire/integercompression/VariableByte.java
index 8e3ce12..c9b04d0 100644
--- a/src/main/java/me/lemire/integercompression/VariableByte.java
+++ b/src/main/java/me/lemire/integercompression/VariableByte.java
@@ -21,6 +21,8 @@
*/
public class VariableByte implements IntegerCODEC, ByteIntegerCODEC, SkippableIntegerCODEC {
+ private static final int MAX_BYTES_PER_INT = 5;
+
private static byte extract7bits(int i, long val) {
return (byte) ((val >> (7 * i)) & ((1 << 7) - 1));
}
@@ -39,7 +41,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
IntWrapper outpos) {
if (inlength == 0)
return;
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = in[k] & 0xFFFFFFFFL; // To be consistent with
@@ -122,8 +124,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
for (int v = 0, shift = 0; p < finalp;) {
val = in[p];
int c = (byte) (val >>> s);
+ // Shift to next byte
s += 8;
+ // Shift to next integer if s==32
p += s>>5;
+ // cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
@@ -187,8 +192,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
val = in[p];
int c = val >>> s;
+ // Shift to next byte
s += 8;
+ // Shift to next integer if s==32
p += s>>5;
+ // cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
@@ -202,4 +210,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
inpos.set(p + (s!=0 ? 1 : 0));
}
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { // upper bound, in ints, on the headless output size
+        int maxLengthInBytes = inlength * MAX_BYTES_PER_INT; // worst case of MAX_BYTES_PER_INT (5) bytes per input int. NOTE(review): int multiplication, overflows for very large inlength
+        int maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES; // round the byte count up to whole ints
+        compressedPositions.add(inlength); // report that this codec takes all inlength input integers
+        return maxLengthInInts;
+    }
+
+    /**
+     * Creates a new buffer of the requested size.
+     *
+     * In case you need a different way to allocate buffers, you can override this method
+     * with a custom behavior. The default implementation allocates a new Java direct
+     * {@link ByteBuffer} on each invocation.
+     *
+     * @param sizeInBytes capacity of the buffer to create, in bytes
+     * @return a newly allocated buffer of sizeInBytes bytes (a direct buffer by default)
+     */
+    protected ByteBuffer makeBuffer(int sizeInBytes) {
+        return ByteBuffer.allocateDirect(sizeInBytes);
+    }
}
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
index 847a28a..ef4a386 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
@@ -15,6 +15,7 @@
import me.lemire.integercompression.DeltaZigzagVariableByte;
import me.lemire.integercompression.FastPFOR;
import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.GroupSimple9;
import me.lemire.integercompression.IntWrapper;
import me.lemire.integercompression.IntegerCODEC;
import me.lemire.integercompression.JustCopy;
@@ -24,6 +25,7 @@
import me.lemire.integercompression.OptPFD;
import me.lemire.integercompression.OptPFDS16;
import me.lemire.integercompression.OptPFDS9;
+import me.lemire.integercompression.Simple16;
import me.lemire.integercompression.Simple9;
import me.lemire.integercompression.VariableByte;
import me.lemire.integercompression.differential.Delta;
@@ -153,7 +155,7 @@ private static void testCodec(PrintWriter csvLog, int sparsity,
+ data[k][m]
+ " found "
+ decompressBuffer[m]
- + " at " + m);
+ + " at " + m + " out of " + outpos.get());
}
}
}
@@ -306,10 +308,10 @@ private static void testByteCodec(PrintWriter csvLog, int sparsity,
public static void main(String args[]) throws FileNotFoundException {
System.out
.println("# benchmark based on the ClusterData model from:");
- System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
- System.out.println("# Index compression using 64-bit words.");
+ System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
+ System.out.println("# Index compression using 64-bit words.");
System.out
- .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+ .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
System.out.println();
PrintWriter writer = null;
@@ -487,7 +489,6 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
int[][] data = generateTestData(cdg, N, nbr, sparsity);
System.out.println("# generating random data... ok.");
-
testCodec(csvLog, sparsity, new Composition(
new FastPFOR128(), new VariableByte()), data,
repeat, false);
@@ -635,6 +636,14 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
System.out.println();
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, true);
+ System.out.println();
+
testCodec(csvLog, sparsity, new Simple9(), data,
repeat, false);
testCodec(csvLog, sparsity, new Simple9(), data,
@@ -643,6 +652,14 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
repeat, true);
System.out.println();
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, true);
+ System.out.println();
+
{
IntegerCODEC c = new Composition(
new XorBinaryPacking(),
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
index d9243bd..c31411d 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
@@ -88,7 +88,7 @@ private static void benchmarkSine(final PrintWriter csvWriter,
final IntegerCODEC[] codecs, final int count, final int length, final int mean,
final int range, final int freq) {
String dataProp = String.format(
- "(mean=%1$d range=%2$d freq=%2$d)", mean, range, freq);
+ "(mean=%1$d range=%2$d freq=%3$d)", mean, range, freq);
int[][] data = generateSineDataChunks(0, count, length, mean,
range, freq);
benchmark(csvWriter, "Sine " + dataProp, codecs, data,
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
index 58bbc4a..b930568 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
@@ -83,7 +83,6 @@ private static int decompressFromSkipTable(Object c, int[] compressed,
if (num > length - uncomppos.get())
num = length - uncomppos.get();
int location = metadata[metapos++];
- // System.out.println("location = "+location);
int initvalue = metadata[metapos++];
int outputlocation = uncomppos.get();
if (location != compressedpos.get())
@@ -242,10 +241,10 @@ private static void testCodec(PrintWriter csvLog, int sparsity, Object c,
*/
public static void main(String args[]) throws FileNotFoundException {
System.out.println("# benchmark based on the ClusterData model from:");
- System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
- System.out.println("# Index compression using 64-bit words.");
+ System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
+ System.out.println("# Index compression using 64-bit words.");
System.out
- .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+ .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
System.out.println();
PrintWriter writer = null;
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
index 6d281c7..f50a367 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
@@ -7,6 +7,7 @@
package me.lemire.integercompression.differential;
+import me.lemire.integercompression.BitPacking;
import me.lemire.integercompression.IntWrapper;
import me.lemire.integercompression.Util;
@@ -48,7 +49,8 @@
public class IntegratedBinaryPacking implements IntegratedIntegerCODEC,
SkippableIntegratedIntegerCODEC {
- static final int BLOCK_SIZE = 32;
+ public static final int BLOCK_SIZE = 32;
+ private static final int MAX_BIT_WIDTH = Integer.SIZE;
@Override
public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
@@ -83,38 +85,38 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int tmpoutpos = outpos.get();
+
int initoffset = initvalue.get();
initvalue.set(in[inpos.get()+inlength -1]);
int s = inpos.get();
for (; s + BLOCK_SIZE * 4 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 4) {
- final int mbits1 = Util.maxdiffbits(initoffset, in, s, 32);
+ final int mbits1 = Util.maxdiffbits(initoffset, in, s, BLOCK_SIZE);
int initoffset2 = in[s + 31];
- final int mbits2 = Util.maxdiffbits(initoffset2, in, s + 32, 32);
- int initoffset3 = in[s + 32 + 31];
+ final int mbits2 = Util.maxdiffbits(initoffset2, in, s + BLOCK_SIZE, BLOCK_SIZE);
+ int initoffset3 = in[s + BLOCK_SIZE + 31];
final int mbits3 = Util
- .maxdiffbits(initoffset3, in, s + 2 * 32, 32);
- int initoffset4 = in[s + 2 * 32 + 31];
+ .maxdiffbits(initoffset3, in, s + 2 * BLOCK_SIZE, BLOCK_SIZE);
+ int initoffset4 = in[s + 2 * BLOCK_SIZE + 31];
final int mbits4 = Util
- .maxdiffbits(initoffset4, in, s + 3 * 32, 32);
+ .maxdiffbits(initoffset4, in, s + 3 * BLOCK_SIZE, BLOCK_SIZE);
out[tmpoutpos++] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8)
| (mbits4);
IntegratedBitPacking.integratedpack(initoffset, in, s, out,
tmpoutpos, mbits1);
tmpoutpos += mbits1;
- IntegratedBitPacking.integratedpack(initoffset2, in, s + 32, out,
+ IntegratedBitPacking.integratedpack(initoffset2, in, s + BLOCK_SIZE, out,
tmpoutpos, mbits2);
tmpoutpos += mbits2;
- IntegratedBitPacking.integratedpack(initoffset3, in, s + 2 * 32,
+ IntegratedBitPacking.integratedpack(initoffset3, in, s + 2 * BLOCK_SIZE,
out, tmpoutpos, mbits3);
tmpoutpos += mbits3;
- IntegratedBitPacking.integratedpack(initoffset4, in, s + 3 * 32,
+ IntegratedBitPacking.integratedpack(initoffset4, in, s + 3 * BLOCK_SIZE,
out, tmpoutpos, mbits4);
tmpoutpos += mbits4;
- initoffset = in[s + 3 * 32 + 31];
+ initoffset = in[s + 3 * BLOCK_SIZE + 31];
}
for (; s < inpos.get() + inlength; s += BLOCK_SIZE ) {
-
- final int mbits = Util.maxdiffbits(initoffset, in, s, 32);
+ final int mbits = Util.maxdiffbits(initoffset, in, s, BLOCK_SIZE);
out[tmpoutpos++] = mbits;
IntegratedBitPacking.integratedpack(initoffset, in, s, out,
tmpoutpos, mbits);
@@ -128,7 +130,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
@Override
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
int[] out, IntWrapper outpos, int num, IntWrapper initvalue) {
- final int outlength = num;
+ final int outlength = Util.greatestMultiple(num, BLOCK_SIZE);
int tmpinpos = inpos.get();
int initoffset = initvalue.get();
int s = outpos.get();
@@ -137,23 +139,24 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
final int mbits2 = (in[tmpinpos] >>> 16) & 0xFF;
final int mbits3 = (in[tmpinpos] >>> 8) & 0xFF;
final int mbits4 = (in[tmpinpos]) & 0xFF;
+
++tmpinpos;
IntegratedBitPacking.integratedunpack(initoffset, in, tmpinpos,
out, s, mbits1);
tmpinpos += mbits1;
initoffset = out[s + 31];
IntegratedBitPacking.integratedunpack(initoffset, in, tmpinpos,
- out, s + 32, mbits2);
+ out, s + BLOCK_SIZE, mbits2);
tmpinpos += mbits2;
- initoffset = out[s + 32 + 31];
+ initoffset = out[s + BLOCK_SIZE + 31];
IntegratedBitPacking.integratedunpack(initoffset, in, tmpinpos,
- out, s + 2 * 32, mbits3);
+ out, s + 2 * BLOCK_SIZE, mbits3);
tmpinpos += mbits3;
- initoffset = out[s + 2 * 32 + 31];
+ initoffset = out[s + 2 * BLOCK_SIZE + 31];
IntegratedBitPacking.integratedunpack(initoffset, in, tmpinpos,
- out, s + 3 * 32, mbits4);
+ out, s + 3 * BLOCK_SIZE, mbits4);
tmpinpos += mbits4;
- initoffset = out[s + 3 * 32 + 31];
+ initoffset = out[s + 3 * BLOCK_SIZE + 31];
}
for (; s < outpos.get() + outlength; s += BLOCK_SIZE) {
final int mbits = in[tmpinpos];
@@ -168,4 +171,13 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
initvalue.set(initoffset);
inpos.set(tmpinpos);
}
+
+    /**
+     * Worst-case number of output ints for the whole blocks contained in
+     * {@code inlength} values. Only complete blocks of {@code BLOCK_SIZE}
+     * values are counted; {@code compressedPositions} is advanced by the number
+     * of values those blocks cover.
+     */
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int wholeBlocks = inlength / BLOCK_SIZE;
+        // headlessCompress stores four one-byte bit widths in a single header word
+        // for every group of four blocks, then one header word per remaining block.
+        final int groupedHeaders = wholeBlocks / Integer.BYTES;
+        final int trailingHeaders = wholeBlocks % Integer.BYTES;
+        // A block packed with b bits per value occupies b ints; assume the widest case.
+        final int payloadInts = wholeBlocks * MAX_BIT_WIDTH;
+        compressedPositions.add(wholeBlocks * BLOCK_SIZE);
+        return groupedHeaders + trailingHeaders + payloadInts;
+    }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
new file mode 100644
index 0000000..1d935c4
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
@@ -0,0 +1,66 @@
+package me.lemire.integercompression.differential;
+
+import java.util.Arrays;
+
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * Convenience wrapper that exposes a codec through a simple
+ * array-in / array-out API. It is meant for sorted integers; if your
+ * integers are not sorted (not even nearly so), please consider the
+ * IntCompressor class instead.
+ *
+ * The compressed array produced by {@link #compress(int[])} stores the number
+ * of input integers in its first slot, followed by the codec output.
+ */
+public class IntegratedIntCompressor {
+    /** Codec that performs the actual (de)compression. */
+    SkippableIntegratedIntegerCODEC codec;
+
+    /**
+     * Constructor wrapping a codec.
+     *
+     * @param c the underlying codec
+     */
+    public IntegratedIntCompressor(SkippableIntegratedIntegerCODEC c) {
+        codec = c;
+    }
+
+    /**
+     * Constructor with default codec: integrated binary packing followed by
+     * integrated variable byte.
+     */
+    public IntegratedIntCompressor() {
+        this(new SkippableIntegratedComposition(new IntegratedBinaryPacking(),
+                new IntegratedVariableByte()));
+    }
+
+    /**
+     * Compress an array and returns the compressed result as a new array.
+     *
+     * @param input array to be compressed
+     * @return compressed array
+     */
+    public int[] compress(int[] input) {
+        final int count = input.length;
+        final int worstCase = codec.maxHeadlessCompressedLength(new IntWrapper(0), count);
+        // Slot 0 records the input length; the codec output starts at index 1.
+        final int[] buffer = new int[worstCase + 1];
+        buffer[0] = count;
+        final IntWrapper writePos = new IntWrapper(1);
+        codec.headlessCompress(input, new IntWrapper(0), count, buffer, writePos,
+                new IntWrapper(0));
+        // Trim the worst-case buffer down to what was actually written.
+        return Arrays.copyOf(buffer, writePos.intValue());
+    }
+
+    /**
+     * Uncompress an array and returns the uncompressed result as a new array.
+     *
+     * @param compressed compressed array
+     * @return uncompressed array
+     */
+    public int[] uncompress(int[] compressed) {
+        // The first slot holds the number of integers to restore.
+        final int[] result = new int[compressed[0]];
+        final IntWrapper readPos = new IntWrapper(1);
+        final int payloadLength = compressed.length - readPos.intValue();
+        codec.headlessUncompress(compressed, readPos, payloadLength, result,
+                new IntWrapper(0), result.length, new IntWrapper(0));
+        return result;
+    }
+
+}
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
index 4352ebb..a577031 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
@@ -24,6 +24,8 @@
public class IntegratedVariableByte implements IntegratedIntegerCODEC, IntegratedByteIntegerCODEC,
SkippableIntegratedIntegerCODEC {
+ private static final int MAX_BYTES_PER_INT = 5;
+
private static byte extract7bits(int i, long val) {
return (byte)((val >> (7 * i)) & ((1 << 7) - 1));
}
@@ -38,7 +40,7 @@ public void compress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int initoffset = 0;
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = (in[k] - initoffset) & 0xFFFFFFFFL; // To be consistent with unsigned integers in C/C++
@@ -187,7 +189,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
return;
int initoffset = initvalue.get();
initvalue.set(in[inpos.get()+inlength -1]);
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = (in[k] - initoffset) & 0xFFFFFFFFL; // To be consistent with unsigned integers in C/C++
@@ -229,18 +231,22 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
int[] out, IntWrapper outpos, int num, IntWrapper initvalue) {
int s = 0;
int val = 0;
+
int p = inpos.get();
int initoffset = initvalue.get();
int tmpoutpos = outpos.get();
int finaloutpos = num + tmpoutpos;
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
+
val = in[p];
- int c = val >>> s;
+ int c = (byte) (val >>> s);
s += 8;
p += s>>5;
s = s & 31;
v += ((c & 127) << shift);
+
if ((c & 128) == 128) {
+
out[tmpoutpos++] = (initoffset = initoffset + v);
v = 0;
shift = 0;
@@ -253,4 +259,22 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
inpos.set(p + (s!=0 ? 1 : 0));
}
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        // This codec accounts for every one of the inlength input values.
+        compressedPositions.add(inlength);
+        // Up to MAX_BYTES_PER_INT bytes per value, rounded up to whole 32-bit words.
+        final int byteBound = inlength * MAX_BYTES_PER_INT;
+        return (byteBound + Integer.BYTES - 1) / Integer.BYTES;
+    }
+
+ /**
+ * Creates a new buffer of the requested size.
+ *
+ * In case you need a different way to allocate buffers, you can override this method
+ * with a custom behavior. The default implementation allocates a new Java direct
+ * {@link ByteBuffer} on each invocation.
+ *
+ * @param sizeInBytes capacity of the buffer to create, in bytes
+ * @return a newly allocated direct {@link ByteBuffer} with a capacity of {@code sizeInBytes} bytes
+ */
+ protected ByteBuffer makeBuffer(int sizeInBytes) {
+ return ByteBuffer.allocateDirect(sizeInBytes);
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
index 2dd79a4..4786ec5 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
@@ -49,9 +49,11 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
final int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessCompress(in, inpos, inlength, out, outpos, initvalue);
- if (outpos.get() == 0) {
- out[0] = 0;
+ if (outpos.get() == outposInit) {
+ out[outposInit] = 0;
outpos.increment();
}
inlength -= inpos.get() - init;
@@ -64,10 +66,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
+ if (inpos.get() == init) {
+ inpos.increment();
+ }
inlength -= inpos.get() - init;
- num -= outpos.get();
+
+ num -= outpos.get() - outposInit;
F2.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
}
+    /**
+     * Upper bound on the output of {@code headlessCompress} for this composition:
+     * the bound of the first codec, one extra word, and the bound of the second
+     * codec for the input the first codec reports it will not compress.
+     */
+    @Override
+    public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+        final int positionsBefore = compressedPositions.get();
+        final int firstBound = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+        // F1 advanced compressedPositions by the values it handles; F2 receives the rest.
+        final int leftover = inlength - (compressedPositions.get() - positionsBefore);
+        final int secondBound = F2.maxHeadlessCompressedLength(compressedPositions, leftover);
+        // The extra word covers the single 0 that headlessCompress writes when F1 emits no output.
+        return firstBound + 1 + secondBound;
+    }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
index 8b7fd4b..e2df754 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
@@ -71,4 +71,21 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num, IntWrapper initvalue);
+ /**
+ * Compute the maximum number of integers that might be required to store
+ * the compressed form of a given input array segment, without headers.
+ *
+ * This is useful to pre-allocate the output buffer before calling
+ * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper, IntWrapper)}.
+ *
+ *
+ * @param compressedPositions
+ * output parameter. Since not all schemes compress every input integer,
+ * implementations advance this wrapper by the number of input integers
+ * that will actually be compressed, so that a composition of schemes
+ * can hand the remaining integers to the next scheme.
+ * @param inlength
+ * number of integers to be compressed
+ * @return the maximum number of integers needed in the output array
+ */
+ int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
}
diff --git a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
index bbd386a..a50497c 100644
--- a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
+++ b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
@@ -42,7 +42,7 @@ int[] generateUniformHash(int N, int Max) {
int[] ans = new int[N];
HashSet s = new HashSet();
while (s.size() < N)
- s.add(new Integer(this.rand.nextInt(Max)));
+ s.add(this.rand.nextInt(Max));
Iterator i = s.iterator();
for (int k = 0; k < N; ++k)
ans[k] = i.next().intValue();
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
new file mode 100644
index 0000000..9b2e1ca
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
@@ -0,0 +1,12790 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * Vectorized bitpacking routines. This class is a version of the
+ * VectorBitPackerTerse class that uses fewer branch instructions.
+ *
+ * The code is machine generated from VectorBitPackerTerse.java using helper
+ * classes.
+ *
+ */
+public class VectorBitPacker {
+ private static final VectorSpecies SPECIES_512 =
+ IntVector.SPECIES_512;
+ private static final VectorSpecies SPECIES_256 =
+ IntVector.SPECIES_256;
+ private static final int VLEN_512 = 16;
+ private static final int VLEN_256 = 8;
+ private static final int BLOCK_SIZE = 256;
+
+ private static final IntVector MASK_1 =
+ IntVector.broadcast(SPECIES_256, (1 << 1) - 1);
+ private static final IntVector MASK_2 =
+ IntVector.broadcast(SPECIES_512, (1 << 2) - 1);
+ private static final IntVector MASK_3 =
+ IntVector.broadcast(SPECIES_256, (1 << 3) - 1);
+ private static final IntVector MASK_4 =
+ IntVector.broadcast(SPECIES_512, (1 << 4) - 1);
+ private static final IntVector MASK_5 =
+ IntVector.broadcast(SPECIES_256, (1 << 5) - 1);
+ private static final IntVector MASK_6 =
+ IntVector.broadcast(SPECIES_512, (1 << 6) - 1);
+ private static final IntVector MASK_7 =
+ IntVector.broadcast(SPECIES_256, (1 << 7) - 1);
+ private static final IntVector MASK_8 =
+ IntVector.broadcast(SPECIES_512, (1 << 8) - 1);
+ private static final IntVector MASK_9 =
+ IntVector.broadcast(SPECIES_256, (1 << 9) - 1);
+ private static final IntVector MASK_10 =
+ IntVector.broadcast(SPECIES_512, (1 << 10) - 1);
+ private static final IntVector MASK_11 =
+ IntVector.broadcast(SPECIES_256, (1 << 11) - 1);
+ private static final IntVector MASK_12 =
+ IntVector.broadcast(SPECIES_512, (1 << 12) - 1);
+ private static final IntVector MASK_13 =
+ IntVector.broadcast(SPECIES_256, (1 << 13) - 1);
+ private static final IntVector MASK_14 =
+ IntVector.broadcast(SPECIES_512, (1 << 14) - 1);
+ private static final IntVector MASK_15 =
+ IntVector.broadcast(SPECIES_256, (1 << 15) - 1);
+ private static final IntVector MASK_16 =
+ IntVector.broadcast(SPECIES_512, (1 << 16) - 1);
+ private static final IntVector MASK_17 =
+ IntVector.broadcast(SPECIES_256, (1 << 17) - 1);
+ private static final IntVector MASK_18 =
+ IntVector.broadcast(SPECIES_512, (1 << 18) - 1);
+ private static final IntVector MASK_19 =
+ IntVector.broadcast(SPECIES_256, (1 << 19) - 1);
+ private static final IntVector MASK_20 =
+ IntVector.broadcast(SPECIES_512, (1 << 20) - 1);
+ private static final IntVector MASK_21 =
+ IntVector.broadcast(SPECIES_256, (1 << 21) - 1);
+ private static final IntVector MASK_22 =
+ IntVector.broadcast(SPECIES_512, (1 << 22) - 1);
+ private static final IntVector MASK_23 =
+ IntVector.broadcast(SPECIES_256, (1 << 23) - 1);
+ private static final IntVector MASK_24 =
+ IntVector.broadcast(SPECIES_512, (1 << 24) - 1);
+ private static final IntVector MASK_25 =
+ IntVector.broadcast(SPECIES_256, (1 << 25) - 1);
+ private static final IntVector MASK_26 =
+ IntVector.broadcast(SPECIES_512, (1 << 26) - 1);
+ private static final IntVector MASK_27 =
+ IntVector.broadcast(SPECIES_256, (1 << 27) - 1);
+ private static final IntVector MASK_28 =
+ IntVector.broadcast(SPECIES_512, (1 << 28) - 1);
+ private static final IntVector MASK_29 =
+ IntVector.broadcast(SPECIES_256, (1 << 29) - 1);
+ private static final IntVector MASK_30 =
+ IntVector.broadcast(SPECIES_512, (1 << 30) - 1);
+ private static final IntVector MASK_31 =
+ IntVector.broadcast(SPECIES_256, (1 << 31) - 1);
+
+ /**
+ * Pack one block of 256 integers (BLOCK_SIZE) using b bits per integer.
+ *
+ * Dispatches to the width-specific fastpackN routine. A width of 0 writes
+ * nothing, a width of 32 copies the block verbatim, and any width outside
+ * 0..32 is silently ignored because the switch has no default branch.
+ *
+ * @param in
+ * source array
+ * @param inpos
+ * position in source array
+ * @param out
+ * output array
+ * @param outpos
+ * position in output array
+ * @param b
+ * number of bits to use per integer
+ */
+ public static void fastpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ break;
+ case 1:
+ fastpack1(in, inpos, out, outpos);
+ break;
+ case 2:
+ fastpack2(in, inpos, out, outpos);
+ break;
+ case 3:
+ fastpack3(in, inpos, out, outpos);
+ break;
+ case 4:
+ fastpack4(in, inpos, out, outpos);
+ break;
+ case 5:
+ fastpack5(in, inpos, out, outpos);
+ break;
+ case 6:
+ fastpack6(in, inpos, out, outpos);
+ break;
+ case 7:
+ fastpack7(in, inpos, out, outpos);
+ break;
+ case 8:
+ fastpack8(in, inpos, out, outpos);
+ break;
+ case 9:
+ fastpack9(in, inpos, out, outpos);
+ break;
+ case 10:
+ fastpack10(in, inpos, out, outpos);
+ break;
+ case 11:
+ fastpack11(in, inpos, out, outpos);
+ break;
+ case 12:
+ fastpack12(in, inpos, out, outpos);
+ break;
+ case 13:
+ fastpack13(in, inpos, out, outpos);
+ break;
+ case 14:
+ fastpack14(in, inpos, out, outpos);
+ break;
+ case 15:
+ fastpack15(in, inpos, out, outpos);
+ break;
+ case 16:
+ fastpack16(in, inpos, out, outpos);
+ break;
+ case 17:
+ fastpack17(in, inpos, out, outpos);
+ break;
+ case 18:
+ fastpack18(in, inpos, out, outpos);
+ break;
+ case 19:
+ fastpack19(in, inpos, out, outpos);
+ break;
+ case 20:
+ fastpack20(in, inpos, out, outpos);
+ break;
+ case 21:
+ fastpack21(in, inpos, out, outpos);
+ break;
+ case 22:
+ fastpack22(in, inpos, out, outpos);
+ break;
+ case 23:
+ fastpack23(in, inpos, out, outpos);
+ break;
+ case 24:
+ fastpack24(in, inpos, out, outpos);
+ break;
+ case 25:
+ fastpack25(in, inpos, out, outpos);
+ break;
+ case 26:
+ fastpack26(in, inpos, out, outpos);
+ break;
+ case 27:
+ fastpack27(in, inpos, out, outpos);
+ break;
+ case 28:
+ fastpack28(in, inpos, out, outpos);
+ break;
+ case 29:
+ fastpack29(in, inpos, out, outpos);
+ break;
+ case 30:
+ fastpack30(in, inpos, out, outpos);
+ break;
+ case 31:
+ fastpack31(in, inpos, out, outpos);
+ break;
+ case 32:
+ System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE);
+ break;
+ }
+ }
+
+ /**
+ * Same dispatch as fastpack, but to the fastpackNoMaskN routines.
+ *
+ * NOTE(review): going by the name, those routines presumably skip masking
+ * each input to its low b bits, so callers would have to guarantee that
+ * every value already fits in b bits -- confirm against the helpers.
+ *
+ * A width of 0 writes nothing, a width of 32 copies BLOCK_SIZE integers
+ * verbatim, and any width outside 0..32 is silently ignored because the
+ * switch has no default branch.
+ *
+ * @param in
+ * source array
+ * @param inpos
+ * position in source array
+ * @param out
+ * output array
+ * @param outpos
+ * position in output array
+ * @param b
+ * number of bits to use per integer
+ */
+ static void fastpackNoMask(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ break;
+ case 1:
+ fastpackNoMask1(in, inpos, out, outpos);
+ break;
+ case 2:
+ fastpackNoMask2(in, inpos, out, outpos);
+ break;
+ case 3:
+ fastpackNoMask3(in, inpos, out, outpos);
+ break;
+ case 4:
+ fastpackNoMask4(in, inpos, out, outpos);
+ break;
+ case 5:
+ fastpackNoMask5(in, inpos, out, outpos);
+ break;
+ case 6:
+ fastpackNoMask6(in, inpos, out, outpos);
+ break;
+ case 7:
+ fastpackNoMask7(in, inpos, out, outpos);
+ break;
+ case 8:
+ fastpackNoMask8(in, inpos, out, outpos);
+ break;
+ case 9:
+ fastpackNoMask9(in, inpos, out, outpos);
+ break;
+ case 10:
+ fastpackNoMask10(in, inpos, out, outpos);
+ break;
+ case 11:
+ fastpackNoMask11(in, inpos, out, outpos);
+ break;
+ case 12:
+ fastpackNoMask12(in, inpos, out, outpos);
+ break;
+ case 13:
+ fastpackNoMask13(in, inpos, out, outpos);
+ break;
+ case 14:
+ fastpackNoMask14(in, inpos, out, outpos);
+ break;
+ case 15:
+ fastpackNoMask15(in, inpos, out, outpos);
+ break;
+ case 16:
+ fastpackNoMask16(in, inpos, out, outpos);
+ break;
+ case 17:
+ fastpackNoMask17(in, inpos, out, outpos);
+ break;
+ case 18:
+ fastpackNoMask18(in, inpos, out, outpos);
+ break;
+ case 19:
+ fastpackNoMask19(in, inpos, out, outpos);
+ break;
+ case 20:
+ fastpackNoMask20(in, inpos, out, outpos);
+ break;
+ case 21:
+ fastpackNoMask21(in, inpos, out, outpos);
+ break;
+ case 22:
+ fastpackNoMask22(in, inpos, out, outpos);
+ break;
+ case 23:
+ fastpackNoMask23(in, inpos, out, outpos);
+ break;
+ case 24:
+ fastpackNoMask24(in, inpos, out, outpos);
+ break;
+ case 25:
+ fastpackNoMask25(in, inpos, out, outpos);
+ break;
+ case 26:
+ fastpackNoMask26(in, inpos, out, outpos);
+ break;
+ case 27:
+ fastpackNoMask27(in, inpos, out, outpos);
+ break;
+ case 28:
+ fastpackNoMask28(in, inpos, out, outpos);
+ break;
+ case 29:
+ fastpackNoMask29(in, inpos, out, outpos);
+ break;
+ case 30:
+ fastpackNoMask30(in, inpos, out, outpos);
+ break;
+ case 31:
+ fastpackNoMask31(in, inpos, out, outpos);
+ break;
+ case 32:
+ System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE);
+ break;
+ }
+ }
+
+    /**
+     * Unpack one block of 256 integers (BLOCK_SIZE) stored with b bits per
+     * integer.
+     *
+     * Dispatches to the width-specific fastunpackN routine. A width of 0
+     * fills the output block with zeros, a width of 32 copies the block
+     * verbatim, and any width outside 0..32 is silently ignored because the
+     * switch has no default branch.
+     *
+     * @param in
+     *            source array
+     * @param inpos
+     *            position in source array
+     * @param out
+     *            output array
+     * @param outpos
+     *            position in output array
+     * @param b
+     *            number of bits to use per integer
+     */
+    public static void fastunpack(final int[] in, int inpos, final int[] out,
+            int outpos, int b) {
+        switch (b) {
+        case 0:
+            // Zero-width block: every value is 0. Use the shared block size
+            // constant rather than a literal so it stays in sync with case 32.
+            Arrays.fill(out, outpos, outpos + BLOCK_SIZE, 0);
+            break;
+        case 1:
+            fastunpack1(in, inpos, out, outpos);
+            break;
+        case 2:
+            fastunpack2(in, inpos, out, outpos);
+            break;
+        case 3:
+            fastunpack3(in, inpos, out, outpos);
+            break;
+        case 4:
+            fastunpack4(in, inpos, out, outpos);
+            break;
+        case 5:
+            fastunpack5(in, inpos, out, outpos);
+            break;
+        case 6:
+            fastunpack6(in, inpos, out, outpos);
+            break;
+        case 7:
+            fastunpack7(in, inpos, out, outpos);
+            break;
+        case 8:
+            fastunpack8(in, inpos, out, outpos);
+            break;
+        case 9:
+            fastunpack9(in, inpos, out, outpos);
+            break;
+        case 10:
+            fastunpack10(in, inpos, out, outpos);
+            break;
+        case 11:
+            fastunpack11(in, inpos, out, outpos);
+            break;
+        case 12:
+            fastunpack12(in, inpos, out, outpos);
+            break;
+        case 13:
+            fastunpack13(in, inpos, out, outpos);
+            break;
+        case 14:
+            fastunpack14(in, inpos, out, outpos);
+            break;
+        case 15:
+            fastunpack15(in, inpos, out, outpos);
+            break;
+        case 16:
+            fastunpack16(in, inpos, out, outpos);
+            break;
+        case 17:
+            fastunpack17(in, inpos, out, outpos);
+            break;
+        case 18:
+            fastunpack18(in, inpos, out, outpos);
+            break;
+        case 19:
+            fastunpack19(in, inpos, out, outpos);
+            break;
+        case 20:
+            fastunpack20(in, inpos, out, outpos);
+            break;
+        case 21:
+            fastunpack21(in, inpos, out, outpos);
+            break;
+        case 22:
+            fastunpack22(in, inpos, out, outpos);
+            break;
+        case 23:
+            fastunpack23(in, inpos, out, outpos);
+            break;
+        case 24:
+            fastunpack24(in, inpos, out, outpos);
+            break;
+        case 25:
+            fastunpack25(in, inpos, out, outpos);
+            break;
+        case 26:
+            fastunpack26(in, inpos, out, outpos);
+            break;
+        case 27:
+            fastunpack27(in, inpos, out, outpos);
+            break;
+        case 28:
+            fastunpack28(in, inpos, out, outpos);
+            break;
+        case 29:
+            fastunpack29(in, inpos, out, outpos);
+            break;
+        case 30:
+            fastunpack30(in, inpos, out, outpos);
+            break;
+        case 31:
+            fastunpack31(in, inpos, out, outpos);
+            break;
+        case 32:
+            System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE);
+            break;
+        }
+    }
+
+ /**
+ * Scalar routine that packs inlen integers, b bits each, one value at a
+ * time.
+ *
+ * Values are masked to their low b bits and OR-ed into the output, so it
+ * appears the destination words must be zero before the call (only the
+ * word receiving the spilled high bits of a straddling value is assigned).
+ * A width of 32 is a plain array copy. A width of 0 with inlen greater
+ * than 0 fails with an ArithmeticException because of the "% b" below.
+ *
+ * NOTE(review): for b below 32 the returned index looks like the last word
+ * written, while for b == 32 it is one past the last word written --
+ * confirm that callers expect this difference.
+ *
+ * @param in
+ * source array
+ * @param inpos
+ * position of the first value in the source array
+ * @param inlen
+ * number of values to pack
+ * @param out
+ * output array
+ * @param outpos
+ * position of the first output word
+ * @param b
+ * number of bits to use per integer
+ * @return outpos when inlen is 0, outpos + inlen when b is 32, otherwise
+ * the output index reached when the loop ends
+ */
+ public static int slowpack(final int[] in, int inpos, int inlen,
+ final int[] out, int outpos, int b) {
+ if (inlen == 0)
+ return outpos;
+ if (b == 32) {
+ System.arraycopy(in, inpos, out, outpos, inlen);
+ return outpos + inlen;
+ }
+ int mask = (1 << b) - 1;
+ int c = 0; // bits added to the current output word by values placed in it
+ int l = 0; // bits left in the word once whole values are placed (0 if none straddles)
+ int r = 0; // bits at the start of the current word taken by the previous value's spill
+ int val = 0;
+ for (int i = 0; i < inlen; i++) {
+ val = in[inpos + i] & mask;
+ // c + r is the bit offset of this value inside the current word
+ out[outpos] |= val << (c + r);
+ c += b;
+ l = (32 - r) % b;
+ if (c + r >= 32) {
+ // the word is full: advance unless this was the last value and it ended exactly on the boundary
+ if (i < inlen - 1 || l != 0)
+ outpos++;
+ r = l == 0 ? 0 : b - l;
+ // a straddling value leaves its high b - l bits at the start of the next word
+ if (l != 0)
+ out[outpos] = val >> (b - r);
+ c = 0;
+ }
+ }
+ return outpos;
+ }
+
+ /**
+ * Scalar routine that unpacks outlen integers of b bits each, one input
+ * word at a time.
+ *
+ * A width of 32 is a plain array copy. A width of 0 with outlen greater
+ * than 0 fails with an ArithmeticException because of the "% b" below.
+ *
+ * @param in
+ * source array holding the packed words
+ * @param inpos
+ * position of the first packed word
+ * @param out
+ * output array
+ * @param outpos
+ * position of the first unpacked value
+ * @param outlen
+ * number of values to unpack
+ * @param b
+ * number of bits used per integer
+ * @return inpos when outlen is 0, inpos + outlen when b is 32, otherwise
+ * inpos plus the number of input words read by the loop
+ */
+ public static int slowunpack(final int[] in, int inpos, final int[] out,
+ int outpos, int outlen, int b) {
+ if (outlen == 0) {
+ return inpos;
+ }
+ if (b == 32) {
+ System.arraycopy(in, inpos, out, outpos, outlen);
+ return inpos + outlen;
+ }
+ int mask = (1 << b) - 1;
+ int limit = outpos + outlen;
+ int r = 0; // bits at the start of the current word that belong to a value begun in the previous word
+ int val = 0;
+ int i = 0;
+ for (; outpos < limit; i++) {
+ // rebuild a straddling value: top b - r bits of the previous word plus the low r bits of this one
+ if (r > 0)
+ out[outpos++] =
+ (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask);
+ val = in[inpos + i];
+ int j = 0;
+ int l = 32 - r;
+ // ll bounds the bit offsets of the values that lie entirely inside this word
+ int ll = l % b == 0 ? l : l - b;
+ while (j < ll && outpos < limit) {
+ out[outpos++] = (val >> (j + r)) & mask;
+ j += b;
+ }
+ r = l % b == 0 ? 0 : b - (l % b);
+ }
+ return inpos + i;
+ }
+
+    /**
+     * Computes an output size, in ints, for n values of b bits each.
+     *
+     * Even widths use VLEN_512 lanes and odd widths VLEN_256 lanes. When n
+     * does not exceed the lane count the result is n itself; otherwise each
+     * lane is taken to hold 32 / b whole values and the result is rounded up
+     * to a whole number of vectors.
+     *
+     * A width of 0, or above 32, with n larger than the lane count fails with
+     * an ArithmeticException (division by zero).
+     *
+     * @param n
+     *            number of values
+     * @param b
+     *            number of bits per value
+     * @return the computed number of ints
+     */
+    public static int numCompressedInts(int n, int b) {
+        final int lanes = (b % 2 == 0) ? VLEN_512 : VLEN_256;
+        if (n <= lanes) {
+            return n;
+        }
+        final int valuesPerVector = (32 / b) * lanes;
+        // Ceiling division: number of vectors needed to hold n values.
+        final int vectors = (n + valuesPerVector - 1) / valuesPerVector;
+        return vectors * lanes;
+    }
+
+ /**
+  * Combines 32 {@code SPECIES_256} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}, into one
+  * vector stored to {@code out} at {@code outpos}. The k-th vector is
+  * ANDed with {@code MASK_1} (presumably the low-bit mask; confirm at its
+  * declaration) and shifted left by k bits before being ORed in.
+  */
+ private static void fastpack1(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 23));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 25));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 27));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 28));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 29));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 30));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .and(MASK_1).lanewise(VectorOperators.LSHL, 31));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Combines 16 {@code SPECIES_512} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240}, into one
+  * vector stored to {@code out} at {@code outpos}. The k-th vector is
+  * ANDed with {@code MASK_2} (presumably the low-2-bit mask; confirm at
+  * its declaration) and shifted left by 2k bits before being ORed in.
+  */
+ private static void fastpack2(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 28));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .and(MASK_2).lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 {@code SPECIES_256} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248} and ANDed
+  * with {@code MASK_3} (presumably the low-3-bit mask; confirm at its
+  * declaration), into three vectors stored to {@code out} at
+  * {@code outpos}, {@code outpos + VLEN_256} and
+  * {@code outpos + 2 * VLEN_256}. Consecutive inputs are placed 3 bits
+  * apart; an input that does not fit in the current output vector has its
+  * remaining high bits carried into the next one.
+  */
+ private static void fastpack3(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 27));
+     // This input straddles output vectors 0 and 1.
+     IntVector split = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_3);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 25));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 28));
+     // This input straddles output vectors 1 and 2.
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_3);
+     acc.or(split.lanewise(VectorOperators.LSHL, 31))
+             .intoArray(out, outpos + VLEN_256);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 23));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .and(MASK_3).lanewise(VectorOperators.LSHL, 29));
+     acc.intoArray(out, outpos + 2 * VLEN_256);
+ }
+
+ /**
+  * Packs 16 {@code SPECIES_512} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240} and ANDed
+  * with {@code MASK_4} (presumably the low-4-bit mask; confirm at its
+  * declaration), into two vectors stored to {@code out} at {@code outpos}
+  * and {@code outpos + VLEN_512}. Each output vector holds eight inputs
+  * placed 4 bits apart, so no input straddles two output vectors.
+  */
+ private static void fastpack4(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0: inputs 0..7.
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos);
+
+     // Output vector 1: inputs 8..15. The accumulator is restarted directly
+     // from the first masked load; nothing is carried over.
+     acc = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .and(MASK_4).lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos + VLEN_512);
+ }
+
+ /**
+  * Packs 32 {@code SPECIES_256} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248} and ANDed
+  * with {@code MASK_5} (presumably the low-5-bit mask; confirm at its
+  * declaration), into five vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_256} for k = 0..4. Consecutive inputs are
+  * placed 5 bits apart; an input that does not fit in the current output
+  * vector has its remaining high bits carried into the next one.
+  */
+ private static void fastpack5(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 25));
+     IntVector split = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_5);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 23));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_5);
+     acc.or(split.lanewise(VectorOperators.LSHL, 28))
+             .intoArray(out, outpos + VLEN_256);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 26));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_5);
+     acc.or(split.lanewise(VectorOperators.LSHL, 31))
+             .intoArray(out, outpos + 2 * VLEN_256);
+
+     // Output vector 3.
+     acc = split.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 24));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_5);
+     acc.or(split.lanewise(VectorOperators.LSHL, 29))
+             .intoArray(out, outpos + 3 * VLEN_256);
+
+     // Output vector 4.
+     acc = split.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .and(MASK_5).lanewise(VectorOperators.LSHL, 27));
+     acc.intoArray(out, outpos + 4 * VLEN_256);
+ }
+
+ /**
+  * Packs 16 {@code SPECIES_512} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240} and ANDed
+  * with {@code MASK_6} (presumably the low-6-bit mask; confirm at its
+  * declaration), into three vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_512} for k = 0..2. Consecutive inputs are
+  * placed 6 bits apart; an input that does not fit in the current output
+  * vector has its remaining high bits carried into the next one.
+  */
+ private static void fastpack6(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_6);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 24));
+     IntVector split = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_6);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 22));
+     split = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_6);
+     acc.or(split.lanewise(VectorOperators.LSHL, 28))
+             .intoArray(out, outpos + VLEN_512);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .and(MASK_6).lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos + 2 * VLEN_512);
+ }
+
+ /**
+  * Packs 32 {@code SPECIES_256} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248} and ANDed
+  * with {@code MASK_7} (presumably the low-7-bit mask; confirm at its
+  * declaration), into seven vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_256} for k = 0..6. Consecutive inputs are
+  * placed 7 bits apart; an input that does not fit in the current output
+  * vector has its remaining high bits carried into the next one.
+  */
+ private static void fastpack7(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 21));
+     IntVector split = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 24));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 31))
+             .intoArray(out, outpos + VLEN_256);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 20));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 27))
+             .intoArray(out, outpos + 2 * VLEN_256);
+
+     // Output vector 3.
+     acc = split.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 23));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30))
+             .intoArray(out, outpos + 3 * VLEN_256);
+
+     // Output vector 4.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 19));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 26))
+             .intoArray(out, outpos + 4 * VLEN_256);
+
+     // Output vector 5.
+     acc = split.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 22));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_7);
+     acc.or(split.lanewise(VectorOperators.LSHL, 29))
+             .intoArray(out, outpos + 5 * VLEN_256);
+
+     // Output vector 6.
+     acc = split.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .and(MASK_7).lanewise(VectorOperators.LSHL, 25));
+     acc.intoArray(out, outpos + 6 * VLEN_256);
+ }
+
+ /**
+  * Packs 16 {@code SPECIES_512} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240} and ANDed
+  * with {@code MASK_8} (presumably the low-8-bit mask; confirm at its
+  * declaration), into four vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_512} for k = 0..3. Each output vector holds
+  * four inputs placed 8 bits apart, so no input straddles two output
+  * vectors.
+  */
+ private static void fastpack8(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0: inputs 0..3.
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos);
+
+     // Output vector 1: inputs 4..7. Each later accumulator is restarted
+     // directly from its first masked load; nothing is carried over.
+     acc = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + VLEN_512);
+
+     // Output vector 2: inputs 8..11.
+     acc = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + 2 * VLEN_512);
+
+     // Output vector 3: inputs 12..15.
+     acc = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .and(MASK_8).lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + 3 * VLEN_512);
+ }
+
+ /**
+  * Packs 32 {@code SPECIES_256} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248} and ANDed
+  * with {@code MASK_9} (presumably the low-9-bit mask; confirm at its
+  * declaration), into nine vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_256} for k = 0..8. Consecutive inputs are
+  * placed 9 bits apart; an input that does not fit in the current output
+  * vector has its remaining high bits carried into the next one.
+  */
+ private static void fastpack9(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_9);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 18));
+     IntVector split = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 22));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 31))
+             .intoArray(out, outpos + VLEN_256);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 17));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 26))
+             .intoArray(out, outpos + 2 * VLEN_256);
+
+     // Output vector 3.
+     acc = split.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 21));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30))
+             .intoArray(out, outpos + 3 * VLEN_256);
+
+     // Output vector 4.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 16));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 25))
+             .intoArray(out, outpos + 4 * VLEN_256);
+
+     // Output vector 5.
+     acc = split.lanewise(VectorOperators.LSHR, 7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 20));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 29))
+             .intoArray(out, outpos + 5 * VLEN_256);
+
+     // Output vector 6.
+     acc = split.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 15));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 24))
+             .intoArray(out, outpos + 6 * VLEN_256);
+
+     // Output vector 7.
+     acc = split.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 19));
+     split = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_9);
+     acc.or(split.lanewise(VectorOperators.LSHL, 28))
+             .intoArray(out, outpos + 7 * VLEN_256);
+
+     // Output vector 8.
+     acc = split.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .and(MASK_9).lanewise(VectorOperators.LSHL, 23));
+     acc.intoArray(out, outpos + 8 * VLEN_256);
+ }
+
+ /**
+  * Packs 16 {@code SPECIES_512} vectors, loaded from {@code in} at
+  * {@code inpos}, {@code inpos + 16}, ..., {@code inpos + 240} and ANDed
+  * with {@code MASK_10} (presumably the low-10-bit mask; confirm at its
+  * declaration), into five vectors stored to {@code out} at
+  * {@code outpos + k * VLEN_512} for k = 0..4. Consecutive inputs are
+  * placed 10 bits apart; an input that does not fit in the current output
+  * vector has its remaining high bits carried into the next one.
+  */
+ private static void fastpack10(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Output vector 0.
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_10);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 20));
+     IntVector split = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_10);
+     acc.or(split.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Output vector 1.
+     acc = split.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 18));
+     split = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_10);
+     acc.or(split.lanewise(VectorOperators.LSHL, 28))
+             .intoArray(out, outpos + VLEN_512);
+
+     // Output vector 2.
+     acc = split.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 16));
+     split = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_10);
+     acc.or(split.lanewise(VectorOperators.LSHL, 26))
+             .intoArray(out, outpos + 2 * VLEN_512);
+
+     // Output vector 3.
+     acc = split.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 14));
+     split = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_10);
+     acc.or(split.lanewise(VectorOperators.LSHL, 24))
+             .intoArray(out, outpos + 3 * VLEN_512);
+
+     // Output vector 4.
+     acc = split.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .and(MASK_10).lanewise(VectorOperators.LSHL, 22));
+     acc.intoArray(out, outpos + 4 * VLEN_512);
+ }
+
+ /** Packs 32 SPECIES_256 loads of in (inpos, inpos + 8, ..., inpos + 248), each ANDed with MASK_11, lane by lane
+  * at 11-bit strides into 11 vectors stored to out at outpos and at every following VLEN_256 step. */
+ private static void fastpack11(final int[] in, int inpos, final int[] out, int outpos) {
+ var acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_11);
+ var src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+ // Word is full: store it, then start the next word from the bits of src that did not fit.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 10);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 9);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 8);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 7);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 6);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 5);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 4);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 3);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 2);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 1);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_11);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+ // The last input ends exactly at bit 31 of the word, so nothing remains to carry.
+ acc.intoArray(out, outpos);
+ }
+
+ /** Packs 16 SPECIES_512 loads of in (inpos, inpos + 16, ..., inpos + 240), each ANDed with MASK_12, lane by lane
+  * at 12-bit strides into 6 vectors stored to out at outpos and at every following VLEN_512 step. */
+ private static void fastpack12(final int[] in, int inpos, final int[] out, int outpos) {
+ var acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_12);
+ var src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+ // Word is full: store it, then start the next word from the bits of src that did not fit.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+ acc = src.lanewise(VectorOperators.LSHR, 8);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 4);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+ // This input ends exactly at bit 31: store the word and restart from an all-zero accumulator.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_12);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 8);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 4);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_12);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+ // The last input ends exactly at bit 31 of the word, so nothing remains to carry.
+ acc.intoArray(out, outpos);
+ }
+
+ /** Packs 32 SPECIES_256 loads of in (inpos, inpos + 8, ..., inpos + 248), each ANDed with MASK_13, lane by lane
+  * at 13-bit strides into 13 vectors stored to out at outpos and at every following VLEN_256 step. */
+ private static void fastpack13(final int[] in, int inpos, final int[] out, int outpos) {
+ var acc = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_13);
+ var src = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+ // Word is full: store it, then start the next word from the bits of src that did not fit.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 6);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 12);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 5);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 11);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 4);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 10);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 3);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 9);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 2);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 8);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 1);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ acc = src.lanewise(VectorOperators.LSHR, 7);
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+
+ src = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_13);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+ // The last input ends exactly at bit 31 of the word, so nothing remains to carry.
+ acc.intoArray(out, outpos);
+ }
+
+ /** Packs 16 SPECIES_512 loads of in (inpos, inpos + 16, ..., inpos + 240), each ANDed with MASK_14, lane by lane
+  * at 14-bit strides into 7 vectors stored to out at outpos and at every following VLEN_512 step. */
+ private static void fastpack14(final int[] in, int inpos, final int[] out, int outpos) {
+ var acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_14);
+ var src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+ // Word is full: store it, then start the next word from the bits of src that did not fit.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+ acc = src.lanewise(VectorOperators.LSHR, 4);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 8);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 12);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 2);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 6);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = src.lanewise(VectorOperators.LSHR, 10);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_14);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+ // The last input ends exactly at bit 31 of the word, so nothing remains to carry.
+ acc.intoArray(out, outpos);
+ }
+
+ private static void fastpack15(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_15);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /** Packs 16 SPECIES_512 loads of in (inpos, inpos + 16, ..., inpos + 240), each ANDed with MASK_16, lane by lane
+  * at 16-bit strides into 8 vectors stored to out at outpos and at every following VLEN_512 step. */
+ private static void fastpack16(final int[] in, int inpos, final int[] out, int outpos) {
+ var acc = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_16);
+ var src = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+ // Two masked inputs (shifts 0 and 16) make up each word: store it and restart from zero every time.
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ acc = IntVector.zero(SPECIES_512);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_16);
+ acc = acc.or(src);
+
+ src = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_16);
+ acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+
+ acc.intoArray(out, outpos);
+ }
+
+ private static void fastpack17(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_17);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 16 vectors of ints from {@code in} into 9 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_512} at a stride of 16 ints
+ * (offsets {@code inpos} through {@code inpos + 240}), ANDed with
+ * {@code MASK_18}, shifted to the next free bit position and ORed into the
+ * output word being assembled. Whenever a word is full it is stored to
+ * {@code out} and the write position moves forward by {@code VLEN_512}; the
+ * bits of the last value that did not fit are carried into the next word.
+ *
+ * NOTE(review): presumably {@code MASK_18} keeps the low 18 bits of each lane
+ * and {@code SPECIES_512} has 16 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack18(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ int dst = outpos;
+
+ // Word 0: value 0 at bit 0, value 1 at bit 18 (its top 4 bits spill over).
+ IntVector cur = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_18);
+ IntVector acc = cur;
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 18));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 1: carry of value 1, value 2 at bit 4, value 3 at bit 22.
+ acc = cur.lanewise(VectorOperators.LSHR, 14);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 22));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 2: carry of value 3, value 4 at bit 8, value 5 at bit 26.
+ acc = cur.lanewise(VectorOperators.LSHR, 10);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 26));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 3: carry of value 5, value 6 at bit 12, value 7 at bit 30.
+ acc = cur.lanewise(VectorOperators.LSHR, 6);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 30));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 4: carry of value 7, value 8 at bit 16.
+ acc = cur.lanewise(VectorOperators.LSHR, 2);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 5: carry of value 8, value 9 at bit 2, value 10 at bit 20.
+ acc = cur.lanewise(VectorOperators.LSHR, 16);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 20));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 6: carry of value 10, value 11 at bit 6, value 12 at bit 24.
+ acc = cur.lanewise(VectorOperators.LSHR, 12);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 24));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 7: carry of value 12, value 13 at bit 10, value 14 at bit 28.
+ acc = cur.lanewise(VectorOperators.LSHR, 8);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 28));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 8: carry of value 14, value 15 at bit 14; last word of the block.
+ acc = cur.lanewise(VectorOperators.LSHR, 4);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_18);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+ acc.intoArray(out, dst);
+ }
+
+ /**
+ * Bit-packs 32 vectors of ints from {@code in} into 19 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_256} at a stride of 8 ints
+ * (offsets {@code inpos} through {@code inpos + 248}), ANDed with
+ * {@code MASK_19}, shifted left to the next free bit position and ORed into
+ * the accumulator {@code oV}. Each time the accumulator is full it is stored
+ * to {@code out} and {@code outpos} advances by {@code VLEN_256}; the next
+ * accumulator is then seeded with the bits of the last value that did not fit
+ * (the logical right shift that follows each store).
+ *
+ * NOTE(review): presumably {@code MASK_19} keeps the low 19 bits of each lane
+ * and {@code SPECIES_256} has 8 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack19(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_19);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Carry: the previous value was placed at bit 19, so its upper bits
+ // (shifted down by 32 - 19 = 13) start the next output word.
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ // Final (19th) store: 13 + 19 = 32, so the last value ends exactly on a
+ // word boundary and nothing is carried out of the block.
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 16 vectors of ints from {@code in} into 10 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_512} at a stride of 16 ints
+ * (offsets {@code inpos} through {@code inpos + 240}), ANDed with
+ * {@code MASK_20}, shifted left to the next free bit position and ORed into
+ * the accumulator {@code oV}. Each time the accumulator is full it is stored
+ * to {@code out} and {@code outpos} advances by {@code VLEN_512}; the next
+ * accumulator is seeded with the bits of the last value that did not fit.
+ * After 8 values (8 * 20 = 160 bits = 5 words) the layout realigns on a word
+ * boundary, so the second half repeats the shift pattern of the first.
+ *
+ * NOTE(review): presumably {@code MASK_20} keeps the low 20 bits of each lane
+ * and {@code SPECIES_512} has 16 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack20(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_20);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Carry: upper bits of the value placed at bit 20 (32 - 20 = 12).
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Word boundary reached with nothing to carry: start the next word
+ // directly from the masked value, exactly like the first word above.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ // Final (10th) store: 12 + 20 = 32, the block ends on a word boundary.
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 32 vectors of ints from {@code in} into 21 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_256} at a stride of 8 ints
+ * (offsets {@code inpos} through {@code inpos + 248}), ANDed with
+ * {@code MASK_21}, shifted left to the next free bit position and ORed into
+ * the accumulator {@code oV}. Each time the accumulator is full it is stored
+ * to {@code out} and {@code outpos} advances by {@code VLEN_256}; the next
+ * accumulator is then seeded with the bits of the last value that did not fit
+ * (the logical right shift that follows each store).
+ *
+ * NOTE(review): presumably {@code MASK_21} keeps the low 21 bits of each lane
+ * and {@code SPECIES_256} has 8 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack21(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_21);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Carry: the previous value was placed at bit 21, so its upper bits
+ // (shifted down by 32 - 21 = 11) start the next output word.
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ // Final (21st) store: 11 + 21 = 32, so the last value ends exactly on a
+ // word boundary and nothing is carried out of the block.
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 16 vectors of ints from {@code in} into 11 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_512} at a stride of 16 ints
+ * (offsets {@code inpos} through {@code inpos + 240}), ANDed with
+ * {@code MASK_22}, shifted to the next free bit position and ORed into the
+ * output word being assembled. Whenever a word is full it is stored to
+ * {@code out} and the write position moves forward by {@code VLEN_512}; the
+ * bits of the last value that did not fit are carried into the next word.
+ *
+ * NOTE(review): presumably {@code MASK_22} keeps the low 22 bits of each lane
+ * and {@code SPECIES_512} has 16 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack22(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ int dst = outpos;
+
+ // Word 0: value 0 at bit 0, value 1 at bit 22.
+ IntVector cur = IntVector.fromArray(SPECIES_512, in, inpos).and(MASK_22);
+ IntVector acc = cur;
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 16).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 22));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 1: carry of value 1, value 2 at bit 12.
+ acc = cur.lanewise(VectorOperators.LSHR, 10);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 32).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 12));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 2: carry of value 2, value 3 at bit 2, value 4 at bit 24.
+ acc = cur.lanewise(VectorOperators.LSHR, 20);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 48).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 2));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 64).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 24));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 3: carry of value 4, value 5 at bit 14.
+ acc = cur.lanewise(VectorOperators.LSHR, 8);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 80).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 14));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 4: carry of value 5, value 6 at bit 4, value 7 at bit 26.
+ acc = cur.lanewise(VectorOperators.LSHR, 18);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 96).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 4));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 112).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 26));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 5: carry of value 7, value 8 at bit 16.
+ acc = cur.lanewise(VectorOperators.LSHR, 6);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 128).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 16));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 6: carry of value 8, value 9 at bit 6, value 10 at bit 28.
+ acc = cur.lanewise(VectorOperators.LSHR, 16);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 144).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 6));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 160).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 28));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 7: carry of value 10, value 11 at bit 18.
+ acc = cur.lanewise(VectorOperators.LSHR, 4);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 176).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 18));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 8: carry of value 11, value 12 at bit 8, value 13 at bit 30.
+ acc = cur.lanewise(VectorOperators.LSHR, 14);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 192).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 8));
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 208).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 30));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 9: carry of value 13, value 14 at bit 20.
+ acc = cur.lanewise(VectorOperators.LSHR, 2);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 224).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 20));
+ acc.intoArray(out, dst);
+ dst += VLEN_512;
+
+ // Word 10: carry of value 14, value 15 at bit 10; last word of the block.
+ acc = cur.lanewise(VectorOperators.LSHR, 12);
+ cur = IntVector.fromArray(SPECIES_512, in, inpos + 240).and(MASK_22);
+ acc = acc.or(cur.lanewise(VectorOperators.LSHL, 10));
+ acc.intoArray(out, dst);
+ }
+
+ /**
+ * Bit-packs 32 vectors of ints from {@code in} into 23 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_256} at a stride of 8 ints
+ * (offsets {@code inpos} through {@code inpos + 248}), ANDed with
+ * {@code MASK_23}, shifted left to the next free bit position and ORed into
+ * the accumulator {@code oV}. Each time the accumulator is full it is stored
+ * to {@code out} and {@code outpos} advances by {@code VLEN_256}; the next
+ * accumulator is then seeded with the bits of the last value that did not fit
+ * (the logical right shift that follows each store).
+ *
+ * NOTE(review): presumably {@code MASK_23} keeps the low 23 bits of each lane
+ * and {@code SPECIES_256} has 8 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack23(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_23);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Carry: the previous value was placed at bit 23, so its upper bits
+ // (shifted down by 32 - 23 = 9) start the next output word.
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ // Final (23rd) store: 9 + 23 = 32, so the last value ends exactly on a
+ // word boundary and nothing is carried out of the block.
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 16 vectors of ints from {@code in} into 12 vectors in {@code out}.
+ *
+ * Every input vector is loaded with {@code SPECIES_512} at a stride of 16 ints
+ * (offsets {@code inpos} through {@code inpos + 240}), ANDed with
+ * {@code MASK_24}, shifted left to the next free bit position and ORed into
+ * the accumulator {@code oV}. Each time the accumulator is full it is stored
+ * to {@code out} and {@code outpos} advances by {@code VLEN_512}; the next
+ * accumulator is seeded with the bits of the last value that did not fit.
+ * Four values fill exactly three words (4 * 24 = 96 bits), so the same
+ * four-load / three-store group is repeated four times.
+ *
+ * NOTE(review): presumably {@code MASK_24} keeps the low 24 bits of each lane
+ * and {@code SPECIES_512} has 16 int lanes -- confirm against their
+ * declarations.
+ *
+ * @param in source array
+ * @param inpos index in {@code in} of the first value to pack
+ * @param out destination array
+ * @param outpos index in {@code out} of the first packed word
+ */
+ private static void fastpack24(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ // Group 1: values 0..3 into words 0..2.
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_24);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Carry: upper bits of the value placed at bit 24 (32 - 24 = 8).
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Group 2: values 4..7. Nothing is carried across the boundary, so the
+ // word starts directly from the masked value, as in group 1.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Group 3: values 8..11.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Group 4: values 12..15.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ // Final (12th) store: 8 + 24 = 32, the block ends on a word boundary.
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack25(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_25);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack26(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_26);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack27(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_27);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack28(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_28);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_28).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack29(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_29);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack30(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_30);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 input vectors, masked with {@code MASK_31}, into 31 output
+  * vectors.
+  *
+  * <p>Input vector {@code k} (0..31) is loaded from {@code in} at
+  * {@code inpos + 8 * k}. Within each lane the masked values are laid end to
+  * end, so input {@code k >= 1} is shifted left by {@code 32 - k} into output
+  * word {@code k - 1} and shifted right by {@code k} to start output word
+  * {@code k}. Output word {@code w} is stored at
+  * {@code outpos + w * VLEN_256}.
+  *
+  * <p>NOTE(review): the input stride of 8 presumes SPECIES_256 has eight int
+  * lanes and VLEN_256 == 8 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpack31(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // "carry" holds the bits already placed in the output word being built.
+     var carry = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_31);
+
+     var cur = IntVector.fromArray(SPECIES_256, in, inpos + 8).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 1);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 16).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 2);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 24).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 3);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 32).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 4);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 40).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 5);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 48).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 6);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 56).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 7);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 64).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 8);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 72).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 9);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 80).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 10);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 88).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 11);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 96).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 12);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 104).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 13);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 112).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 14);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 120).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 15);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 128).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 16);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 136).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 17);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 144).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 18);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 152).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 13)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 19);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 160).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 20);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 168).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 11)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 21);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 176).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 22);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 184).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 9)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 23);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 192).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 24);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 200).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 7)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 25);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 208).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 26);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 216).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 5)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 27);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 224).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 4)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 28);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 232).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 3)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 29);
+
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 240).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 2)).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = cur.lanewise(VectorOperators.LSHR, 30);
+
+     // The last input exactly fills the final output word; nothing is carried.
+     cur = IntVector.fromArray(SPECIES_256, in, inpos + 248).and(MASK_31);
+     carry.or(cur.lanewise(VectorOperators.LSHL, 1)).intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 input vectors into a single output vector, one bit per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..31) is loaded from {@code in} at
+  * {@code inpos + 8 * k}, shifted left by {@code k} and ORed into the result,
+  * which is stored once at {@code outpos}. Inputs are not masked, so any bit
+  * above bit 0 of an input is ORed into the positions of later inputs.
+  *
+  * <p>NOTE(review): the input stride of 8 presumes SPECIES_256 has eight int
+  * lanes - confirm against the constant declaration.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} where the packed vector is written
+  */
+ private static void fastpackNoMask1(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .lanewise(VectorOperators.LSHL, 23));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .lanewise(VectorOperators.LSHL, 25));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .lanewise(VectorOperators.LSHL, 27));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 28));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .lanewise(VectorOperators.LSHL, 29));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 30));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .lanewise(VectorOperators.LSHL, 31));
+
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 16 input vectors into a single output vector, two bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..15) is loaded from {@code in} at
+  * {@code inpos + 16 * k}, shifted left by {@code 2 * k} and ORed into the
+  * result, which is stored once at {@code outpos}. Inputs are not masked, so
+  * bits above the low two of an input are ORed into later inputs' positions.
+  *
+  * <p>NOTE(review): the input stride of 16 presumes SPECIES_512 has sixteen
+  * int lanes - confirm against the constant declaration.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} where the packed vector is written
+  */
+ private static void fastpackNoMask2(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var acc = IntVector.fromArray(SPECIES_512, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160)
+             .lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 28));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 30));
+
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 input vectors into 3 output vectors, three bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..31) is loaded from {@code in} at
+  * {@code inpos + 8 * k}. Within each lane the inputs are laid end to end;
+  * an input that crosses a 32-bit boundary is shifted left into the word
+  * being finished and shifted right to start the next one. The three output
+  * words are stored at {@code outpos}, {@code outpos + VLEN_256} and
+  * {@code outpos + 2 * VLEN_256}. Inputs are not masked, so bits above the
+  * low three of an input are ORed into neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 8 presumes SPECIES_256 has eight int
+  * lanes and VLEN_256 == 8 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask3(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Output word 0: inputs 0..9 plus the low 2 bits of input 10.
+     var acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 24));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .lanewise(VectorOperators.LSHL, 27));
+     var edge = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 1: remainder of input 10, inputs 11..20, low bit of 21.
+     acc = edge.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .lanewise(VectorOperators.LSHL, 25));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .lanewise(VectorOperators.LSHL, 28));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 31));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 2: remainder of input 21 and inputs 22..31.
+     acc = edge.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .lanewise(VectorOperators.LSHL, 23));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 26));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .lanewise(VectorOperators.LSHL, 29));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 16 input vectors into 2 output vectors, four bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..15) is loaded from {@code in} at
+  * {@code inpos + 16 * k}. Inputs 0..7 are shifted left by {@code 4 * k} and
+  * ORed into the first output word, stored at {@code outpos}; inputs 8..15
+  * fill the second word the same way, stored at {@code outpos + VLEN_512}.
+  * No input crosses a word boundary. Inputs are not masked, so bits above
+  * the low four of an input are ORed into neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 16 presumes SPECIES_512 has sixteen
+  * int lanes and VLEN_512 == 16 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask4(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     var oV = iV;
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // The second word starts directly from input 8. This replaces a static
+     // IntVector.zero(...) call made through an instance plus an OR with zero.
+     oV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+     oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 input vectors into 5 output vectors, five bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..31) is loaded from {@code in} at
+  * {@code inpos + 8 * k}. Within each lane the inputs are laid end to end;
+  * an input that crosses a 32-bit boundary is shifted left into the word
+  * being finished and shifted right to start the next one. Output word
+  * {@code w} (0..4) is stored at {@code outpos + w * VLEN_256}. Inputs are
+  * not masked, so bits above the low five of an input are ORed into
+  * neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 8 presumes SPECIES_256 has eight int
+  * lanes and VLEN_256 == 8 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask5(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Output word 0.
+     var acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32)
+             .lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .lanewise(VectorOperators.LSHL, 25));
+     var edge = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 1.
+     acc = edge.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72)
+             .lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .lanewise(VectorOperators.LSHL, 23));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 2.
+     acc = edge.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104)
+             .lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .lanewise(VectorOperators.LSHL, 21));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144)
+             .lanewise(VectorOperators.LSHL, 26));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 31));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 3.
+     acc = edge.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .lanewise(VectorOperators.LSHL, 19));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 24));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 29));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 4.
+     acc = edge.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216)
+             .lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 22));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .lanewise(VectorOperators.LSHL, 27));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 16 input vectors into 3 output vectors, six bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..15) is loaded from {@code in} at
+  * {@code inpos + 16 * k}. Within each lane the inputs are laid end to end;
+  * an input that crosses a 32-bit boundary is shifted left into the word
+  * being finished and shifted right to start the next one. Output word
+  * {@code w} (0..2) is stored at {@code outpos + w * VLEN_512}. Inputs are
+  * not masked, so bits above the low six of an input are ORed into
+  * neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 16 presumes SPECIES_512 has sixteen
+  * int lanes and VLEN_512 == 16 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask6(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Output word 0.
+     var acc = IntVector.fromArray(SPECIES_512, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 24));
+     var edge = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Output word 1.
+     acc = edge.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144)
+             .lanewise(VectorOperators.LSHL, 22));
+     edge = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Output word 2.
+     acc = edge.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 20));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 32 input vectors into 7 output vectors, seven bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..31) is loaded from {@code in} at
+  * {@code inpos + 8 * k}. Within each lane the inputs are laid end to end;
+  * an input that crosses a 32-bit boundary is shifted left into the word
+  * being finished and shifted right to start the next one. Output word
+  * {@code w} (0..6) is stored at {@code outpos + w * VLEN_256}. Inputs are
+  * not masked, so bits above the low seven of an input are ORed into
+  * neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 8 presumes SPECIES_256 has eight int
+  * lanes and VLEN_256 == 8 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask7(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Output word 0.
+     var acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8)
+             .lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16)
+             .lanewise(VectorOperators.LSHL, 14));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24)
+             .lanewise(VectorOperators.LSHL, 21));
+     var edge = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 1.
+     acc = edge.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40)
+             .lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48)
+             .lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56)
+             .lanewise(VectorOperators.LSHL, 17));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64)
+             .lanewise(VectorOperators.LSHL, 24));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 31));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 2.
+     acc = edge.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80)
+             .lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88)
+             .lanewise(VectorOperators.LSHL, 13));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96)
+             .lanewise(VectorOperators.LSHL, 20));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 27));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 3.
+     acc = edge.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112)
+             .lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120)
+             .lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128)
+             .lanewise(VectorOperators.LSHL, 16));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136)
+             .lanewise(VectorOperators.LSHL, 23));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 4.
+     acc = edge.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152)
+             .lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160)
+             .lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168)
+             .lanewise(VectorOperators.LSHL, 19));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 5.
+     acc = edge.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184)
+             .lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192)
+             .lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200)
+             .lanewise(VectorOperators.LSHL, 15));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208)
+             .lanewise(VectorOperators.LSHL, 22));
+     edge = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+     acc = acc.or(edge.lanewise(VectorOperators.LSHL, 29));
+     acc.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Output word 6.
+     acc = edge.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224)
+             .lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232)
+             .lanewise(VectorOperators.LSHL, 11));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240)
+             .lanewise(VectorOperators.LSHL, 18));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248)
+             .lanewise(VectorOperators.LSHL, 25));
+     acc.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs 16 input vectors into 4 output vectors, eight bits per input,
+  * without masking.
+  *
+  * <p>Input vector {@code k} (0..15) is loaded from {@code in} at
+  * {@code inpos + 16 * k}. Each group of four consecutive inputs is shifted
+  * left by 0, 8, 16 and 24 and ORed into one output word; output word
+  * {@code w} (0..3) is stored at {@code outpos + w * VLEN_512}. No input
+  * crosses a word boundary. Inputs are not masked, so bits above the low
+  * eight of an input are ORed into neighbouring fields.
+  *
+  * <p>NOTE(review): the input stride of 16 presumes SPECIES_512 has sixteen
+  * int lanes and VLEN_512 == 16 - confirm against the constant declarations.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first input vector
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first output vector
+  */
+ private static void fastpackNoMask8(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     var oV = iV;
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Each later word starts directly from its first input. This replaces a
+     // static IntVector.zero(...) call made through an instance plus an OR
+     // with zero.
+     oV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask9(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 input vectors (loaded at in[inpos],
+  * in[inpos + 16], ..., in[inpos + 240]) into five output vectors, giving
+  * every input lane a 10-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 9 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 16-element input stride suggests SPECIES_512 holds 16
+  * int lanes and VLEN_512 == 16; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_512 for k = 0..4
+  */
+ private static void fastpackNoMask10(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 10, 20, 30 (the last one spills 8 bits).
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 32).lanewise(VectorOperators.LSHL, 20));
+     IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 8, 18, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80).lanewise(VectorOperators.LSHL, 18));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + VLEN_512);
+
+     // Word 2: spilled bits, then fields at bits 6, 16, 26.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 2 * VLEN_512);
+
+     // Word 3: spilled bits, then fields at bits 4, 14, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176).lanewise(VectorOperators.LSHL, 14));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 3 * VLEN_512);
+
+     // Word 4: spilled bits, then fields at bits 2, 12, 22 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 208).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224).lanewise(VectorOperators.LSHL, 12));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240).lanewise(VectorOperators.LSHL, 22));
+     acc.intoArray(out, outpos + 4 * VLEN_512);
+ }
+
+ /**
+  * Packs thirty-two SPECIES_256 input vectors (loaded at in[inpos],
+  * in[inpos + 8], ..., in[inpos + 248]) into eleven output vectors, giving
+  * every input lane an 11-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 10 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 8-element input stride suggests SPECIES_256 holds 8
+  * int lanes and VLEN_256 == 8; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_256 for k = 0..10
+  */
+ private static void fastpackNoMask11(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 11, 22.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8).lanewise(VectorOperators.LSHL, 11));
+     IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 1, 12, 23.
+     acc = carry.lanewise(VectorOperators.LSHR, 10);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos + VLEN_256);
+
+     // Word 2: spilled bits, then fields at bits 2, 13, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 9);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56).lanewise(VectorOperators.LSHL, 13));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 2 * VLEN_256);
+
+     // Word 3: spilled bits, then fields at bits 3, 14, 25.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80).lanewise(VectorOperators.LSHL, 14));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos + 3 * VLEN_256);
+
+     // Word 4: spilled bits, then fields at bits 4, 15, 26.
+     acc = carry.lanewise(VectorOperators.LSHR, 7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104).lanewise(VectorOperators.LSHL, 15));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 4 * VLEN_256);
+
+     // Word 5: spilled bits, then fields at bits 5, 16, 27.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos + 5 * VLEN_256);
+
+     // Word 6: spilled bits, then fields at bits 6, 17, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152).lanewise(VectorOperators.LSHL, 17));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 6 * VLEN_256);
+
+     // Word 7: spilled bits, then fields at bits 7, 18, 29.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168).lanewise(VectorOperators.LSHL, 7));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176).lanewise(VectorOperators.LSHL, 18));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos + 7 * VLEN_256);
+
+     // Word 8: spilled bits, then fields at bits 8, 19, 30.
+     acc = carry.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200).lanewise(VectorOperators.LSHL, 19));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + 8 * VLEN_256);
+
+     // Word 9: spilled bits, then fields at bits 9, 20, 31.
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216).lanewise(VectorOperators.LSHL, 9));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224).lanewise(VectorOperators.LSHL, 20));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos + 9 * VLEN_256);
+
+     // Word 10: spilled bits, then fields at bits 10, 21 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240).lanewise(VectorOperators.LSHL, 10));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248).lanewise(VectorOperators.LSHL, 21));
+     acc.intoArray(out, outpos + 10 * VLEN_256);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 input vectors (loaded at in[inpos],
+  * in[inpos + 16], ..., in[inpos + 240]) into six output vectors, giving
+  * every input lane a 12-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 11 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * The bit layout repeats every eight inputs (8 * 12 = 96 bits = three
+  * 32-bit words), so the second half restarts from a freshly loaded vector
+  * with nothing carried over.
+  * <p>
+  * NOTE(review): the 16-element input stride suggests SPECIES_512 holds 16
+  * int lanes and VLEN_512 == 16; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_512 for k = 0..5
+  */
+ private static void fastpackNoMask12(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 12, 24 (the last one spills 4 bits).
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16).lanewise(VectorOperators.LSHL, 12));
+     IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 4, 16, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 64).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + VLEN_512);
+
+     // Word 2: spilled bits, then fields at bits 8, 20 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 96).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112).lanewise(VectorOperators.LSHL, 20));
+     acc.intoArray(out, outpos + 2 * VLEN_512);
+
+     // Word 3: nothing carried over, so start directly from the loaded vector.
+     acc = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 144).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 3 * VLEN_512);
+
+     // Word 4: spilled bits, then fields at bits 4, 16, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 176).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 4 * VLEN_512);
+
+     // Word 5: spilled bits, then fields at bits 8, 20 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224).lanewise(VectorOperators.LSHL, 8));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240).lanewise(VectorOperators.LSHL, 20));
+     acc.intoArray(out, outpos + 5 * VLEN_512);
+ }
+
+ /**
+  * Packs thirty-two SPECIES_256 input vectors (loaded at in[inpos],
+  * in[inpos + 8], ..., in[inpos + 248]) into thirteen output vectors,
+  * giving every input lane a 13-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 12 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 8-element input stride suggests SPECIES_256 holds 8
+  * int lanes and VLEN_256 == 8; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_256 for k = 0..12
+  */
+ private static void fastpackNoMask13(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 13, 26.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8).lanewise(VectorOperators.LSHL, 13));
+     IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 7, 20.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24).lanewise(VectorOperators.LSHL, 7));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + VLEN_256);
+
+     // Word 2: spilled bits, then fields at bits 1, 14, 27.
+     acc = carry.lanewise(VectorOperators.LSHR, 12);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48).lanewise(VectorOperators.LSHL, 14));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos + 2 * VLEN_256);
+
+     // Word 3: spilled bits, then fields at bits 8, 21.
+     acc = carry.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64).lanewise(VectorOperators.LSHL, 8));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos + 3 * VLEN_256);
+
+     // Word 4: spilled bits, then fields at bits 2, 15, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 11);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88).lanewise(VectorOperators.LSHL, 15));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 4 * VLEN_256);
+
+     // Word 5: spilled bits, then fields at bits 9, 22.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104).lanewise(VectorOperators.LSHL, 9));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 5 * VLEN_256);
+
+     // Word 6: spilled bits, then fields at bits 3, 16, 29.
+     acc = carry.lanewise(VectorOperators.LSHR, 10);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120).lanewise(VectorOperators.LSHL, 3));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos + 6 * VLEN_256);
+
+     // Word 7: spilled bits, then fields at bits 10, 23.
+     acc = carry.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144).lanewise(VectorOperators.LSHL, 10));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos + 7 * VLEN_256);
+
+     // Word 8: spilled bits, then fields at bits 4, 17, 30.
+     acc = carry.lanewise(VectorOperators.LSHR, 9);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168).lanewise(VectorOperators.LSHL, 17));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + 8 * VLEN_256);
+
+     // Word 9: spilled bits, then fields at bits 11, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184).lanewise(VectorOperators.LSHL, 11));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 9 * VLEN_256);
+
+     // Word 10: spilled bits, then fields at bits 5, 18, 31.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200).lanewise(VectorOperators.LSHL, 5));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208).lanewise(VectorOperators.LSHL, 18));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos + 10 * VLEN_256);
+
+     // Word 11: spilled bits, then fields at bits 12, 25.
+     acc = carry.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos + 11 * VLEN_256);
+
+     // Word 12: spilled bits, then fields at bits 6, 19 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240).lanewise(VectorOperators.LSHL, 6));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248).lanewise(VectorOperators.LSHL, 19));
+     acc.intoArray(out, outpos + 12 * VLEN_256);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 input vectors (loaded at in[inpos],
+  * in[inpos + 16], ..., in[inpos + 240]) into seven output vectors, giving
+  * every input lane a 14-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 13 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 16-element input stride suggests SPECIES_512 holds 16
+  * int lanes and VLEN_512 == 16; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_512 for k = 0..6
+  */
+ private static void fastpackNoMask14(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 14, 28.
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 16).lanewise(VectorOperators.LSHL, 14));
+     IntVector carry = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 10, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 48).lanewise(VectorOperators.LSHL, 10));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + VLEN_512);
+
+     // Word 2: spilled bits, then fields at bits 6, 20.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 80).lanewise(VectorOperators.LSHL, 6));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + 2 * VLEN_512);
+
+     // Word 3: spilled bits, then fields at bits 2, 16, 30.
+     acc = carry.lanewise(VectorOperators.LSHR, 12);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 112).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 128).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + 3 * VLEN_512);
+
+     // Word 4: spilled bits, then fields at bits 12, 26.
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 160).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 4 * VLEN_512);
+
+     // Word 5: spilled bits, then fields at bits 8, 22.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 192).lanewise(VectorOperators.LSHL, 8));
+     carry = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 5 * VLEN_512);
+
+     // Word 6: spilled bits, then fields at bits 4, 18 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 10);
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 224).lanewise(VectorOperators.LSHL, 4));
+     acc = acc.or(IntVector.fromArray(SPECIES_512, in, inpos + 240).lanewise(VectorOperators.LSHL, 18));
+     acc.intoArray(out, outpos + 6 * VLEN_512);
+ }
+
+ /**
+  * Packs thirty-two SPECIES_256 input vectors (loaded at in[inpos],
+  * in[inpos + 8], ..., in[inpos + 248]) into fifteen output vectors,
+  * giving every input lane a 15-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 14 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 8-element input stride suggests SPECIES_256 holds 8
+  * int lanes and VLEN_256 == 8; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_256 for k = 0..14
+  */
+ private static void fastpackNoMask15(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 15, 30.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 8).lanewise(VectorOperators.LSHL, 15));
+     IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 13, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 24).lanewise(VectorOperators.LSHL, 13));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + VLEN_256);
+
+     // Word 2: spilled bits, then fields at bits 11, 26.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 40).lanewise(VectorOperators.LSHL, 11));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 2 * VLEN_256);
+
+     // Word 3: spilled bits, then fields at bits 9, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 56).lanewise(VectorOperators.LSHL, 9));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 3 * VLEN_256);
+
+     // Word 4: spilled bits, then fields at bits 7, 22.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 72).lanewise(VectorOperators.LSHL, 7));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 4 * VLEN_256);
+
+     // Word 5: spilled bits, then fields at bits 5, 20.
+     acc = carry.lanewise(VectorOperators.LSHR, 10);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 88).lanewise(VectorOperators.LSHL, 5));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + 5 * VLEN_256);
+
+     // Word 6: spilled bits, then fields at bits 3, 18.
+     acc = carry.lanewise(VectorOperators.LSHR, 12);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 104).lanewise(VectorOperators.LSHL, 3));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos + 6 * VLEN_256);
+
+     // Word 7: spilled bits, then fields at bits 1, 16, 31.
+     acc = carry.lanewise(VectorOperators.LSHR, 14);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 120).lanewise(VectorOperators.LSHL, 1));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 128).lanewise(VectorOperators.LSHL, 16));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos + 7 * VLEN_256);
+
+     // Word 8: spilled bits, then fields at bits 14, 29.
+     acc = carry.lanewise(VectorOperators.LSHR, 1);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 144).lanewise(VectorOperators.LSHL, 14));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos + 8 * VLEN_256);
+
+     // Word 9: spilled bits, then fields at bits 12, 27.
+     acc = carry.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 160).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos + 9 * VLEN_256);
+
+     // Word 10: spilled bits, then fields at bits 10, 25.
+     acc = carry.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 176).lanewise(VectorOperators.LSHL, 10));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos + 10 * VLEN_256);
+
+     // Word 11: spilled bits, then fields at bits 8, 23.
+     acc = carry.lanewise(VectorOperators.LSHR, 7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 192).lanewise(VectorOperators.LSHL, 8));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos + 11 * VLEN_256);
+
+     // Word 12: spilled bits, then fields at bits 6, 21.
+     acc = carry.lanewise(VectorOperators.LSHR, 9);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 208).lanewise(VectorOperators.LSHL, 6));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos + 12 * VLEN_256);
+
+     // Word 13: spilled bits, then fields at bits 4, 19.
+     acc = carry.lanewise(VectorOperators.LSHR, 11);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 224).lanewise(VectorOperators.LSHL, 4));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos + 13 * VLEN_256);
+
+     // Word 14: spilled bits, then fields at bits 2, 17 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 13);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 240).lanewise(VectorOperators.LSHL, 2));
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248).lanewise(VectorOperators.LSHL, 17));
+     acc.intoArray(out, outpos + 14 * VLEN_256);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 input vectors (loaded at in[inpos],
+  * in[inpos + 16], ..., in[inpos + 240]) into eight output vectors, giving
+  * every input lane a 16-bit field.
+  * <p>
+  * Each output vector is built from one consecutive pair of input vectors:
+  * the first fills bits 0..15 and the second, shifted left by 16, fills
+  * bits 16..31. Packing is lane-wise, and nothing is carried from one
+  * output vector to the next. No mask is applied, so bits above bit 15 of
+  * the first vector of a pair are not cleared and overlap the second.
+  * <p>
+  * NOTE(review): the 16-element input stride suggests SPECIES_512 holds 16
+  * int lanes and VLEN_512 == 16; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_512 for k = 0..7
+  */
+ private static void fastpackNoMask16(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Kept fully unrolled, one statement group per output vector.
+     IntVector low = IntVector.fromArray(SPECIES_512, in, inpos);
+     IntVector high = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 2 * VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 3 * VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 4 * VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 5 * VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 6 * VLEN_512);
+
+     low = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     high = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 7 * VLEN_512);
+ }
+
+ /**
+  * Packs thirty-two SPECIES_256 input vectors (loaded at in[inpos],
+  * in[inpos + 8], ..., in[inpos + 248]) into seventeen output vectors,
+  * giving every input lane a 17-bit field.
+  * <p>
+  * Packing is lane-wise: lane j of an output vector only receives bits from
+  * lane j of the inputs. No mask is applied, so input bits above bit 16 are
+  * not cleared and can overlap adjacent fields.
+  * <p>
+  * NOTE(review): the 8-element input stride suggests SPECIES_256 holds 8
+  * int lanes and VLEN_256 == 8; confirm against their declarations.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input element
+  * @param out    destination array
+  * @param outpos index of the first output element; results are written at
+  *               outpos + k * VLEN_256 for k = 0..16
+  */
+ private static void fastpackNoMask17(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Word 0: fields at bits 0, 17.
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+     IntVector carry = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos);
+
+     // Word 1: spilled bits, then fields at bits 2, 19.
+     acc = carry.lanewise(VectorOperators.LSHR, 15);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 16).lanewise(VectorOperators.LSHL, 2));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos + VLEN_256);
+
+     // Word 2: spilled bits, then fields at bits 4, 21.
+     acc = carry.lanewise(VectorOperators.LSHR, 13);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 32).lanewise(VectorOperators.LSHL, 4));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos + 2 * VLEN_256);
+
+     // Word 3: spilled bits, then fields at bits 6, 23.
+     acc = carry.lanewise(VectorOperators.LSHR, 11);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 48).lanewise(VectorOperators.LSHL, 6));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos + 3 * VLEN_256);
+
+     // Word 4: spilled bits, then fields at bits 8, 25.
+     acc = carry.lanewise(VectorOperators.LSHR, 9);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 64).lanewise(VectorOperators.LSHL, 8));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos + 4 * VLEN_256);
+
+     // Word 5: spilled bits, then fields at bits 10, 27.
+     acc = carry.lanewise(VectorOperators.LSHR, 7);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 80).lanewise(VectorOperators.LSHL, 10));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos + 5 * VLEN_256);
+
+     // Word 6: spilled bits, then fields at bits 12, 29.
+     acc = carry.lanewise(VectorOperators.LSHR, 5);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 96).lanewise(VectorOperators.LSHL, 12));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos + 6 * VLEN_256);
+
+     // Word 7: spilled bits, then fields at bits 14, 31.
+     acc = carry.lanewise(VectorOperators.LSHR, 3);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 112).lanewise(VectorOperators.LSHL, 14));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos + 7 * VLEN_256);
+
+     // Word 8: spilled bits, then a single field at bit 16 (which itself spills).
+     acc = carry.lanewise(VectorOperators.LSHR, 1);
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 8 * VLEN_256);
+
+     // Word 9: spilled bits, then fields at bits 1, 18.
+     acc = carry.lanewise(VectorOperators.LSHR, 16);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 136).lanewise(VectorOperators.LSHL, 1));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos + 9 * VLEN_256);
+
+     // Word 10: spilled bits, then fields at bits 3, 20.
+     acc = carry.lanewise(VectorOperators.LSHR, 14);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 152).lanewise(VectorOperators.LSHL, 3));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + 10 * VLEN_256);
+
+     // Word 11: spilled bits, then fields at bits 5, 22.
+     acc = carry.lanewise(VectorOperators.LSHR, 12);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 168).lanewise(VectorOperators.LSHL, 5));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 11 * VLEN_256);
+
+     // Word 12: spilled bits, then fields at bits 7, 24.
+     acc = carry.lanewise(VectorOperators.LSHR, 10);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 184).lanewise(VectorOperators.LSHL, 7));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 12 * VLEN_256);
+
+     // Word 13: spilled bits, then fields at bits 9, 26.
+     acc = carry.lanewise(VectorOperators.LSHR, 8);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 200).lanewise(VectorOperators.LSHL, 9));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 13 * VLEN_256);
+
+     // Word 14: spilled bits, then fields at bits 11, 28.
+     acc = carry.lanewise(VectorOperators.LSHR, 6);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 216).lanewise(VectorOperators.LSHL, 11));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 14 * VLEN_256);
+
+     // Word 15: spilled bits, then fields at bits 13, 30.
+     acc = carry.lanewise(VectorOperators.LSHR, 4);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 232).lanewise(VectorOperators.LSHL, 13));
+     carry = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+     acc.or(carry.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + 15 * VLEN_256);
+
+     // Word 16: spilled bits, then the final field at bit 15 (ends exactly at bit 32).
+     acc = carry.lanewise(VectorOperators.LSHR, 2);
+     acc = acc.or(IntVector.fromArray(SPECIES_256, in, inpos + 248).lanewise(VectorOperators.LSHL, 15));
+     acc.intoArray(out, outpos + 16 * VLEN_256);
+ }
+
+ /**
+  * Bit-packs 16 input vectors of species {@code SPECIES_512}, 18 bits per
+  * value, into 9 packed vectors.
+  * <p>
+  * Input vector k is read from {@code in[inpos + 16 * k]} and packed vector
+  * j is written to {@code out[outpos + j * VLEN_512]}. All operations are
+  * lane-wise, so each output lane only combines values taken from the same
+  * lane of successive input vectors.
+  * <p>
+  * No mask is applied: input bits above bit 17 are OR-ed into neighbouring
+  * fields, so callers are expected to pass values that already fit in 18 bits.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value read
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first value written
+  */
+ private static void fastpackNoMask18(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // packed vector 0
+     var src = IntVector.fromArray(SPECIES_512, in, inpos);
+     var acc = src;
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+     acc.intoArray(out, outpos);
+
+     // packed vector 1 (starts with the bits of src that did not fit above)
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+     acc.intoArray(out, outpos + VLEN_512);
+
+     // packed vector 2
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos + 2 * VLEN_512);
+
+     // packed vector 3
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos + 3 * VLEN_512);
+
+     // packed vector 4
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+     acc.intoArray(out, outpos + 4 * VLEN_512);
+
+     // packed vector 5
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+     acc.intoArray(out, outpos + 5 * VLEN_512);
+
+     // packed vector 6
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + 6 * VLEN_512);
+
+     // packed vector 7
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos + 7 * VLEN_512);
+
+     // packed vector 8
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+     acc.intoArray(out, outpos + 8 * VLEN_512);
+ }
+
+ private static void fastpackNoMask19(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Bit-packs 16 input vectors of species {@code SPECIES_512}, 20 bits per
+  * value, into 10 packed vectors.
+  * <p>
+  * Input vector k is read from {@code in[inpos + 16 * k]}; packed vectors
+  * are written to {@code out} starting at {@code outpos}, advancing by
+  * {@code VLEN_512} after each store. All operations are lane-wise, so each
+  * output lane only combines values taken from the same lane of successive
+  * input vectors. The shift schedule repeats after 8 inputs (8 * 20 bits
+  * fill exactly five 32-bit words).
+  * <p>
+  * No mask is applied: input bits above bit 19 are OR-ed into neighbouring
+  * fields, so callers are expected to pass values that already fit in 20 bits.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value read
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first value written
+  */
+ private static void fastpackNoMask20(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // The previous packed word ended exactly on a 32-bit boundary, so there
+ // is no carry: the next word starts directly from this input vector.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV;
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask21(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Bit-packs 16 input vectors of species {@code SPECIES_512}, 22 bits per
+  * value, into 11 packed vectors.
+  * <p>
+  * Input vector k is read from {@code in[inpos + 16 * k]} and packed vector
+  * j is written to {@code out[outpos + j * VLEN_512]}. All operations are
+  * lane-wise, so each output lane only combines values taken from the same
+  * lane of successive input vectors.
+  * <p>
+  * No mask is applied: input bits above bit 21 are OR-ed into neighbouring
+  * fields, so callers are expected to pass values that already fit in 22 bits.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value read
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first value written
+  */
+ private static void fastpackNoMask22(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // packed vector 0
+     var src = IntVector.fromArray(SPECIES_512, in, inpos);
+     var acc = src;
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+     acc.intoArray(out, outpos);
+
+     // packed vector 1 (starts with the bits of src that did not fit above)
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+     acc.intoArray(out, outpos + VLEN_512);
+
+     // packed vector 2
+     acc = src.lanewise(VectorOperators.LSHR, 20);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + 2 * VLEN_512);
+
+     // packed vector 3
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+     acc.intoArray(out, outpos + 3 * VLEN_512);
+
+     // packed vector 4
+     acc = src.lanewise(VectorOperators.LSHR, 18);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos + 4 * VLEN_512);
+
+     // packed vector 5
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+     acc.intoArray(out, outpos + 5 * VLEN_512);
+
+     // packed vector 6
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos + 6 * VLEN_512);
+
+     // packed vector 7
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+     acc.intoArray(out, outpos + 7 * VLEN_512);
+
+     // packed vector 8
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos + 8 * VLEN_512);
+
+     // packed vector 9
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+     acc.intoArray(out, outpos + 9 * VLEN_512);
+
+     // packed vector 10
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+     acc.intoArray(out, outpos + 10 * VLEN_512);
+ }
+
+ /**
+  * Bit-packs 32 input vectors of species {@code SPECIES_256}, 23 bits per
+  * value, into 23 packed vectors.
+  * <p>
+  * Input vector k is read from {@code in[inpos + 8 * k]} and packed vector
+  * j is written to {@code out[outpos + j * VLEN_256]}. All operations are
+  * lane-wise, so each output lane only combines values taken from the same
+  * lane of successive input vectors.
+  * <p>
+  * No mask is applied: input bits above bit 22 are OR-ed into neighbouring
+  * fields, so callers are expected to pass values that already fit in 23 bits.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value read
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first value written
+  */
+ private static void fastpackNoMask23(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // packed vector 0
+     var src = IntVector.fromArray(SPECIES_256, in, inpos);
+     var acc = src;
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 23));
+     acc.intoArray(out, outpos);
+
+     // packed vector 1 (starts with the bits of src that did not fit above)
+     acc = src.lanewise(VectorOperators.LSHR, 9);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 14));
+     acc.intoArray(out, outpos + VLEN_256);
+
+     // packed vector 2
+     acc = src.lanewise(VectorOperators.LSHR, 18);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 5));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 28));
+     acc.intoArray(out, outpos + 2 * VLEN_256);
+
+     // packed vector 3
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 19));
+     acc.intoArray(out, outpos + 3 * VLEN_256);
+
+     // packed vector 4
+     acc = src.lanewise(VectorOperators.LSHR, 13);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 10));
+     acc.intoArray(out, outpos + 4 * VLEN_256);
+
+     // packed vector 5
+     acc = src.lanewise(VectorOperators.LSHR, 22);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 1));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 24));
+     acc.intoArray(out, outpos + 5 * VLEN_256);
+
+     // packed vector 6
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 15));
+     acc.intoArray(out, outpos + 6 * VLEN_256);
+
+     // packed vector 7
+     acc = src.lanewise(VectorOperators.LSHR, 17);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 6));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 29));
+     acc.intoArray(out, outpos + 7 * VLEN_256);
+
+     // packed vector 8
+     acc = src.lanewise(VectorOperators.LSHR, 3);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 20));
+     acc.intoArray(out, outpos + 8 * VLEN_256);
+
+     // packed vector 9
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 11));
+     acc.intoArray(out, outpos + 9 * VLEN_256);
+
+     // packed vector 10
+     acc = src.lanewise(VectorOperators.LSHR, 21);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 25));
+     acc.intoArray(out, outpos + 10 * VLEN_256);
+
+     // packed vector 11
+     acc = src.lanewise(VectorOperators.LSHR, 7);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 16));
+     acc.intoArray(out, outpos + 11 * VLEN_256);
+
+     // packed vector 12
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 7));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 30));
+     acc.intoArray(out, outpos + 12 * VLEN_256);
+
+     // packed vector 13
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 21));
+     acc.intoArray(out, outpos + 13 * VLEN_256);
+
+     // packed vector 14
+     acc = src.lanewise(VectorOperators.LSHR, 11);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 12));
+     acc.intoArray(out, outpos + 14 * VLEN_256);
+
+     // packed vector 15
+     acc = src.lanewise(VectorOperators.LSHR, 20);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 3));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 26));
+     acc.intoArray(out, outpos + 15 * VLEN_256);
+
+     // packed vector 16
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 17));
+     acc.intoArray(out, outpos + 16 * VLEN_256);
+
+     // packed vector 17
+     acc = src.lanewise(VectorOperators.LSHR, 15);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 8));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 31));
+     acc.intoArray(out, outpos + 17 * VLEN_256);
+
+     // packed vector 18
+     acc = src.lanewise(VectorOperators.LSHR, 1);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 22));
+     acc.intoArray(out, outpos + 18 * VLEN_256);
+
+     // packed vector 19
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 13));
+     acc.intoArray(out, outpos + 19 * VLEN_256);
+
+     // packed vector 20
+     acc = src.lanewise(VectorOperators.LSHR, 19);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 27));
+     acc.intoArray(out, outpos + 20 * VLEN_256);
+
+     // packed vector 21
+     acc = src.lanewise(VectorOperators.LSHR, 5);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 18));
+     acc.intoArray(out, outpos + 21 * VLEN_256);
+
+     // packed vector 22
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 9));
+     acc.intoArray(out, outpos + 22 * VLEN_256);
+ }
+
+ /**
+  * Bit-packs 16 input vectors of species {@code SPECIES_512}, 24 bits per
+  * value, into 12 packed vectors.
+  * <p>
+  * Input vector k is read from {@code in[inpos + 16 * k]}; packed vectors
+  * are written to {@code out} starting at {@code outpos}, advancing by
+  * {@code VLEN_512} after each store. All operations are lane-wise, so each
+  * output lane only combines values taken from the same lane of successive
+  * input vectors. The shift schedule repeats after 4 inputs (4 * 24 bits
+  * fill exactly three 32-bit words).
+  * <p>
+  * No mask is applied: input bits above bit 23 are OR-ed into neighbouring
+  * fields, so callers are expected to pass values that already fit in 24 bits.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value read
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first value written
+  */
+ private static void fastpackNoMask24(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // The previous packed word ended exactly on a 32-bit boundary, so there
+ // is no carry: the next word starts directly from this input vector.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV;
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Word boundary again: no carry from the previous input.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV;
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Word boundary again: no carry from the previous input.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV;
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask25(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 vectors from {@code in} into thirteen vectors in
+  * {@code out}, giving each input value a 26-bit field.
+  *
+  * <p>Lanes are packed independently: lane j of every output vector is
+  * assembled only from lane j of consecutive input vectors. Inputs are not
+  * masked, so bits above bit 25 are not cleared and can spill into the
+  * neighbouring field.
+  *
+  * <p>NOTE(review): the input stride of 16 presumably equals the lane count
+  * of SPECIES_512 and the value of VLEN_512 - confirm against their
+  * definitions.
+  *
+  * @param in     source values, read at {@code inpos + 16 * k}, k = 0..15
+  * @param inpos  index of the first value to read
+  * @param out    destination, written at {@code outpos + k * VLEN_512}, k = 0..12
+  * @param outpos index of the first packed word to write
+  */
+ private static void fastpackNoMask26(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // acc is the partially filled output word; src is the most recent input.
+     IntVector src;
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos);
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos + 2 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 18);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos + 3 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 24);
+
+     // The input at inpos + 80 lies wholly inside word 4, so nothing is stored yet.
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 2));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 4 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 5 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 6 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos + 7 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 22);
+
+     // The input at inpos + 160 lies wholly inside word 8.
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc = acc.or(src.lanewise(VectorOperators.LSHL, 4));
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + 8 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 9 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos + 10 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos + 11 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 20);
+
+     // The last input fills word 12 exactly, so no carry remains.
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos + 12 * VLEN_512);
+ }
+
+ private static void fastpackNoMask27(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 vectors from {@code in} into fourteen vectors in
+  * {@code out}, giving each input value a 28-bit field.
+  *
+  * <p>Lanes are packed independently: lane j of every output vector is
+  * assembled only from lane j of consecutive input vectors. Inputs are not
+  * masked, so bits above bit 27 are not cleared and can spill into the
+  * neighbouring field.
+  *
+  * <p>Eight 28-bit values fill seven 32-bit words exactly, so the routine is
+  * two identical halves: inputs 0..7 produce words 0..6 and inputs 8..15
+  * produce words 7..13.
+  *
+  * <p>NOTE(review): the input stride of 16 presumably equals the lane count
+  * of SPECIES_512 and the value of VLEN_512 - confirm against their
+  * definitions.
+  *
+  * @param in     source values, read at {@code inpos + 16 * k}, k = 0..15
+  * @param inpos  index of the first value to read
+  * @param out    destination, written at {@code outpos + k * VLEN_512}, k = 0..13
+  * @param outpos index of the first packed word to write
+  */
+ private static void fastpackNoMask28(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // First half: inputs 0..7 -> words 0..6.
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     var oV = iV;
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Second half: inputs 8..15 -> words 7..13. The previous word ended
+     // exactly on a value boundary, so there is no carry and the next word
+     // starts from this input alone (no zero vector / OR needed).
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     oV = iV;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+     // The last input fills word 13 exactly, so no carry remains.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask29(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs sixteen SPECIES_512 vectors from {@code in} into fifteen vectors in
+  * {@code out}, giving each input value a 30-bit field.
+  *
+  * <p>Lanes are packed independently: lane j of every output vector is
+  * assembled only from lane j of consecutive input vectors. Inputs are not
+  * masked, so bits above bit 29 are not cleared and can spill into the
+  * neighbouring field.
+  *
+  * <p>Every input after the first completes one output word: input k
+  * (k = 1..15) is shifted left by {@code 32 - 2 * k} to finish word k - 1 and
+  * its remaining high part, shifted right by {@code 2 * k}, opens word k.
+  *
+  * <p>NOTE(review): the input stride of 16 presumably equals the lane count
+  * of SPECIES_512 and the value of VLEN_512 - confirm against their
+  * definitions.
+  *
+  * @param in     source values, read at {@code inpos + 16 * k}, k = 0..15
+  * @param inpos  index of the first value to read
+  * @param out    destination, written at {@code outpos + k * VLEN_512}, k = 0..14
+  * @param outpos index of the first packed word to write
+  */
+ private static void fastpackNoMask30(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // acc is the partially filled output word; src is the most recent input.
+     IntVector src;
+     IntVector acc = IntVector.fromArray(SPECIES_512, in, inpos);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos);
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 2 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 3 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 4 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + 5 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos + 6 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 7 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos + 8 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 18);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos + 9 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 20);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos + 10 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 22);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos + 11 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 24);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+     acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos + 12 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 26);
+
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+     acc.or(src.lanewise(VectorOperators.LSHL, 4)).intoArray(out, outpos + 13 * VLEN_512);
+     acc = src.lanewise(VectorOperators.LSHR, 28);
+
+     // The last input fills word 14 exactly, so no carry remains.
+     src = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+     acc.or(src.lanewise(VectorOperators.LSHL, 2)).intoArray(out, outpos + 14 * VLEN_512);
+ }
+
+ /**
+  * Packs thirty-two SPECIES_256 vectors from {@code in} into thirty-one
+  * vectors in {@code out}, giving each input value a 31-bit field.
+  *
+  * <p>Lanes are packed independently: lane j of every output vector is
+  * assembled only from lane j of consecutive input vectors. Inputs are not
+  * masked, so bit 31 is not cleared and can spill into the neighbouring
+  * field.
+  *
+  * <p>Every input after the first completes one output word: input k
+  * (k = 1..31) is shifted left by {@code 32 - k} to finish word k - 1 and its
+  * remaining high part, shifted right by {@code k}, opens word k.
+  *
+  * <p>NOTE(review): the input stride of 8 presumably equals the lane count
+  * of SPECIES_256 and the value of VLEN_256 - confirm against their
+  * definitions.
+  *
+  * @param in     source values, read at {@code inpos + 8 * k}, k = 0..31
+  * @param inpos  index of the first value to read
+  * @param out    destination, written at {@code outpos + k * VLEN_256}, k = 0..30
+  * @param outpos index of the first packed word to write
+  */
+ private static void fastpackNoMask31(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // acc is the partially filled output word; src is the most recent input.
+     IntVector src;
+     IntVector acc = IntVector.fromArray(SPECIES_256, in, inpos);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     acc.or(src.lanewise(VectorOperators.LSHL, 31)).intoArray(out, outpos);
+     acc = src.lanewise(VectorOperators.LSHR, 1);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     acc.or(src.lanewise(VectorOperators.LSHL, 30)).intoArray(out, outpos + VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 2);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     acc.or(src.lanewise(VectorOperators.LSHL, 29)).intoArray(out, outpos + 2 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 3);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     acc.or(src.lanewise(VectorOperators.LSHL, 28)).intoArray(out, outpos + 3 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 4);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     acc.or(src.lanewise(VectorOperators.LSHL, 27)).intoArray(out, outpos + 4 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 5);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     acc.or(src.lanewise(VectorOperators.LSHL, 26)).intoArray(out, outpos + 5 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 6);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     acc.or(src.lanewise(VectorOperators.LSHL, 25)).intoArray(out, outpos + 6 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 7);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     acc.or(src.lanewise(VectorOperators.LSHL, 24)).intoArray(out, outpos + 7 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 8);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     acc.or(src.lanewise(VectorOperators.LSHL, 23)).intoArray(out, outpos + 8 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 9);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     acc.or(src.lanewise(VectorOperators.LSHL, 22)).intoArray(out, outpos + 9 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 10);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     acc.or(src.lanewise(VectorOperators.LSHL, 21)).intoArray(out, outpos + 10 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 11);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     acc.or(src.lanewise(VectorOperators.LSHL, 20)).intoArray(out, outpos + 11 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 12);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     acc.or(src.lanewise(VectorOperators.LSHL, 19)).intoArray(out, outpos + 12 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 13);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     acc.or(src.lanewise(VectorOperators.LSHL, 18)).intoArray(out, outpos + 13 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 14);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+     acc.or(src.lanewise(VectorOperators.LSHL, 17)).intoArray(out, outpos + 14 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 15);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+     acc.or(src.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 15 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 16);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     acc.or(src.lanewise(VectorOperators.LSHL, 15)).intoArray(out, outpos + 16 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 17);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+     acc.or(src.lanewise(VectorOperators.LSHL, 14)).intoArray(out, outpos + 17 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 18);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     acc.or(src.lanewise(VectorOperators.LSHL, 13)).intoArray(out, outpos + 18 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 19);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+     acc.or(src.lanewise(VectorOperators.LSHL, 12)).intoArray(out, outpos + 19 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 20);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+     acc.or(src.lanewise(VectorOperators.LSHL, 11)).intoArray(out, outpos + 20 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 21);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     acc.or(src.lanewise(VectorOperators.LSHL, 10)).intoArray(out, outpos + 21 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 22);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+     acc.or(src.lanewise(VectorOperators.LSHL, 9)).intoArray(out, outpos + 22 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 23);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+     acc.or(src.lanewise(VectorOperators.LSHL, 8)).intoArray(out, outpos + 23 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 24);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+     acc.or(src.lanewise(VectorOperators.LSHL, 7)).intoArray(out, outpos + 24 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 25);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+     acc.or(src.lanewise(VectorOperators.LSHL, 6)).intoArray(out, outpos + 25 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 26);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+     acc.or(src.lanewise(VectorOperators.LSHL, 5)).intoArray(out, outpos + 26 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 27);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+     acc.or(src.lanewise(VectorOperators.LSHL, 4)).intoArray(out, outpos + 27 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 28);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+     acc.or(src.lanewise(VectorOperators.LSHL, 3)).intoArray(out, outpos + 28 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 29);
+
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+     acc.or(src.lanewise(VectorOperators.LSHL, 2)).intoArray(out, outpos + 29 * VLEN_256);
+     acc = src.lanewise(VectorOperators.LSHR, 30);
+
+     // The last input fills word 30 exactly, so no carry remains.
+     src = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+     acc.or(src.lanewise(VectorOperators.LSHL, 1)).intoArray(out, outpos + 30 * VLEN_256);
+ }
+
+ private static void fastunpack1(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack2(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack3(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 27).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 25).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 29).and(MASK_3).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack4(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ var oV = iV.and(MASK_4);
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xf).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack5(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack6(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack7(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack8(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ var oV = iV.and(MASK_8);
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack9(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack10(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack11(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+ * Unpacks 16 vectors of 12-bit fields from 6 consecutive SPECIES_512 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 16, ..., inpos + 80; output vector k is
+ * stored to out at outpos + k * VLEN_512. Within each lane the loaded words are consumed low
+ * bits first: field k occupies bits [12k, 12k + 12) of that lane's bit stream, and a field
+ * that straddles two words is rebuilt from the high bits of the current word and the low
+ * bits of the next one.
+ * Assumes MASK_12 keeps the low 12 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack12(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ word.and(MASK_12).intoArray(out, outpos);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos + VLEN_512);
+
+ // Straddling field: keep the top 8 bits of this word, then add the low 4 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 2 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos + 3 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos + 4 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 5 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos + 6 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos + 7 * VLEN_512);
+
+ // 8 fields * 12 bits fill exactly 3 words, so the next field starts on a word boundary
+ // and there is no carry to merge.
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word.and(0xfff).intoArray(out, outpos + 8 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos + 9 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 10 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos + 11 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos + 12 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 13 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos + 14 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos + 15 * VLEN_512);
+ }
+
+ private static void fastunpack13(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+ * Unpacks 16 vectors of 14-bit fields from 7 consecutive SPECIES_512 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 16, ..., inpos + 96; output vector k is
+ * stored to out at outpos + k * VLEN_512. Within each lane the loaded words are consumed low
+ * bits first: field k occupies bits [14k, 14k + 14) of that lane's bit stream, and a field
+ * that straddles two words is rebuilt from the high bits of the current word and the low
+ * bits of the next one.
+ * Assumes MASK_14 keeps the low 14 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack14(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ word.and(MASK_14).intoArray(out, outpos);
+ word.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos + VLEN_512);
+
+ // Straddling field: keep the top 4 bits of this word, then add the low 10 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word.and(0x3ff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 2 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos + 3 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word.and(0x3f).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 4 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos + 5 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word.and(3).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, outpos + 6 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos + 7 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos + 8 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word.and(0xfff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, outpos + 9 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos + 10 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, outpos + 11 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos + 12 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_14);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, outpos + 13 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos + 14 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos + 15 * VLEN_512);
+ }
+
+ /**
+ * Unpacks 32 vectors of 15-bit fields from 15 consecutive SPECIES_256 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 8, ..., inpos + 112; output vector k is
+ * stored to out at outpos + k * VLEN_256. Within each lane the loaded words are consumed low
+ * bits first: field k occupies bits [15k, 15k + 15) of that lane's bit stream, and a field
+ * that straddles two words is rebuilt from the high bits of the current word and the low
+ * bits of the next one.
+ * Assumes MASK_15 keeps the low 15 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack15(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ word.and(MASK_15).intoArray(out, outpos);
+ word.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, outpos + VLEN_256);
+
+ // Straddling field: keep the top 2 bits of this word, then add the low 13 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word.and(0x1fff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, outpos + 2 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, outpos + 3 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word.and(0x7ff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 4 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, outpos + 5 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word.and(0x1ff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, outpos + 6 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, outpos + 7 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word.and(0x7f).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 8 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, outpos + 9 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word.and(0x1f).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, outpos + 10 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, outpos + 11 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word.and(7).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, outpos + 12 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, outpos + 13 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word.and(1).lanewise(VectorOperators.LSHL, 14).or(carry).intoArray(out, outpos + 14 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, outpos + 15 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, outpos + 16 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word.and(0x3fff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, outpos + 17 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, outpos + 18 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word.and(0xfff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, outpos + 19 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, outpos + 20 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word.and(0x3ff).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, outpos + 21 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, outpos + 22 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, outpos + 23 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, outpos + 24 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word.and(0x3f).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, outpos + 25 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, outpos + 26 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 11).or(carry).intoArray(out, outpos + 27 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, outpos + 28 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_15);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word.and(3).lanewise(VectorOperators.LSHL, 13).or(carry).intoArray(out, outpos + 29 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, outpos + 30 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, outpos + 31 * VLEN_256);
+ }
+
+ /**
+ * Unpacks 16 vectors of 16-bit fields from 8 consecutive SPECIES_512 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 16, ..., inpos + 112; output vector k
+ * is stored to out at outpos + k * VLEN_512. Every loaded word yields exactly two fields per
+ * lane (its low half, then its high half), so no field straddles two words and no carry
+ * between loads is needed.
+ * Assumes MASK_16 keeps the low 16 bits of every lane (declared outside this method).
+ * NOTE(review): the low half of the first word is masked with MASK_16 while the other words
+ * use the literal 0xffff; presumably both select the same bits - confirm and unify.
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack16(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ word.and(MASK_16).intoArray(out, outpos);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word.and(0xffff).intoArray(out, outpos + 2 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 3 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word.and(0xffff).intoArray(out, outpos + 4 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 5 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word.and(0xffff).intoArray(out, outpos + 6 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 7 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word.and(0xffff).intoArray(out, outpos + 8 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 9 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word.and(0xffff).intoArray(out, outpos + 10 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 11 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word.and(0xffff).intoArray(out, outpos + 12 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 13 * VLEN_512);
+
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ word.and(0xffff).intoArray(out, outpos + 14 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos + 15 * VLEN_512);
+ }
+
+ /**
+ * Unpacks 32 vectors of 17-bit fields from 17 consecutive SPECIES_256 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 8, ..., inpos + 128; output vector k is
+ * stored to out at outpos + k * VLEN_256. Within each lane the loaded words are consumed low
+ * bits first: field k occupies bits [17k, 17k + 17) of that lane's bit stream, and a field
+ * that straddles two words is rebuilt from the high bits of the current word and the low
+ * bits of the next one.
+ * Assumes MASK_17 keeps the low 17 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack17(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ word.and(MASK_17).intoArray(out, outpos);
+
+ // Straddling field: keep the top 15 bits of this word, then add the low 2 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 17).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word.and(3).lanewise(VectorOperators.LSHL, 15).or(carry).intoArray(out, outpos + VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos + 2 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 13).or(carry).intoArray(out, outpos + 3 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos + 4 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word.and(0x3f).lanewise(VectorOperators.LSHL, 11).or(carry).intoArray(out, outpos + 5 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos + 6 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, outpos + 7 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos + 8 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word.and(0x3ff).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, outpos + 9 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos + 10 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word.and(0xfff).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, outpos + 11 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos + 12 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word.and(0x3fff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, outpos + 13 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos + 14 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word.and(0xffff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, outpos + 15 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word.and(1).lanewise(VectorOperators.LSHL, 16).or(carry).intoArray(out, outpos + 16 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos + 17 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word.and(7).lanewise(VectorOperators.LSHL, 14).or(carry).intoArray(out, outpos + 18 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos + 19 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word.and(0x1f).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, outpos + 20 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos + 21 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word.and(0x7f).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, outpos + 22 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos + 23 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word.and(0x1ff).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 24 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos + 25 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word.and(0x7ff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, outpos + 26 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos + 27 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word.and(0x1fff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 28 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos + 29 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_17);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word.and(0x7fff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, outpos + 30 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos + 31 * VLEN_256);
+ }
+
+ /**
+ * Unpacks 16 vectors of 18-bit fields from 9 consecutive SPECIES_512 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 16, ..., inpos + 128; output vector k
+ * is stored to out at outpos + k * VLEN_512. Within each lane the loaded words are consumed
+ * low bits first: field k occupies bits [18k, 18k + 18) of that lane's bit stream, and a
+ * field that straddles two words is rebuilt from the high bits of the current word and the
+ * low bits of the next one.
+ * Assumes MASK_18 keeps the low 18 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack18(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ word.and(MASK_18).intoArray(out, outpos);
+
+ // Straddling field: keep the top 14 bits of this word, then add the low 4 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 14).or(carry).intoArray(out, outpos + VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos + 2 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, outpos + 3 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos + 4 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word.and(0xfff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, outpos + 5 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos + 6 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word.and(0xffff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, outpos + 7 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word.and(3).lanewise(VectorOperators.LSHL, 16).or(carry).intoArray(out, outpos + 8 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos + 9 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word.and(0x3f).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, outpos + 10 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos + 11 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ word.and(0x3ff).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 12 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos + 13 * VLEN_512);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_18);
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ word.and(0x3fff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 14 * VLEN_512);
+ word.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos + 15 * VLEN_512);
+ }
+
+ /**
+ * Unpacks 32 vectors of 19-bit fields from 19 consecutive SPECIES_256 input vectors.
+ *
+ * Input vectors are loaded from in at inpos, inpos + 8, ..., inpos + 144; output vector k is
+ * stored to out at outpos + k * VLEN_256. Within each lane the loaded words are consumed low
+ * bits first: field k occupies bits [19k, 19k + 19) of that lane's bit stream, and a field
+ * that straddles two words is rebuilt from the high bits of the current word and the low
+ * bits of the next one.
+ * Assumes MASK_19 keeps the low 19 bits of every lane (declared outside this method).
+ *
+ * @param in packed source array
+ * @param inpos index in in of the first packed word
+ * @param out destination array for the unpacked values
+ * @param outpos index in out of the first unpacked value
+ */
+ private static void fastunpack19(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ word.and(MASK_19).intoArray(out, outpos);
+
+ // Straddling field: keep the top 13 bits of this word, then add the low 6 bits of the next.
+ IntVector carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word.and(0x3f).lanewise(VectorOperators.LSHL, 13).or(carry).intoArray(out, outpos + VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos + 2 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word.and(0xfff).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, outpos + 3 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos + 4 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word.and(0x3ffff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, outpos + 5 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word.and(0x1f).lanewise(VectorOperators.LSHL, 14).or(carry).intoArray(out, outpos + 6 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos + 7 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word.and(0x7ff).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, outpos + 8 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos + 9 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word.and(0x1ffff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, outpos + 10 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 17).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word.and(0xf).lanewise(VectorOperators.LSHL, 15).or(carry).intoArray(out, outpos + 11 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos + 12 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word.and(0x3ff).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, outpos + 13 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos + 14 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word.and(0xffff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, outpos + 15 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word.and(7).lanewise(VectorOperators.LSHL, 16).or(carry).intoArray(out, outpos + 16 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos + 17 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word.and(0x1ff).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, outpos + 18 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos + 19 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word.and(0x7fff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, outpos + 20 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 15).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word.and(3).lanewise(VectorOperators.LSHL, 17).or(carry).intoArray(out, outpos + 21 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos + 22 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word.and(0xff).lanewise(VectorOperators.LSHL, 11).or(carry).intoArray(out, outpos + 23 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos + 24 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word.and(0x3fff).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, outpos + 25 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word.and(1).lanewise(VectorOperators.LSHL, 18).or(carry).intoArray(out, outpos + 26 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos + 27 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ word.and(0x7f).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, outpos + 28 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos + 29 * VLEN_256);
+
+ carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_19);
+ word = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ word.and(0x1fff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, outpos + 30 * VLEN_256);
+ word.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos + 31 * VLEN_256);
+ }
+
+ /**
+  * Unpacks 20-bit values from {@code in} into {@code out} with
+  * {@code SPECIES_512} vectors.
+  *
+  * <p>Ten input vectors are loaded from {@code inpos}, {@code inpos + 16}, ...,
+  * {@code inpos + 144}; sixteen output vectors are stored starting at
+  * {@code outpos}, which is advanced by {@code VLEN_512} after every store.
+  * All operations are lane-wise, so each lane decodes its own bit stream:
+  * the unused high bits of one input word are kept in {@code oV} and OR-ed
+  * with the low bits of the next word to rebuild a value that straddles two
+  * words.
+  *
+  * @param in     array holding the packed words
+  * @param inpos  index in {@code in} of the first packed word to read
+  * @param out    array receiving the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack20(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     iV.and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // 12 bits remain in the first word; they start the next value.
+     var oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // The previous word was consumed up to bit 31, so nothing is carried:
+     // the next value is simply the low 20 bits of the following word.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     iV.and(0xfffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ private static void fastunpack21(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+  * Unpacks 22-bit values from {@code in} into {@code out} with
+  * {@code SPECIES_512} vectors.
+  *
+  * <p>Eleven input vectors are loaded from {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 160}; sixteen output vectors are
+  * stored starting at {@code outpos}, which moves forward by {@code VLEN_512}
+  * after each store. Every operation is lane-wise. {@code carry} holds the
+  * high bits left over from the current word; they are OR-ed with the low
+  * bits of the next word to form a value that spans both.
+  *
+  * @param in     array holding the packed words
+  * @param inpos  index in {@code in} of the first packed word to read
+  * @param out    array receiving the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack22(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Input vector 0
+     var word = IntVector.fromArray(SPECIES_512, in, inpos);
+     word.and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+     var carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_22);
+
+     // Input vector 1
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 10))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 12).and(MASK_22);
+
+     // Input vector 2
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     carry.or(word.and(3).lanewise(VectorOperators.LSHL, 20))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_22);
+
+     // Input vector 3
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 8))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_22);
+
+     // Input vector 4
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 18))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_22);
+
+     // Input vector 5
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 6))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_22);
+
+     // Input vector 6
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 16))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_22);
+
+     // Input vector 7
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     carry.or(word.and(0x3ffff).lanewise(VectorOperators.LSHL, 4))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_22);
+
+     // Input vector 8
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 14))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_22);
+
+     // Input vector 9
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     carry.or(word.and(0xfffff).lanewise(VectorOperators.LSHL, 2))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_22);
+
+     // Input vector 10 (last): finishes exactly at the top bit.
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 12))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ /**
+  * Unpacks 23-bit values from {@code in} into {@code out} with
+  * {@code SPECIES_256} vectors.
+  *
+  * <p>Twenty-three input vectors are loaded from {@code inpos},
+  * {@code inpos + 8}, ..., {@code inpos + 176}; thirty-two output vectors are
+  * stored starting at {@code outpos}, which moves forward by {@code VLEN_256}
+  * after each store. Every operation is lane-wise. {@code carry} holds the
+  * high bits left over from the current word; they are OR-ed with the low
+  * bits of the next word to form a value that spans both.
+  *
+  * @param in     array holding the packed words
+  * @param inpos  index in {@code in} of the first packed word to read
+  * @param out    array receiving the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack23(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Input vector 0
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+     word.and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     var carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_23);
+
+     // Input vector 1
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 9))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_23);
+
+     // Input vector 2
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     carry.or(word.and(0x1f).lanewise(VectorOperators.LSHL, 18))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_23);
+
+     // Input vector 3
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     carry.or(word.and(0x7ffff).lanewise(VectorOperators.LSHL, 4))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_23);
+
+     // Input vector 4
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 13))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 10).and(MASK_23);
+
+     // Input vector 5
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     carry.or(word.and(1).lanewise(VectorOperators.LSHL, 22))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_23);
+
+     // Input vector 6
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     carry.or(word.and(0x7fff).lanewise(VectorOperators.LSHL, 8))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 15).and(MASK_23);
+
+     // Input vector 7
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 17))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_23);
+
+     // Input vector 8
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     carry.or(word.and(0xfffff).lanewise(VectorOperators.LSHL, 3))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_23);
+
+     // Input vector 9
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     carry.or(word.and(0x7ff).lanewise(VectorOperators.LSHL, 12))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 11).and(MASK_23);
+
+     // Input vector 10
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     carry.or(word.and(3).lanewise(VectorOperators.LSHL, 21))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_23);
+
+     // Input vector 11
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 7))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_23);
+
+     // Input vector 12
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     carry.or(word.and(0x7f).lanewise(VectorOperators.LSHL, 16))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_23);
+
+     // Input vector 13
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     carry.or(word.and(0x1fffff).lanewise(VectorOperators.LSHL, 2))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_23);
+
+     // Input vector 14
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 11))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 12).and(MASK_23);
+
+     // Input vector 15
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+     carry.or(word.and(7).lanewise(VectorOperators.LSHL, 20))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_23);
+
+     // Input vector 16
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+     carry.or(word.and(0x1ffff).lanewise(VectorOperators.LSHL, 6))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 17).and(MASK_23);
+
+     // Input vector 17
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+     carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 15))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_23);
+
+     // Input vector 18
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+     carry.or(word.and(0x3fffff).lanewise(VectorOperators.LSHL, 1))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_23);
+
+     // Input vector 19
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+     carry.or(word.and(0x1fff).lanewise(VectorOperators.LSHL, 10))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 13).and(MASK_23);
+
+     // Input vector 20
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+     carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 19))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_23);
+
+     // Input vector 21
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+     carry.or(word.and(0x3ffff).lanewise(VectorOperators.LSHL, 5))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_23);
+
+     // Input vector 22 (last): finishes exactly at the top bit.
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+     carry.or(word.and(0x1ff).lanewise(VectorOperators.LSHL, 14))
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos);
+     outpos += VLEN_256;
+ }
+
+ /**
+  * Unpacks 24-bit values from {@code in} into {@code out} with
+  * {@code SPECIES_512} vectors.
+  *
+  * <p>Twelve input vectors are loaded from {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 176}; sixteen output vectors are
+  * stored starting at {@code outpos}, which is advanced by {@code VLEN_512}
+  * after every store. All operations are lane-wise. The same pattern repeats
+  * every three input words (four values): the unused high bits of one word
+  * are kept in {@code oV} and OR-ed with the low bits of the next word to
+  * rebuild a value that straddles two words.
+  *
+  * @param in     array holding the packed words
+  * @param inpos  index in {@code in} of the first packed word to read
+  * @param out    array receiving the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack24(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     iV.and(MASK_24).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // 8 bits remain in the first word; they start the next value.
+     var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // The previous word was consumed up to bit 31, so nothing is carried:
+     // the next value is simply the low 24 bits of the following word.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     iV.and(0xffffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word boundary again: no carry, store the low 24 bits directly.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     iV.and(0xffffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word boundary again: no carry, store the low 24 bits directly.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     iV.and(0xffffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ private static void fastunpack25(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ var oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+  * Unpacks 26-bit values from {@code in} into {@code out} with
+  * {@code SPECIES_512} vectors.
+  *
+  * <p>Thirteen input vectors are loaded from {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 192}; sixteen output vectors are
+  * stored starting at {@code outpos}, which moves forward by {@code VLEN_512}
+  * after each store. Every operation is lane-wise. {@code carry} holds the
+  * high bits left over from the current word; they are OR-ed with the low
+  * bits of the next word to form a value that spans both.
+  *
+  * @param in     array holding the packed words
+  * @param inpos  index in {@code in} of the first packed word to read
+  * @param out    array receiving the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack26(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Input vector 0
+     var word = IntVector.fromArray(SPECIES_512, in, inpos);
+     word.and(MASK_26).intoArray(out, outpos);
+     outpos += VLEN_512;
+     var carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_26);
+
+     // Input vector 1
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     carry.or(word.and(0xfffff).lanewise(VectorOperators.LSHL, 6))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_26);
+
+     // Input vector 2
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     carry.or(word.and(0x3fff).lanewise(VectorOperators.LSHL, 12))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 14).and(MASK_26);
+
+     // Input vector 3
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     carry.or(word.and(0xff).lanewise(VectorOperators.LSHL, 18))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 8).and(MASK_26);
+
+     // Input vector 4
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     carry.or(word.and(3).lanewise(VectorOperators.LSHL, 24))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_26);
+
+     // Input vector 5
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     carry.or(word.and(0x3fffff).lanewise(VectorOperators.LSHL, 4))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_26);
+
+     // Input vector 6
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     carry.or(word.and(0xffff).lanewise(VectorOperators.LSHL, 10))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 16).and(MASK_26);
+
+     // Input vector 7
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     carry.or(word.and(0x3ff).lanewise(VectorOperators.LSHL, 16))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 10).and(MASK_26);
+
+     // Input vector 8
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+     carry.or(word.and(0xf).lanewise(VectorOperators.LSHL, 22))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_26);
+
+     // Input vector 9
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+     carry.or(word.and(0xffffff).lanewise(VectorOperators.LSHL, 2))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_26);
+
+     // Input vector 10
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+     carry.or(word.and(0x3ffff).lanewise(VectorOperators.LSHL, 8))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_26);
+
+     // Input vector 11
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+     carry.or(word.and(0xfff).lanewise(VectorOperators.LSHL, 14))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 12).and(MASK_26);
+
+     // Input vector 12 (last): finishes exactly at the top bit.
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+     carry.or(word.and(0x3f).lanewise(VectorOperators.LSHL, 20))
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ private static void fastunpack27(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 27-bit fields: 27 vector loads from in[inpos..] become 32 vector stores at out[outpos..].
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+ // Every operation is lanewise, so each lane is an independent bit stream; oV carries the bits left above a field.
+ var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27); // the 5 high bits of word 0 start the next field
+ // NOTE(review): MASK_27 is declared outside this view; presumably (1 << 27) - 1 -- confirm.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 5).or(oV); // low 22 bits of word 1 complete it: 5 + 22 = 27
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos); // word 5 also holds a field starting at bit 2; the carry below resumes at bit 29
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos); // word 10 also holds a field starting at bit 4; the carry below resumes at bit 31
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos); // word 16 also holds a field starting at bit 1; the carry below resumes at bit 28
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos); // word 21 also holds a field starting at bit 3; the carry below resumes at bit 30
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos); // last field: starts at bit 5 of word 26, so the input is fully consumed (5 + 27 = 32)
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack28(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 28-bit fields: 14 vector loads from in[inpos..] become 16 vector stores at out[outpos..].
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_28).intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Every operation is lanewise, so each lane is an independent bit stream; oV carries the bits left above a field.
+ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); // the 4 high bits of word 0 start the next field
+ // NOTE(review): MASK_28 is declared outside this view; presumably (1 << 28) - 1 -- confirm.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV); // low 24 bits of word 1 complete it: 4 + 24 = 28
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); // word 6 also holds a field starting at bit 4, which uses it up (4 + 28 = 32)
+ outpos += VLEN_512;
+ // Second half repeats the same 7-word pattern. zero() is a static factory, so it is called on IntVector, not on oV.
+ oV = IntVector.zero(SPECIES_512); // nothing is carried over from word 6
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0xfffffff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); // last field: starts at bit 4 of word 13, so the input is fully consumed
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack29(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 29-bit fields: 29 vector loads from in[inpos..] become 32 vector stores at out[outpos..].
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_29).intoArray(out, outpos);
+ outpos += VLEN_256;
+ // Every operation is lanewise, so each lane is an independent bit stream; oV carries the bits left above a field.
+ var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29); // the 3 high bits of word 0 start the next field
+ // NOTE(review): MASK_29 is declared outside this view; presumably (1 << 29) - 1 -- confirm.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 3).or(oV); // low 26 bits of word 1 complete it: 3 + 26 = 29
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos); // word 9 also holds a field starting at bit 2; the carry below resumes at bit 31
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos); // word 19 also holds a field starting at bit 1; the carry below resumes at bit 30
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos); // last field: starts at bit 3 of word 28, so the input is fully consumed (3 + 29 = 32)
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack30(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 30-bit fields: 15 vector loads from in[inpos..] become 16 vector stores at out[outpos..].
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_30).intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Every operation is lanewise, so each lane is an independent bit stream; oV carries the bits left above a field.
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); // the 2 high bits of word 0 start the next field
+ // NOTE(review): MASK_30 is declared outside this view; presumably (1 << 30) - 1 -- confirm.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 2).or(oV); // low 28 bits of word 1 complete it: 2 + 28 = 30
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); // last field: starts at bit 2 of word 14, so the input is fully consumed (2 + 30 = 32)
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack31(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 31-bit fields: 31 vector loads from in[inpos..] become 32 vector stores at out[outpos..].
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_31).intoArray(out, outpos);
+ outpos += VLEN_256;
+ // Every operation is lanewise, so each lane is an independent bit stream; oV carries the bits left above a field.
+ var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31); // the top bit of word 0 starts the next field
+ // NOTE(review): MASK_31 is declared outside this view; presumably 0x7fffffff -- confirm.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3fffffff).lanewise(VectorOperators.LSHL, 1).or(oV); // low 30 bits of word 1 complete it: 1 + 30 = 31
+ // From here on each word leaves one more carry bit than the previous one (2, 3, ... 30).
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x1fffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos); // last field: starts at bit 1 of word 30, so the input is fully consumed (1 + 31 = 32)
+ outpos += VLEN_256;
+ }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
new file mode 100644
index 0000000..62a8cc7
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
@@ -0,0 +1,963 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * This is a readable but less efficient version of the VectorBitPacker class.
+ *
+ */
+public class VectorBitPackerTerse {
+ static final VectorSpecies SPECIES_512 = IntVector.SPECIES_512;
+ static final VectorSpecies SPECIES_256 = IntVector.SPECIES_256;
+ static final int VLEN_512 = 16;
+ static final int VLEN_256 = 8;
+ static final int BLOCK_SIZE = 256;
+
+ /**
+  * Packs eight lanes of {@code b}-bit values (odd {@code b}) with 256-bit vectors,
+  * masking every input to its low {@code b} bits first.
+  *
+  * <p>Reads {@code 1 + 31 / b + sum(lc)} input vectors starting at {@code in[inpos]}
+  * and writes {@code b} output vectors starting at {@code out[outpos]}.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param ho     for each output word after the first, the right shift applied to
+  *               the last loaded vector, i.e. how many of its bits were already
+  *               stored in the previous word
+  * @param lc     for each output word after the first, how many further input
+  *               vectors are merged into it
+  */
+ private static void fastpackOddBit(final int[] in, int inpos, final int[] out,
+                                    int outpos, int b, final int[] ho,
+                                    final int[] lc) {
+   final int lowBits = (1 << b) - 1;
+   final int extraLoads = 31 / b;
+
+   // First output word: input vector k lands at bit offset b * k.
+   IntVector loaded = IntVector.fromArray(SPECIES_256, in, inpos);
+   IntVector packed = loaded.and(lowBits);
+   int src = 1;
+   while (src <= extraLoads) {
+     loaded = IntVector.fromArray(SPECIES_256, in, inpos + src * VLEN_256);
+     packed = loaded.and(lowBits).lanewise(VectorOperators.LSHL, b * src).or(packed);
+     src++;
+   }
+   packed.intoArray(out, outpos);
+   outpos += VLEN_256;
+
+   // Remaining output words: start with the bits of the last loaded vector that
+   // did not fit in the previous word, then append lc[word] new vectors.
+   final int remainingWords = b - 1;
+   for (int word = 0; word < remainingWords; word++) {
+     final int spilled = ho[word];
+     packed = loaded.and(lowBits).lanewise(VectorOperators.LSHR, spilled);
+     final int firstShift = b - spilled;
+     for (int k = 0; k < lc[word]; k++, src++) {
+       loaded = IntVector.fromArray(SPECIES_256, in, inpos + src * VLEN_256);
+       packed = loaded.and(lowBits)
+                    .lanewise(VectorOperators.LSHL, b * k + firstShift)
+                    .or(packed);
+     }
+     packed.intoArray(out, outpos);
+     outpos += VLEN_256;
+   }
+ }
+
+ /**
+  * Variant of the odd-width packer that does not mask its inputs: each value is
+  * shifted and OR-ed as is, so any bits above position {@code b - 1} in an input
+  * end up in the packed output.
+  *
+  * <p>Reads {@code 1 + 31 / b + sum(lc)} 256-bit input vectors starting at
+  * {@code in[inpos]} and writes {@code b} output vectors starting at
+  * {@code out[outpos]}.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param ho     for each output word after the first, the right shift applied to
+  *               the last loaded vector
+  * @param lc     for each output word after the first, how many further input
+  *               vectors are merged into it
+  */
+ private static void fastpackOddBitNoMask(final int[] in, int inpos,
+                                          final int[] out, int outpos, int b,
+                                          final int[] ho, final int[] lc) {
+   final int extraLoads = 31 / b;
+
+   // First output word: input vector k lands at bit offset b * k.
+   IntVector loaded = IntVector.fromArray(SPECIES_256, in, inpos);
+   IntVector packed = loaded;
+   int src = 1;
+   while (src <= extraLoads) {
+     loaded = IntVector.fromArray(SPECIES_256, in, inpos + src * VLEN_256);
+     packed = loaded.lanewise(VectorOperators.LSHL, b * src).or(packed);
+     src++;
+   }
+   packed.intoArray(out, outpos);
+   outpos += VLEN_256;
+
+   // Remaining output words: carry over the leftover high bits of the last
+   // loaded vector, then append lc[word] new vectors.
+   final int remainingWords = b - 1;
+   for (int word = 0; word < remainingWords; word++) {
+     final int spilled = ho[word];
+     packed = loaded.lanewise(VectorOperators.LSHR, spilled);
+     final int firstShift = b - spilled;
+     for (int k = 0; k < lc[word]; k++, src++) {
+       loaded = IntVector.fromArray(SPECIES_256, in, inpos + src * VLEN_256);
+       packed = loaded.lanewise(VectorOperators.LSHL, b * k + firstShift).or(packed);
+     }
+     packed.intoArray(out, outpos);
+     outpos += VLEN_256;
+   }
+ }
+
+ /**
+  * Unpacks eight lanes of {@code b}-bit values (odd {@code b}) with 256-bit vectors.
+  *
+  * <p>Reads {@code b} input vectors starting at {@code in[inpos]} and writes
+  * {@code 32 / b + (b - 1) + sum(lc)} output vectors starting at
+  * {@code out[outpos]}.
+  *
+  * @param in     packed source array
+  * @param inpos  index of the first packed integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param lo     for each input word after the first, the bit offset at which its
+  *               first whole value starts
+  * @param masks  for each input word after the first, the mask applied to it to
+  *               obtain the high part of the value that straddles the word boundary
+  * @param lc     for each input word after the first, the number of whole values
+  *               extracted from it
+  */
+ private static void fastUnpackOddBit(final int[] in, int inpos,
+                                      final int[] out, int outpos, int b,
+                                      final int[] lo, int[] masks, int[] lc) {
+   final int mask = (1 << b) - 1;
+   final int N = 32 / b;
+
+   // Whole values contained in the first packed word.
+   var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+   for (int n = 0; n < N; n++) {
+     iV.lanewise(VectorOperators.LSHR, b * n).and(mask).intoArray(out, outpos);
+     outpos += VLEN_256;
+   }
+   // Low part of the value that straddles into the next packed word.
+   var oV = iV.lanewise(VectorOperators.LSHR, b * N).and(mask);
+
+   final int L = b - 1;
+   for (int i = 0; i < L; i++) {
+     // Complete the straddling value with the low bits of the next word.
+     iV = IntVector.fromArray(SPECIES_256, in, inpos + (i + 1) * VLEN_256);
+     oV = iV.and(masks[i]).lanewise(VectorOperators.LSHL, b - lo[i]).or(oV);
+     oV.intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Whole values of this word, starting at bit lo[i].
+     int j = 0;
+     for (; j < lc[i]; j++) {
+       iV.lanewise(VectorOperators.LSHR, b * j + lo[i])
+           .and(mask)
+           .intoArray(out, outpos);
+       outpos += VLEN_256;
+     }
+     // Low part of the next straddling value (unused after the last word).
+     oV = iV.lanewise(VectorOperators.LSHR, b * j + lo[i]).and(mask);
+   }
+ }
+
+ /**
+  * Packs sixteen lanes of {@code b}-bit values (even {@code b}) with 512-bit
+  * vectors, masking every input to its low {@code b} bits first.
+  *
+  * <p>Writes {@code b / 2} output vectors starting at {@code out[outpos]}; the
+  * number of input vectors read is one plus the first-word count plus
+  * {@code sum(lc)}.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param ho     for each output word after the first, the right shift applied to
+  *               the last loaded vector; a value equal to {@code b} means nothing
+  *               carries over and the word starts empty
+  * @param lc     for each output word after the first, how many further input
+  *               vectors are merged into it
+  */
+ private static void fastpackEvenBit(final int[] in, int inpos,
+                                     final int[] out, int outpos, int b,
+                                     final int[] ho, final int[] lc) {
+   final int mask = (1 << b) - 1;
+   // Number of vectors, beyond the first, that start inside the first word.
+   final int N = 32 % b == 0 ? (32 / b) - 1 : 32 / b;
+   var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+   var oV = iV.and(mask);
+   int n = 1;
+   for (; n <= N; n++) {
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+     oV = iV.and(mask).lanewise(VectorOperators.LSHL, b * n).or(oV);
+   }
+   oV.intoArray(out, outpos);
+   outpos += VLEN_512;
+
+   final int L = (b >>> 1) - 1;
+   for (int i = 0; i < L; i++) {
+     if (ho[i] != b)
+       oV = iV.and(mask).lanewise(VectorOperators.LSHR, ho[i]);
+     else
+       // zero() is a static factory; call it on the class, not on an instance.
+       oV = IntVector.zero(SPECIES_512);
+     for (int j = 0; j < lc[i]; j++) {
+       iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+       oV = iV.and(mask)
+                .lanewise(VectorOperators.LSHL, b * j + (b - ho[i]))
+                .or(oV);
+       n++;
+     }
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+   }
+ }
+
+ /**
+  * Variant of the even-width packer that does not mask its inputs: each value is
+  * shifted and OR-ed as is, so any bits above position {@code b - 1} in an input
+  * end up in the packed output.
+  *
+  * <p>Writes {@code b / 2} 512-bit output vectors starting at {@code out[outpos]}.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param ho     for each output word after the first, the right shift applied to
+  *               the last loaded vector; a value equal to {@code b} means the word
+  *               starts empty
+  * @param lc     for each output word after the first, how many further input
+  *               vectors are merged into it
+  */
+ private static void fastpackEvenBitNoMask(final int[] in, int inpos,
+                                           final int[] out, int outpos, int b,
+                                           final int[] ho, final int[] lc) {
+   // Number of vectors, beyond the first, that start inside the first word.
+   final int N = 32 % b == 0 ? (32 / b) - 1 : 32 / b;
+   var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+   var oV = iV;
+   int n = 1;
+   for (; n <= N; n++) {
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+     oV = iV.lanewise(VectorOperators.LSHL, b * n).or(oV);
+   }
+   oV.intoArray(out, outpos);
+   outpos += VLEN_512;
+
+   final int L = (b >>> 1) - 1;
+   for (int i = 0; i < L; i++) {
+     if (ho[i] != b)
+       oV = iV.lanewise(VectorOperators.LSHR, ho[i]);
+     else
+       // zero() is a static factory; call it on the class, not on an instance.
+       oV = IntVector.zero(SPECIES_512);
+     for (int j = 0; j < lc[i]; j++) {
+       iV = IntVector.fromArray(SPECIES_512, in, inpos + n * VLEN_512);
+       oV = iV.lanewise(VectorOperators.LSHL, b * j + (b - ho[i])).or(oV);
+       n++;
+     }
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+   }
+ }
+
+ /**
+  * Unpacks sixteen lanes of {@code b}-bit values (even {@code b}) with 512-bit
+  * vectors.
+  *
+  * <p>Reads {@code b / 2} input vectors starting at {@code in[inpos]} and writes
+  * {@code 32 / b + (b / 2 - 1) + sum(lc)} output vectors starting at
+  * {@code out[outpos]}.
+  *
+  * @param in     packed source array
+  * @param inpos  index of the first packed integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width of each value
+  * @param lo     for each input word after the first, the bit offset at which its
+  *               first whole value starts
+  * @param masks  for each input word after the first, the mask applied to it to
+  *               obtain the high part of the value that straddles the word boundary
+  * @param lc     for each input word after the first, the number of whole values
+  *               extracted from it
+  */
+ private static void fastUnpackEventBit(final int[] in, int inpos,
+                                        final int[] out, int outpos, int b,
+                                        final int[] lo, int[] masks,
+                                        int[] lc) {
+   final int mask = (1 << b) - 1;
+   final int N = 32 / b;
+
+   // Whole values contained in the first packed word.
+   var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+   for (int n = 0; n < N; n++) {
+     iV.lanewise(VectorOperators.LSHR, b * n).and(mask).intoArray(out, outpos);
+     outpos += VLEN_512;
+   }
+   // Low part of the value that straddles into the next packed word. When b is a
+   // power of two the first word holds only whole values, so start from zero.
+   var oV = iV.lanewise(VectorOperators.LSHR, b * N).and(mask);
+   if ((b & (b - 1)) == 0)
+     oV = IntVector.zero(SPECIES_512);
+
+   final int L = (b >>> 1) - 1;
+   for (int i = 0; i < L; i++) {
+     // Complete the straddling value with the low bits of the next word.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + (i + 1) * VLEN_512);
+     oV = iV.and(masks[i]).lanewise(VectorOperators.LSHL, b - lo[i]).or(oV);
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Whole values of this word, starting at bit lo[i].
+     int j = 0;
+     for (; j < lc[i]; j++) {
+       iV.lanewise(VectorOperators.LSHR, b * j + lo[i])
+           .and(mask)
+           .intoArray(out, outpos);
+       outpos += VLEN_512;
+     }
+     // Carry the partial value into the next round, unless this word ended
+     // exactly on a value boundary.
+     if ((32 - lo[i]) % b != 0)
+       oV = iV.lanewise(VectorOperators.LSHR, b * j + lo[i]).and(mask);
+     else
+       oV = IntVector.zero(SPECIES_512);
+   }
+ }
+
+ /**
+  * Scalar bit packer: stores the low {@code b} bits of {@code inlen} integers
+  * back to back, least-significant bits first, into {@code out}.
+  *
+  * <p>The first word of every run of whole values is combined with {@code |=},
+  * so the destination range is expected to be zero on entry.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param inlen  number of integers to pack
+  * @param out    destination array
+  * @param outpos index of the first output word
+  * @param b      bit width, 0 to 32; 0 writes nothing, 32 copies the input verbatim
+  * @return {@code outpos} unchanged when {@code inlen == 0} or {@code b == 0};
+  *         {@code outpos + inlen} when {@code b == 32}; otherwise the index of the
+  *         last output word written (not one past it)
+  */
+ public static int slowpack(final int[] in, int inpos, int inlen,
+                            final int[] out, int outpos, int b) {
+   // A zero bit width carries no payload; fastpack treats it as a no-op too.
+   // Without this guard the "% b" below would divide by zero.
+   if (inlen == 0 || b == 0)
+     return outpos;
+   if (b == 32) {
+     System.arraycopy(in, inpos, out, outpos, inlen);
+     return outpos + inlen;
+   }
+   final int mask = (1 << b) - 1;
+   int c = 0; // bits taken in the current word by values started in it
+   int r = 0; // bits at the start of the current word spilled from the previous one
+   for (int i = 0; i < inlen; i++) {
+     final int val = in[inpos + i] & mask;
+     out[outpos] |= val << (c + r);
+     c += b;
+     // Bits of a straddling value that still fit in the current word.
+     final int l = (32 - r) % b;
+     if (c + r >= 32) {
+       // Move on unless this was the last value and it ended on the boundary.
+       if (i < inlen - 1 || l != 0)
+         outpos++;
+       r = l == 0 ? 0 : b - l;
+       if (l != 0)
+         out[outpos] = val >> (b - r); // high part spills into the new word
+       c = 0;
+     }
+   }
+   return outpos;
+ }
+
+ /**
+  * Scalar bit unpacker: extracts {@code outlen} values of {@code b} bits each,
+  * stored back to back least-significant bits first, from {@code in}.
+  *
+  * @param in     packed source array
+  * @param inpos  index of the first packed word
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param outlen number of integers to produce
+  * @param b      bit width, 0 to 32; 0 produces zeros without reading {@code in},
+  *               32 copies the input verbatim
+  * @return {@code inpos} plus the number of input words read ({@code inpos}
+  *         itself when {@code outlen == 0} or {@code b == 0})
+  */
+ public static int slowunpack(final int[] in, int inpos, final int[] out,
+                              int outpos, int outlen, int b) {
+   if (outlen == 0) {
+     return inpos;
+   }
+   if (b == 0) {
+     // Zero-width values are all zero, as in fastunpack; without this guard the
+     // "% b" below would divide by zero.
+     Arrays.fill(out, outpos, outpos + outlen, 0);
+     return inpos;
+   }
+   if (b == 32) {
+     System.arraycopy(in, inpos, out, outpos, outlen);
+     return inpos + outlen;
+   }
+   final int mask = (1 << b) - 1;
+   final int limit = outpos + outlen;
+   int r = 0;   // bits at the start of the current word that finish a straddling value
+   int val = 0; // previously read input word
+   int i = 0;
+   for (; outpos < limit; i++) {
+     // Join the top (b - r) bits of the previous word with the low r bits of
+     // the current one.
+     if (r > 0)
+       out[outpos++] =
+           (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask);
+     val = in[inpos + i];
+     final int l = 32 - r; // bits available after the spill
+     // End offset of the whole values; a trailing partial value is excluded.
+     final int ll = l % b == 0 ? l : l - b;
+     for (int j = 0; j < ll && outpos < limit; j += b) {
+       out[outpos++] = (val >> (j + r)) & mask;
+     }
+     r = l % b == 0 ? 0 : b - (l % b);
+   }
+   return inpos + i;
+ }
+
+ /**
+  * Computes an output size, in integers, for {@code n} values of bit width
+  * {@code b}.
+  *
+  * <p>The lane count is {@code VLEN_512} for even {@code b} and {@code VLEN_256}
+  * for odd {@code b}. Inputs of at most one vector of lanes return {@code n}
+  * itself; larger inputs are rounded up to whole groups of
+  * {@code lanes * (32 / b)} values, each group counting as {@code lanes} integers.
+  *
+  * @param n number of values
+  * @param b bit width; must be non-zero when {@code n} exceeds the lane count
+  * @return the computed size in integers
+  */
+ public static int numCompressedInts(int n, int b) {
+   final boolean evenWidth = (b % 2) == 0;
+   final int lanes = evenWidth ? VLEN_512 : VLEN_256;
+   if (n > lanes) {
+     final int valuesPerGroup = (32 / b) * lanes;
+     final int groups = (n + valuesPerGroup - 1) / valuesPerGroup;
+     return groups * lanes;
+   }
+   return n;
+ }
+
+ /**
+  * Packs one block of integers from {@code in} into {@code out} using {@code b}
+  * bits per value, masking each input to its low {@code b} bits.
+  *
+  * <p>Odd widths are delegated to {@code fastpackOddBit} and even widths to
+  * {@code fastpackEvenBit}. The two literal arrays passed with each width are the
+  * helper's {@code ho} table (right shift applied to the last loaded vector when
+  * starting each output word after the first) and {@code lc} table (number of
+  * further input vectors merged into that word). Width 0 writes nothing, width 32
+  * copies 256 integers verbatim, and any other value of {@code b} matches no case
+  * and is silently ignored.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width, 0 to 32
+  */
+ public static void fastpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ // Nothing to store for zero-width values.
+ break;
+ case 1:
+ fastpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {});
+ break;
+ case 2:
+ fastpackEvenBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {});
+ break;
+ case 3:
+ fastpackOddBit(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+ new int[] {0xb, 0xa});
+ break;
+ case 4:
+ fastpackEvenBit(in, inpos, out, outpos, 4, new int[] {0x4},
+ new int[] {0x8});
+ break;
+ case 5:
+ fastpackOddBit(in, inpos, out, outpos, 5, new int[] {0x2, 0x4, 0x1, 0x3},
+ new int[] {0x6, 0x7, 0x6, 0x6});
+ break;
+ case 6:
+ fastpackEvenBit(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+ new int[] {0x5, 0x5});
+ break;
+ case 7:
+ fastpackOddBit(in, inpos, out, outpos, 7,
+ new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+ new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+ break;
+ case 8:
+ fastpackEvenBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+ new int[] {0x4, 0x4, 0x4});
+ break;
+ case 9:
+ fastpackOddBit(in, inpos, out, outpos, 9,
+ new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+ new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+ break;
+ case 10:
+ fastpackEvenBit(in, inpos, out, outpos, 10,
+ new int[] {0x2, 0x4, 0x6, 0x8},
+ new int[] {0x3, 0x3, 0x3, 0x3});
+ break;
+ case 11:
+ fastpackOddBit(
+ in, inpos, out, outpos, 11,
+ new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+ new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+ break;
+ case 12:
+ fastpackEvenBit(in, inpos, out, outpos, 12,
+ new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+ new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+ break;
+ case 13:
+ fastpackOddBit(in, inpos, out, outpos, 13,
+ new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9, 0x2,
+ 0x8, 0x1, 0x7},
+ new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2,
+ 0x3, 0x2, 0x2});
+ break;
+ case 14:
+ fastpackEvenBit(in, inpos, out, outpos, 14,
+ new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+ new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+ break;
+ case 15:
+ fastpackOddBit(in, inpos, out, outpos, 15,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1, 0x3,
+ 0x5, 0x7, 0x9, 0xb, 0xd},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 16:
+ fastpackEvenBit(in, inpos, out, outpos, 16,
+ new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 17:
+ fastpackOddBit(in, inpos, out, outpos, 17,
+ new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0x10,
+ 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 18:
+ fastpackEvenBit(in, inpos, out, outpos, 18,
+ new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+ new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 19:
+ fastpackOddBit(in, inpos, out, outpos, 19,
+ new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3,
+ 0x10, 0xa, 0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+ new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+ break;
+ case 20:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 20,
+ new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 21:
+ fastpackOddBit(
+ in, inpos, out, outpos, 21,
+ new int[] {0xb, 0x1, 0xc, 0x2, 0xd, 0x3, 0xe, 0x4, 0xf, 0x5,
+ 0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 22:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 22,
+ new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+ new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 23:
+ fastpackOddBit(in, inpos, out, outpos, 23,
+ new int[] {0x9, 0x12, 0x4, 0xd, 0x16, 0x8, 0x11, 0x3,
+ 0xc, 0x15, 0x7, 0x10, 0x2, 0xb, 0x14, 0x6,
+ 0xf, 0x1, 0xa, 0x13, 0x5, 0xe},
+ new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 24:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 24,
+ new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+ 0x10},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 25:
+ fastpackOddBit(in, inpos, out, outpos, 25,
+ new int[] {0x7, 0xe, 0x15, 0x3, 0xa, 0x11, 0x18, 0x6,
+ 0xd, 0x14, 0x2, 0x9, 0x10, 0x17, 0x5, 0xc,
+ 0x13, 0x1, 0x8, 0xf, 0x16, 0x4, 0xb, 0x12},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+ break;
+ case 26:
+ fastpackEvenBit(in, inpos, out, outpos, 26,
+ new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10, 0x16,
+ 0x2, 0x8, 0xe, 0x14},
+ new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1});
+ break;
+ case 27:
+ fastpackOddBit(in, inpos, out, outpos, 27,
+ new int[] {0x5, 0xa, 0xf, 0x14, 0x19, 0x3, 0x8,
+ 0xd, 0x12, 0x17, 0x1, 0x6, 0xb, 0x10,
+ 0x15, 0x1a, 0x4, 0x9, 0xe, 0x13, 0x18,
+ 0x2, 0x7, 0xc, 0x11, 0x16},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 28:
+ fastpackEvenBit(in, inpos, out, outpos, 28,
+ new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x4,
+ 0x8, 0xc, 0x10, 0x14, 0x18},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 29:
+ fastpackOddBit(
+ in, inpos, out, outpos, 29,
+ new int[] {0x3, 0x6, 0x9, 0xc, 0xf, 0x12, 0x15, 0x18, 0x1b, 0x1,
+ 0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+ 0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 30:
+ fastpackEvenBit(in, inpos, out, outpos, 30,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12,
+ 0x14, 0x16, 0x18, 0x1a, 0x1c},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 31:
+ fastpackOddBit(in, inpos, out, outpos, 31,
+ new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 32:
+ // Full-width values need no packing: copy the 256-integer block as is.
+ System.arraycopy(in, inpos, out, outpos, 256);
+ break;
+ }
+ }
+
+ /**
+  * Packs one block of integers from {@code in} into {@code out} using {@code b}
+  * bits per value, without masking the inputs first.
+  *
+  * <p>Odd widths are delegated to {@code fastpackOddBitNoMask} and even widths to
+  * {@code fastpackEvenBitNoMask}. The two literal arrays passed with each width
+  * are the helper's {@code ho} table (right shift applied to the last loaded
+  * vector when starting each output word after the first) and {@code lc} table
+  * (number of further input vectors merged into that word). Because the helpers
+  * do not mask, bits above position {@code b - 1} in an input end up in the packed
+  * output. Width 0 writes nothing, width 32 copies 256 integers verbatim, and any
+  * other value of {@code b} matches no case and is silently ignored.
+  *
+  * @param in     source array
+  * @param inpos  index of the first input integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width, 0 to 32
+  */
+ public static void fastpackNoMask(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ // Nothing to store for zero-width values.
+ break;
+ case 1:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 1, new int[] {},
+ new int[] {});
+ break;
+ case 2:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 2, new int[] {},
+ new int[] {});
+ break;
+ case 3:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+ new int[] {0xb, 0xa});
+ break;
+ case 4:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 4, new int[] {0x4},
+ new int[] {0x8});
+ break;
+ case 5:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 5,
+ new int[] {0x2, 0x4, 0x1, 0x3},
+ new int[] {0x6, 0x7, 0x6, 0x6});
+ break;
+ case 6:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+ new int[] {0x5, 0x5});
+ break;
+ case 7:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 7,
+ new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+ new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+ break;
+ case 8:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 8,
+ new int[] {0x8, 0x8, 0x8},
+ new int[] {0x4, 0x4, 0x4});
+ break;
+ case 9:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 9,
+ new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+ new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+ break;
+ case 10:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 10,
+ new int[] {0x2, 0x4, 0x6, 0x8},
+ new int[] {0x3, 0x3, 0x3, 0x3});
+ break;
+ case 11:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 11,
+ new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+ new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+ break;
+ case 12:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 12,
+ new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+ new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+ break;
+ case 13:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 13,
+ new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9,
+ 0x2, 0x8, 0x1, 0x7},
+ new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3,
+ 0x2, 0x3, 0x2, 0x2});
+ break;
+ case 14:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 14,
+ new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+ new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+ break;
+ case 15:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 15,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1,
+ 0x3, 0x5, 0x7, 0x9, 0xb, 0xd},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 16:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 16,
+ new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 17:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 17,
+ new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1,
+ 0x10, 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 18:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 18,
+ new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+ new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 19:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 19,
+ new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3, 0x10, 0xa,
+ 0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+ new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+ break;
+ case 20:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 20,
+ new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 21:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 21,
+ new int[] {0xb, 0x1, 0xc, 0x2, 0xd, 0x3, 0xe, 0x4, 0xf, 0x5,
+ 0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 22:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 22,
+ new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+ new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 23:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 23,
+ new int[] {0x9, 0x12, 0x4, 0xd, 0x16, 0x8, 0x11, 0x3,
+ 0xc, 0x15, 0x7, 0x10, 0x2, 0xb, 0x14, 0x6,
+ 0xf, 0x1, 0xa, 0x13, 0x5, 0xe},
+ new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 24:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 24,
+ new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+ 0x10},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 25:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 25,
+ new int[] {0x7, 0xe, 0x15, 0x3, 0xa, 0x11,
+ 0x18, 0x6, 0xd, 0x14, 0x2, 0x9,
+ 0x10, 0x17, 0x5, 0xc, 0x13, 0x1,
+ 0x8, 0xf, 0x16, 0x4, 0xb, 0x12},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+ break;
+ case 26:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 26,
+ new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10,
+ 0x16, 0x2, 0x8, 0xe, 0x14},
+ new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2,
+ 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 27:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 27,
+ new int[] {0x5, 0xa, 0xf, 0x14, 0x19, 0x3, 0x8, 0xd, 0x12,
+ 0x17, 0x1, 0x6, 0xb, 0x10, 0x15, 0x1a, 0x4, 0x9,
+ 0xe, 0x13, 0x18, 0x2, 0x7, 0xc, 0x11, 0x16},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 28:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 28,
+ new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c,
+ 0x4, 0x8, 0xc, 0x10, 0x14, 0x18},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 29:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 29,
+ new int[] {0x3, 0x6, 0x9, 0xc, 0xf, 0x12, 0x15, 0x18, 0x1b, 0x1,
+ 0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+ 0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 30:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 30,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10,
+ 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 31:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 31,
+ new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 32:
+ // Full-width values need no packing: copy the 256-integer block as is.
+ System.arraycopy(in, inpos, out, outpos, 256);
+ break;
+ }
+ }
+
+ /**
+  * Unpacks one block of {@code b}-bit values from {@code in} into {@code out}.
+  *
+  * <p>Odd widths are delegated to {@code fastUnpackOddBit} and even widths to
+  * {@code fastUnpackEventBit}. The three literal arrays passed with each width
+  * are, in order, the helper's {@code lo} table (bit offset of the first whole
+  * value in each packed word after the first), {@code masks} table (mask applied
+  * to that word to recover the high part of the value straddling the word
+  * boundary) and {@code lc} table (number of whole values extracted from that
+  * word). Width 0 fills 256 outputs with zero without reading {@code in}, width
+  * 32 copies 256 integers verbatim, and any other value of {@code b} matches no
+  * case and is silently ignored.
+  *
+  * @param in     packed source array
+  * @param inpos  index of the first packed integer
+  * @param out    destination array
+  * @param outpos index of the first output integer
+  * @param b      bit width, 0 to 32
+  */
+ public static void fastunpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ // Zero-width values are all zero; nothing is read from the input.
+ Arrays.fill(out, outpos, outpos + 256, 0);
+ break;
+ case 1:
+ fastUnpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {},
+ new int[] {});
+ break;
+ case 2:
+ fastUnpackEventBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {},
+ new int[] {});
+ break;
+ case 3:
+ fastUnpackOddBit(in, inpos, out, outpos, 3, new int[] {0x1, 0x2},
+ new int[] {0x1, 0x3}, new int[] {0xa, 0xa});
+ break;
+ case 4:
+ fastUnpackEventBit(in, inpos, out, outpos, 4, new int[] {0x4},
+ new int[] {0xf}, new int[] {0x7});
+ break;
+ case 5:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 5, new int[] {0x3, 0x1, 0x4, 0x2},
+ new int[] {0x7, 0x1, 0xf, 0x3}, new int[] {0x5, 0x6, 0x5, 0x6});
+ break;
+ case 6:
+ fastUnpackEventBit(in, inpos, out, outpos, 6, new int[] {0x4, 0x2},
+ new int[] {0xf, 0x3}, new int[] {0x4, 0x5});
+ break;
+ case 7:
+ fastUnpackOddBit(in, inpos, out, outpos, 7,
+ new int[] {0x3, 0x6, 0x2, 0x5, 0x1, 0x4},
+ new int[] {0x7, 0x3f, 0x3, 0x1f, 0x1, 0xf},
+ new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x4});
+ break;
+ case 8:
+ fastUnpackEventBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+ new int[] {0xff, 0xff, 0xff},
+ new int[] {0x3, 0x3, 0x3});
+ break;
+ case 9:
+ fastUnpackOddBit(in, inpos, out, outpos, 9,
+ new int[] {0x4, 0x8, 0x3, 0x7, 0x2, 0x6, 0x1, 0x5},
+ new int[] {0xf, 0xff, 0x7, 0x7f, 0x3, 0x3f, 0x1, 0x1f},
+ new int[] {0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x3});
+ break;
+ case 10:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 10, new int[] {0x8, 0x6, 0x4, 0x2},
+ new int[] {0xff, 0x3f, 0xf, 0x3}, new int[] {0x2, 0x2, 0x2, 0x3});
+ break;
+ case 11:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 11,
+ new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa},
+ new int[] {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 12:
+ fastUnpackEventBit(in, inpos, out, outpos, 12,
+ new int[] {0x4, 0x8, 0xc, 0x4, 0x8},
+ new int[] {0xf, 0xff, 0xfff, 0xf, 0xff},
+ new int[] {0x2, 0x2, 0x1, 0x2, 0x2});
+ break;
+ case 13:
+ fastUnpackOddBit(in, inpos, out, outpos, 13,
+ new int[] {0x7, 0x1, 0x8, 0x2, 0x9, 0x3, 0xa, 0x4, 0xb,
+ 0x5, 0xc, 0x6},
+ new int[] {0x7f, 0x1, 0xff, 0x3, 0x1ff, 0x7, 0x3ff, 0xf,
+ 0x7ff, 0x1f, 0xfff, 0x3f},
+ new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x2});
+ break;
+ case 14:
+ fastUnpackEventBit(in, inpos, out, outpos, 14,
+ new int[] {0xa, 0x6, 0x2, 0xc, 0x8, 0x4},
+ new int[] {0x3ff, 0x3f, 0x3, 0xfff, 0xff, 0xf},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2});
+ break;
+ case 15:
+ fastUnpackOddBit(in, inpos, out, outpos, 15,
+ new int[] {0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0xe, 0xc,
+ 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0x1fff, 0x7ff, 0x1ff, 0x7f, 0x1f, 0x7, 0x1,
+ 0x3fff, 0xfff, 0x3ff, 0xff, 0x3f, 0xf, 0x3},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x2});
+ break;
+ case 16:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 16,
+ new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+ new int[] {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 17:
+ fastUnpackOddBit(in, inpos, out, outpos, 17,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x1,
+ 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf},
+ new int[] {0x3, 0xf, 0x3f, 0xff, 0x3ff, 0xfff, 0x3fff,
+ 0xffff, 0x1, 0x7, 0x1f, 0x7f, 0x1ff, 0x7ff,
+ 0x1fff, 0x7fff},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 18:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 18,
+ new int[] {0x4, 0x8, 0xc, 0x10, 0x2, 0x6, 0xa, 0xe},
+ new int[] {0xf, 0xff, 0xfff, 0xffff, 0x3, 0x3f, 0x3ff, 0x3fff},
+ new int[] {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 19:
+ fastUnpackOddBit(in, inpos, out, outpos, 19,
+ new int[] {0x6, 0xc, 0x12, 0x5, 0xb, 0x11, 0x4, 0xa,
+ 0x10, 0x3, 0x9, 0xf, 0x2, 0x8, 0xe, 0x1, 0x7,
+ 0xd},
+ new int[] {0x3f, 0xfff, 0x3ffff, 0x1f, 0x7ff, 0x1ffff,
+ 0xf, 0x3ff, 0xffff, 0x7, 0x1ff, 0x7fff, 0x3,
+ 0xff, 0x3fff, 0x1, 0x7f, 0x1fff},
+ new int[] {0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x0,
+ 0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1});
+ break;
+ case 20:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 20,
+ new int[] {0x8, 0x10, 0x4, 0xc, 0x14, 0x8, 0x10, 0x4, 0xc},
+ new int[] {0xff, 0xffff, 0xf, 0xfff, 0xfffff, 0xff, 0xffff, 0xf,
+ 0xfff},
+ new int[] {0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+ break;
+ case 21:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 21,
+ new int[] {0xa, 0x14, 0x9, 0x13, 0x8, 0x12, 0x7, 0x11, 0x6, 0x10,
+ 0x5, 0xf, 0x4, 0xe, 0x3, 0xd, 0x2, 0xc, 0x1, 0xb},
+ new int[] {0x3ff, 0xfffff, 0x1ff, 0x7ffff, 0xff, 0x3ffff, 0x7f,
+ 0x1ffff, 0x3f, 0xffff, 0x1f, 0x7fff, 0xf, 0x3fff,
+ 0x7, 0x1fff, 0x3, 0xfff, 0x1, 0x7ff},
+ new int[] {0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0,
+ 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+ break;
+ case 22:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 22,
+ new int[] {0xc, 0x2, 0xe, 0x4, 0x10, 0x6, 0x12, 0x8, 0x14, 0xa},
+ new int[] {0xfff, 0x3, 0x3fff, 0xf, 0xffff, 0x3f, 0x3ffff, 0xff,
+ 0xfffff, 0x3ff},
+ new int[] {0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1});
+ break;
+ case 23:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 23,
+ new int[] {0xe, 0x5, 0x13, 0xa, 0x1, 0xf, 0x6, 0x14,
+ 0xb, 0x2, 0x10, 0x7, 0x15, 0xc, 0x3, 0x11,
+ 0x8, 0x16, 0xd, 0x4, 0x12, 0x9},
+ new int[] {0x3fff, 0x1f, 0x7ffff, 0x3ff, 0x1, 0x7fff,
+ 0x3f, 0xfffff, 0x7ff, 0x3, 0xffff, 0x7f,
+ 0x1fffff, 0xfff, 0x7, 0x1ffff, 0xff, 0x3fffff,
+ 0x1fff, 0xf, 0x3ffff, 0x1ff},
+ new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0,
+ 0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1});
+ break;
+ case 24:
+ fastUnpackEventBit(
+ in, inpos, out, outpos, 24,
+ new int[] {0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10,
+ 0x8},
+ new int[] {0xffff, 0xff, 0xffffff, 0xffff, 0xff, 0xffffff, 0xffff,
+ 0xff, 0xffffff, 0xffff, 0xff},
+ new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+ break;
+ case 25:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 25,
+ new int[] {0x12, 0xb, 0x4, 0x16, 0xf, 0x8, 0x1, 0x13,
+ 0xc, 0x5, 0x17, 0x10, 0x9, 0x2, 0x14, 0xd,
+ 0x6, 0x18, 0x11, 0xa, 0x3, 0x15, 0xe, 0x7},
+ new int[] {0x3ffff, 0x7ff, 0xf, 0x3fffff, 0x7fff, 0xff,
+ 0x1, 0x7ffff, 0xfff, 0x1f, 0x7fffff, 0xffff,
+ 0x1ff, 0x3, 0xfffff, 0x1fff, 0x3f, 0xffffff,
+ 0x1ffff, 0x3ff, 0x7, 0x1fffff, 0x3fff, 0x7f},
+ new int[] {0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+ 0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+ break;
+ case 26:
+ fastUnpackEventBit(in, inpos, out, outpos, 26,
+ new int[] {0x14, 0xe, 0x8, 0x2, 0x16, 0x10, 0xa, 0x4,
+ 0x18, 0x12, 0xc, 0x6},
+ new int[] {0xfffff, 0x3fff, 0xff, 0x3, 0x3fffff,
+ 0xffff, 0x3ff, 0xf, 0xffffff, 0x3ffff,
+ 0xfff, 0x3f},
+ new int[] {0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1});
+ break;
+ case 27:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 27,
+ new int[] {0x16, 0x11, 0xc, 0x7, 0x2, 0x18, 0x13, 0xe, 0x9,
+ 0x4, 0x1a, 0x15, 0x10, 0xb, 0x6, 0x1, 0x17, 0x12,
+ 0xd, 0x8, 0x3, 0x19, 0x14, 0xf, 0xa, 0x5},
+ new int[] {0x3fffff, 0x1ffff, 0xfff, 0x7f, 0x3, 0xffffff,
+ 0x7ffff, 0x3fff, 0x1ff, 0xf, 0x3ffffff, 0x1fffff,
+ 0xffff, 0x7ff, 0x3f, 0x1, 0x7fffff, 0x3ffff,
+ 0x1fff, 0xff, 0x7, 0x1ffffff, 0xfffff, 0x7fff,
+ 0x3ff, 0x1f},
+ new int[] {0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
+ 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1});
+ break;
+ case 28:
+ fastUnpackEventBit(in, inpos, out, outpos, 28,
+ new int[] {0x18, 0x14, 0x10, 0xc, 0x8, 0x4, 0x1c, 0x18,
+ 0x14, 0x10, 0xc, 0x8, 0x4},
+ new int[] {0xffffff, 0xfffff, 0xffff, 0xfff, 0xff, 0xf,
+ 0xfffffff, 0xffffff, 0xfffff, 0xffff, 0xfff,
+ 0xff, 0xf},
+ new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1});
+ break;
+ case 29:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 29,
+ new int[] {0x1a, 0x17, 0x14, 0x11, 0xe, 0xb, 0x8, 0x5, 0x2, 0x1c,
+ 0x19, 0x16, 0x13, 0x10, 0xd, 0xa, 0x7, 0x4, 0x1, 0x1b,
+ 0x18, 0x15, 0x12, 0xf, 0xc, 0x9, 0x6, 0x3},
+ new int[] {0x3ffffff, 0x7fffff, 0xfffff, 0x1ffff, 0x3fff,
+ 0x7ff, 0xff, 0x1f, 0x3, 0xfffffff,
+ 0x1ffffff, 0x3fffff, 0x7ffff, 0xffff, 0x1fff,
+ 0x3ff, 0x7f, 0xf, 0x1, 0x7ffffff,
+ 0xffffff, 0x1fffff, 0x3ffff, 0x7fff, 0xfff,
+ 0x1ff, 0x3f, 0x7},
+ new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+ break;
+ case 30:
+ fastUnpackEventBit(in, inpos, out, outpos, 30,
+ new int[] {0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
+ 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0xfffffff, 0x3ffffff, 0xffffff, 0x3fffff,
+ 0xfffff, 0x3ffff, 0xffff, 0x3fff, 0xfff,
+ 0x3ff, 0xff, 0x3f, 0xf, 0x3},
+ new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x1});
+ break;
+ case 31:
+ fastUnpackOddBit(
+ in, inpos, out, outpos, 31,
+ new int[] {0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15,
+ 0x14, 0x13, 0x12, 0x11, 0x10, 0xf, 0xe, 0xd, 0xc, 0xb,
+ 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+ new int[] {0x3fffffff, 0x1fffffff, 0xfffffff, 0x7ffffff, 0x3ffffff,
+ 0x1ffffff, 0xffffff, 0x7fffff, 0x3fffff, 0x1fffff,
+ 0xfffff, 0x7ffff, 0x3ffff, 0x1ffff, 0xffff,
+ 0x7fff, 0x3fff, 0x1fff, 0xfff, 0x7ff,
+ 0x3ff, 0x1ff, 0xff, 0x7f, 0x3f,
+ 0x1f, 0xf, 0x7, 0x3, 0x1},
+ new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+ break;
+
+ case 32:
+ // Full-width values were stored verbatim: copy the 256-integer block back.
+ System.arraycopy(in, inpos, out, outpos, 256);
+ break;
+ }
+ }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
new file mode 100644
index 0000000..7374fa5
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
@@ -0,0 +1,366 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ * (c) Intel Corp. (for Vector implementation)
+ */
+package me.lemire.integercompression.vector;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.SkippableIntegerCODEC;
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * This is a patching scheme designed for speed.
+ * It encodes integers in blocks of integers within pages of
+ * up to 65536 integers. Note that it is important, to get good
+ * compression and good performance, to use sizeable arrays (greater than 1024
+ * integers). For arrays containing a number of integers that is not divisible
+ * by BLOCK_SIZE, you should use it in conjunction with another CODEC:
+ *
+ * IntegerCODEC ic = new Composition(new VectorFastPFOR(), new VariableByte()).
+ *