* Adapted by D. Lemire from the Apache Lucene project.
*
+ *
* @author Daniel Lemire
*/
public final class S9 {
- /**
- * Estimate size of the compressed output.
- *
- * @param in
- * array to compress
- * @param currentPos
- * where to start reading
- * @param inlength
- * how many integers to read
- * @return estimated size of the output (in 32-bit integers)
- */
- public static int estimatecompress(int[] in, int currentPos,
- int inlength) {
- int tmpoutpos = 0;
- int finalpos = currentPos + inlength;
- outer: while (currentPos < finalpos) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int compressedNum = codeNum[selector];
- if (finalpos <= currentPos + compressedNum - 1)
- compressedNum = finalpos - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++)
- if (max <= in[currentPos + i])
- continue mainloop;
- currentPos += compressedNum;
- ++tmpoutpos;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- tmpoutpos++;
- currentPos++;
- }
- return tmpoutpos;
+ /**
+ * Estimate size of the compressed output.
+ *
+ * @param in
+ * array to compress
+ * @param currentPos
+ * where to start reading
+ * @param inlength
+ * how many integers to read
+ * @return estimated size of the output (in 32-bit integers)
+ */
+ public static int estimatecompress(int[] in, int currentPos, int inlength) {
+ int tmpoutpos = 0;
+ int finalpos = currentPos + inlength;
+ outer: while (currentPos < finalpos) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+
+ int compressedNum = codeNum[selector];
+ if (finalpos <= currentPos + compressedNum - 1)
+ compressedNum = finalpos - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++)
+ if (Util.smallerorequalthan(max , in[currentPos + i]))
+ continue mainloop;
+ currentPos += compressedNum;
+ ++tmpoutpos;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ tmpoutpos++;
+ currentPos++;
+
}
+ return tmpoutpos;
+ }
- /**
- * Compress an integer array using Simple9
- *
- *
- * @param in
- * array to compress
- * @param currentPos
- * where to start reading
- * @param inlength
- * how many integers to read
- * @param out output array
- * @param tmpoutpos location in the output array
- * @return the number of 32-bit words written (in compressed form)
- */
- public static int compress(int[] in, int currentPos, int inlength,
- int out[], int tmpoutpos) {
- int origtmpoutpos = tmpoutpos;
- int finalpos = currentPos + inlength;
- outer: while (currentPos < finalpos) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- if (finalpos <= currentPos + compressedNum - 1)
- compressedNum = finalpos - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- if (compressedNum != codeNum[selector])
- res <<= (codeNum[selector] - compressedNum)
- * b;
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+ /**
+ * Compress an integer array using Simple9
+ *
+ *
+ * @param in
+ * array to compress
+ * @param currentPos
+ * where to start reading
+ * @param inlength
+ * how many integers to read
+ * @param out
+ * output array
+ * @param tmpoutpos
+ * location in the output array
+ * @return the number of 32-bit words written (in compressed form)
+ */
+ public static int compress(int[] in, int currentPos, int inlength, int out[], int tmpoutpos) {
+ int origtmpoutpos = tmpoutpos;
+ int finalpos = currentPos + inlength;
+ outer: while (currentPos < finalpos) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ if (finalpos <= currentPos + compressedNum - 1)
+ compressedNum = finalpos - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (Util.smallerorequalthan(max, in[currentPos + i]))
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- return tmpoutpos - origtmpoutpos;
+ if (compressedNum != codeNum[selector])
+ res <<= (codeNum[selector] - compressedNum) * b;
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
}
+ return tmpoutpos - origtmpoutpos;
+ }
- /**
- * Uncompressed data from an input array into an output array
- *
- * @param in input array (in compressed form)
- * @param tmpinpos starting location in the compressed input array
- * @param inlength how much data we wish the read (in 32-bit words)
- * @param out output array (in decompressed form)
- * @param currentPos current position in the output array
- * @param outlength available data in the output array
- */
- public static void uncompress(int[] in, int tmpinpos, int inlength,
- int[] out, int currentPos, int outlength) {
- int finallength = currentPos + outlength;
+ /**
+ * Uncompress data from an input array into an output array
+ *
+ * @param in
+ * input array (in compressed form)
+ * @param tmpinpos
+ * starting location in the compressed input array
+ * @param inlength
+ * how much data we wish to read (in 32-bit words)
+ * @param out
+ * output array (in decompressed form)
+ * @param currentPos
+ * current position in the output array
+ * @param outlength
+ * number of integers to decode into the output array
+ */
+ public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
+ int finallength = currentPos + outlength;
- while (currentPos < finallength) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- final int howmany = finallength - currentPos < 28 ? finallength
- - currentPos
- : 28;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (k + 4)) >>> 31;
- }
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- final int howmany = finallength - currentPos < 14 ? finallength
- - currentPos
- : 14;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (2 * k + 4)) >>> 30;
- }
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- final int howmany = finallength - currentPos < 9 ? finallength
- - currentPos
- : 9;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (3 * k + 5)) >>> 29;
- }
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- final int howmany = finallength - currentPos < 7 ? finallength
- - currentPos
- : 7;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (4 * k + 4)) >>> 28;
- }
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- final int howmany = finallength - currentPos < 5 ? finallength
- - currentPos
- : 5;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (5 * k + 7)) >>> 27;
- }
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- final int howmany = finallength - currentPos < 4 ? finallength
- - currentPos
- : 4;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (7 * k + 4)) >>> 25;
- }
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- final int howmany = finallength - currentPos < 3 ? finallength
- - currentPos
- : 3;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (9 * k + 5)) >>> 23;
- }
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- final int howmany = finallength - currentPos < 2 ? finallength
- - currentPos
- : 2;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (14 * k + 4)) >>> 18;
- }
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen");
- }
- }
+ while (currentPos < finallength) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ final int howmany = finallength - currentPos < 28 ? finallength - currentPos : 28;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (k + 4)) >>> 31;
}
-
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ final int howmany = finallength - currentPos < 14 ? finallength - currentPos : 14;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+ }
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ final int howmany = finallength - currentPos < 9 ? finallength - currentPos : 9;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+ }
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ final int howmany = finallength - currentPos < 7 ? finallength - currentPos : 7;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+ }
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ final int howmany = finallength - currentPos < 5 ? finallength - currentPos : 5;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+ }
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ final int howmany = finallength - currentPos < 4 ? finallength - currentPos : 4;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+ }
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ final int howmany = finallength - currentPos < 3 ? finallength - currentPos : 3;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+ }
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ final int howmany = finallength - currentPos < 2 ? finallength - currentPos : 2;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+ }
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen");
+ }
+ }
}
- private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+ }
+
+ private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
- private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+ private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
}
diff --git a/src/main/java/me/lemire/integercompression/Simple16.java b/src/main/java/me/lemire/integercompression/Simple16.java
index 9562c3a..2b7f27f 100644
--- a/src/main/java/me/lemire/integercompression/Simple16.java
+++ b/src/main/java/me/lemire/integercompression/Simple16.java
@@ -1,8 +1,5 @@
package me.lemire.integercompression;
-
-
-
/**
* This is an implementation of the popular Simple16 scheme. It is limited to
* 28-bit integers (between 0 and 2^28-1).
@@ -14,10 +11,9 @@
* Adapted by D. Lemire from the Apache Lucene project.
*
*/
-public final class Simple16 implements IntegerCODEC,SkippableIntegerCODEC {
+public final class Simple16 implements IntegerCODEC, SkippableIntegerCODEC {
- public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[],
- IntWrapper outpos) {
+ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
int i_inpos = inpos.get();
int i_outpos = outpos.get();
final int finalin = i_inpos + inlength;
@@ -31,7 +27,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[]
inpos.set(i_inpos);
outpos.set(i_outpos);
}
-
+
/**
* Compress an integer array using Simple16
*
@@ -47,15 +43,13 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[]
* the number of elements to be compressed
* @return the number of compressed integers
*/
- public static final int compressblock(int[] out, int outOffset, int[] in,
- int inOffset, int n) {
+ public static final int compressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
int numIdx, j, num, bits;
for (numIdx = 0; numIdx < S16_NUMSIZE; numIdx++) {
out[outOffset] = numIdx << S16_BITSSIZE;
num = (S16_NUM[numIdx] < n) ? S16_NUM[numIdx] : n;
- for (j = 0, bits = 0; (j < num)
- && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
+ for (j = 0, bits = 0; (j < num) && (in[inOffset + j] < SHIFTED_S16_BITS[numIdx][j]);) {
out[outOffset] |= (in[inOffset + j] << bits);
bits += S16_BITS[numIdx][j];
j++;
@@ -69,7 +63,6 @@ public static final int compressblock(int[] out, int outOffset, int[] in,
return -1;
}
-
/**
* Decompress an integer array using Simple16
*
@@ -85,23 +78,19 @@ public static final int compressblock(int[] out, int outOffset, int[] in,
* the number of elements to be compressed
* @return the number of processed integers
*/
- public static final int decompressblock(int[] out, int outOffset, int[] in,
- int inOffset, int n) {
+ public static final int decompressblock(int[] out, int outOffset, int[] in, int inOffset, int n) {
int numIdx, j = 0, bits = 0;
numIdx = in[inOffset] >>> S16_BITSSIZE;
int num = S16_NUM[numIdx] < n ? S16_NUM[numIdx] : n;
for (j = 0, bits = 0; j < num; j++) {
- out[outOffset + j] = (in[inOffset] >>> bits)
- & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
+ out[outOffset + j] = (in[inOffset] >>> bits) & (0xffffffff >>> (32 - S16_BITS[numIdx][j]));
bits += S16_BITS[numIdx][j];
}
return num;
}
-
@Override
- public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos,int num) {
+ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
int i_inpos = inpos.get();
int i_outpos = outpos.get();
while (num > 0) {
@@ -114,6 +103,12 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
outpos.set(i_outpos);
}
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ compressedPositions.add(inlength);
+ return inlength;
+ }
+
/**
* Uncompress data from an array to another array.
*
@@ -133,12 +128,10 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
* @param outlength
* number of integers we want to decode
*/
- public static void uncompress(int[] in, int tmpinpos, int inlength,
- int[] out, int currentPos, int outlength) {
+ public static void uncompress(int[] in, int tmpinpos, int inlength, int[] out, int currentPos, int outlength) {
final int finalpos = tmpinpos + inlength;
while (tmpinpos < finalpos) {
- final int howmany = decompressblock(out, currentPos, in, tmpinpos,
- outlength);
+ final int howmany = decompressblock(out, currentPos, in, tmpinpos, outlength);
outlength -= howmany;
currentPos += howmany;
tmpinpos += 1;
@@ -155,20 +148,18 @@ private static int[][] shiftme(int[][] x) {
}
return answer;
}
-
+
@Override
- public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
+ public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
if (inlength == 0)
- return;
+ return;
out[outpos.get()] = inlength;
outpos.increment();
- headlessCompress(in, inpos, inlength, out, outpos);
+ headlessCompress(in, inpos, inlength, out, outpos);
}
@Override
- public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
+ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
if (inlength == 0)
return;
final int outlength = in[inpos.get()];
@@ -176,28 +167,25 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
headlessUncompress(in, inpos, inlength, out, outpos, outlength);
}
+
@Override
public String toString() {
- return this.getClass().getSimpleName();
+ return this.getClass().getSimpleName();
}
private static final int S16_NUMSIZE = 16;
private static final int S16_BITSSIZE = 28;
// the possible number of bits used to represent one integer
- private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6,
- 5, 5, 4, 3, 2, 1 };
+ private static final int[] S16_NUM = { 28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1 };
// the corresponding number of elements for each value of the number of bits
private static final int[][] S16_BITS = {
- { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
{ 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
{ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 },
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2 },
- { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
- { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
- { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 },
- { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 4, 3, 3, 3, 3, 3, 3, 3, 3 }, { 3, 4, 4, 4, 4, 3, 3, 3 },
+ { 4, 4, 4, 4, 4, 4, 4 }, { 5, 5, 5, 5, 4, 4 }, { 4, 4, 5, 5, 5, 5 }, { 6, 6, 6, 5, 5 }, { 5, 5, 6, 6, 6 },
{ 7, 7, 7, 7 }, { 10, 9, 9, }, { 14, 14 }, { 28 } };
private static final int[][] SHIFTED_S16_BITS = shiftme(S16_BITS);
-}
\ No newline at end of file
+}
diff --git a/src/main/java/me/lemire/integercompression/Simple9.java b/src/main/java/me/lemire/integercompression/Simple9.java
index 5703b04..fd5194d 100644
--- a/src/main/java/me/lemire/integercompression/Simple9.java
+++ b/src/main/java/me/lemire/integercompression/Simple9.java
@@ -7,10 +7,9 @@
package me.lemire.integercompression;
-
/**
- * This is an implementation of the popular Simple9 scheme.
- * It is limited to 28-bit integers (between 0 and 2^28-1).
+ * This is an implementation of the popular Simple9 scheme. It is limited to
+ * 28-bit integers (between 0 and 2^28-1).
*
* Note that this does not use differential coding: if you are working on sorted
* lists, you must compute the deltas separately.
@@ -19,296 +18,288 @@
*
*/
public final class Simple9 implements IntegerCODEC, SkippableIntegerCODEC {
- @Override
- public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
- int out[], IntWrapper outpos) {
- int tmpoutpos = outpos.get();
- int currentPos = inpos.get();
- final int finalin = currentPos + inlength;
- outer: while (currentPos < finalin - 28) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
- }
- outer: while (currentPos < finalin) {
- mainloop: for (int selector = 0; selector < 8; selector++) {
- int res = 0;
- int compressedNum = codeNum[selector];
- if (finalin <= currentPos + compressedNum - 1)
- compressedNum = finalin - currentPos;
- int b = bitLength[selector];
- int max = 1 << b;
- int i = 0;
- for (; i < compressedNum; i++) {
- if (max <= in[currentPos + i])
- continue mainloop;
- res = (res << b) + in[currentPos + i];
- }
- if (compressedNum != codeNum[selector])
- res <<= (codeNum[selector] - compressedNum)
- * b;
- res |= selector << 28;
- out[tmpoutpos++] = res;
- currentPos += compressedNum;
- continue outer;
- }
- final int selector = 8;
- if (in[currentPos] >= 1 << bitLength[selector])
- throw new RuntimeException("Too big a number");
- out[tmpoutpos++] = in[currentPos++] | (selector << 28);
- }
- inpos.set(currentPos);
- outpos.set(tmpoutpos);
- }
+ @Override
+ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int out[], IntWrapper outpos) {
+ int tmpoutpos = outpos.get();
+ int currentPos = inpos.get();
+ final int finalin = currentPos + inlength;
+ outer: while (currentPos < finalin - 28) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
- @Override
- public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
- int[] out, IntWrapper outpos, int outlength) {
- int currentPos = outpos.get();
- int tmpinpos = inpos.get();
- final int finalout = currentPos + outlength;
- while (currentPos < finalout - 28) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- out[currentPos++] = (val << 4) >>> 31;
- out[currentPos++] = (val << 5) >>> 31;
- out[currentPos++] = (val << 6) >>> 31;
- out[currentPos++] = (val << 7) >>> 31;
- out[currentPos++] = (val << 8) >>> 31;
- out[currentPos++] = (val << 9) >>> 31;
- out[currentPos++] = (val << 10) >>> 31;
- out[currentPos++] = (val << 11) >>> 31;
- out[currentPos++] = (val << 12) >>> 31;
- out[currentPos++] = (val << 13) >>> 31; // 10
- out[currentPos++] = (val << 14) >>> 31;
- out[currentPos++] = (val << 15) >>> 31;
- out[currentPos++] = (val << 16) >>> 31;
- out[currentPos++] = (val << 17) >>> 31;
- out[currentPos++] = (val << 18) >>> 31;
- out[currentPos++] = (val << 19) >>> 31;
- out[currentPos++] = (val << 20) >>> 31;
- out[currentPos++] = (val << 21) >>> 31;
- out[currentPos++] = (val << 22) >>> 31;
- out[currentPos++] = (val << 23) >>> 31; // 20
- out[currentPos++] = (val << 24) >>> 31;
- out[currentPos++] = (val << 25) >>> 31;
- out[currentPos++] = (val << 26) >>> 31;
- out[currentPos++] = (val << 27) >>> 31;
- out[currentPos++] = (val << 28) >>> 31;
- out[currentPos++] = (val << 29) >>> 31;
- out[currentPos++] = (val << 30) >>> 31;
- out[currentPos++] = (val << 31) >>> 31;
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- out[currentPos++] = (val << 4) >>> 30;
- out[currentPos++] = (val << 6) >>> 30;
- out[currentPos++] = (val << 8) >>> 30;
- out[currentPos++] = (val << 10) >>> 30;
- out[currentPos++] = (val << 12) >>> 30;
- out[currentPos++] = (val << 14) >>> 30;
- out[currentPos++] = (val << 16) >>> 30;
- out[currentPos++] = (val << 18) >>> 30;
- out[currentPos++] = (val << 20) >>> 30;
- out[currentPos++] = (val << 22) >>> 30; // 10
- out[currentPos++] = (val << 24) >>> 30;
- out[currentPos++] = (val << 26) >>> 30;
- out[currentPos++] = (val << 28) >>> 30;
- out[currentPos++] = (val << 30) >>> 30;
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- out[currentPos++] = (val << 5) >>> 29;
- out[currentPos++] = (val << 8) >>> 29;
- out[currentPos++] = (val << 11) >>> 29;
- out[currentPos++] = (val << 14) >>> 29;
- out[currentPos++] = (val << 17) >>> 29;
- out[currentPos++] = (val << 20) >>> 29;
- out[currentPos++] = (val << 23) >>> 29;
- out[currentPos++] = (val << 26) >>> 29;
- out[currentPos++] = (val << 29) >>> 29;
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- out[currentPos++] = (val << 4) >>> 28;
- out[currentPos++] = (val << 8) >>> 28;
- out[currentPos++] = (val << 12) >>> 28;
- out[currentPos++] = (val << 16) >>> 28;
- out[currentPos++] = (val << 20) >>> 28;
- out[currentPos++] = (val << 24) >>> 28;
- out[currentPos++] = (val << 28) >>> 28;
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- out[currentPos++] = (val << 7) >>> 27;
- out[currentPos++] = (val << 12) >>> 27;
- out[currentPos++] = (val << 17) >>> 27;
- out[currentPos++] = (val << 22) >>> 27;
- out[currentPos++] = (val << 27) >>> 27;
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- out[currentPos++] = (val << 4) >>> 25;
- out[currentPos++] = (val << 11) >>> 25;
- out[currentPos++] = (val << 18) >>> 25;
- out[currentPos++] = (val << 25) >>> 25;
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- out[currentPos++] = (val << 5) >>> 23;
- out[currentPos++] = (val << 14) >>> 23;
- out[currentPos++] = (val << 23) >>> 23;
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- out[currentPos++] = (val << 4) >>> 18;
- out[currentPos++] = (val << 18) >>> 18;
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
- }
- }
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (max <= in[currentPos + i])
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- while (currentPos < finalout) {
- int val = in[tmpinpos++];
- int header = val >>> 28;
- switch (header) {
- case 0: { // number : 28, bitwidth : 1
- final int howmany = finalout - currentPos;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (k + 4)) >>> 31;
- }
- break;
- }
- case 1: { // number : 14, bitwidth : 2
- final int howmany = finalout - currentPos < 14 ? finalout
- - currentPos
- : 14;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (2 * k + 4)) >>> 30;
- }
- break;
- }
- case 2: { // number : 9, bitwidth : 3
- final int howmany = finalout - currentPos < 9 ? finalout
- - currentPos
- : 9;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (3 * k + 5)) >>> 29;
- }
- break;
- }
- case 3: { // number : 7, bitwidth : 4
- final int howmany = finalout - currentPos < 7 ? finalout
- - currentPos
- : 7;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (4 * k + 4)) >>> 28;
- }
- break;
- }
- case 4: { // number : 5, bitwidth : 5
- final int howmany = finalout - currentPos < 5 ? finalout
- - currentPos
- : 5;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (5 * k + 7)) >>> 27;
- }
- break;
- }
- case 5: { // number : 4, bitwidth : 7
- final int howmany = finalout - currentPos < 4 ? finalout
- - currentPos
- : 4;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (7 * k + 4)) >>> 25;
- }
- break;
- }
- case 6: { // number : 3, bitwidth : 9
- final int howmany = finalout - currentPos < 3 ? finalout
- - currentPos
- : 3;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (9 * k + 5)) >>> 23;
- }
- break;
- }
- case 7: { // number : 2, bitwidth : 14
- final int howmany = finalout - currentPos < 2 ? finalout
- - currentPos
- : 2;
- for (int k = 0; k < howmany; ++k) {
- out[currentPos++] = (val << (14 * k + 4)) >>> 18;
- }
- break;
- }
- case 8: { // number : 1, bitwidth : 28
- out[currentPos++] = (val << 4) >>> 4;
- break;
- }
- default: {
- throw new RuntimeException("shouldn't happen");
- }
- }
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
+ }
+ outer: while (currentPos < finalin) {
+ mainloop: for (int selector = 0; selector < 8; selector++) {
+ int res = 0;
+ int compressedNum = codeNum[selector];
+ if (finalin <= currentPos + compressedNum - 1)
+ compressedNum = finalin - currentPos;
+ int b = bitLength[selector];
+ int max = 1 << b;
+ int i = 0;
+ for (; i < compressedNum; i++) {
+ if (max <= in[currentPos + i])
+ continue mainloop;
+ res = (res << b) + in[currentPos + i];
}
- outpos.set(currentPos);
- inpos.set(tmpinpos);
+ if (compressedNum != codeNum[selector])
+ res <<= (codeNum[selector] - compressedNum) * b;
+ res |= selector << 28;
+ out[tmpoutpos++] = res;
+ currentPos += compressedNum;
+ continue outer;
+ }
+ final int selector = 8;
+ if (in[currentPos] >= 1 << bitLength[selector])
+ throw new RuntimeException("Too big a number");
+ out[tmpoutpos++] = in[currentPos++] | (selector << 28);
}
- @Override
- public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
- if (inlength == 0)
- return;
- out[outpos.get()] = inlength;
- outpos.increment();
- headlessCompress(in, inpos, inlength, out, outpos);
+ inpos.set(currentPos);
+ outpos.set(tmpoutpos);
+ }
+
+ @Override
+ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos,
+ int outlength) {
+ int currentPos = outpos.get();
+ int tmpinpos = inpos.get();
+ final int finalout = currentPos + outlength;
+ while (currentPos < finalout - 28) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ out[currentPos++] = (val << 4) >>> 31;
+ out[currentPos++] = (val << 5) >>> 31;
+ out[currentPos++] = (val << 6) >>> 31;
+ out[currentPos++] = (val << 7) >>> 31;
+ out[currentPos++] = (val << 8) >>> 31;
+ out[currentPos++] = (val << 9) >>> 31;
+ out[currentPos++] = (val << 10) >>> 31;
+ out[currentPos++] = (val << 11) >>> 31;
+ out[currentPos++] = (val << 12) >>> 31;
+ out[currentPos++] = (val << 13) >>> 31; // 10
+ out[currentPos++] = (val << 14) >>> 31;
+ out[currentPos++] = (val << 15) >>> 31;
+ out[currentPos++] = (val << 16) >>> 31;
+ out[currentPos++] = (val << 17) >>> 31;
+ out[currentPos++] = (val << 18) >>> 31;
+ out[currentPos++] = (val << 19) >>> 31;
+ out[currentPos++] = (val << 20) >>> 31;
+ out[currentPos++] = (val << 21) >>> 31;
+ out[currentPos++] = (val << 22) >>> 31;
+ out[currentPos++] = (val << 23) >>> 31; // 20
+ out[currentPos++] = (val << 24) >>> 31;
+ out[currentPos++] = (val << 25) >>> 31;
+ out[currentPos++] = (val << 26) >>> 31;
+ out[currentPos++] = (val << 27) >>> 31;
+ out[currentPos++] = (val << 28) >>> 31;
+ out[currentPos++] = (val << 29) >>> 31;
+ out[currentPos++] = (val << 30) >>> 31;
+ out[currentPos++] = (val << 31) >>> 31;
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ out[currentPos++] = (val << 4) >>> 30;
+ out[currentPos++] = (val << 6) >>> 30;
+ out[currentPos++] = (val << 8) >>> 30;
+ out[currentPos++] = (val << 10) >>> 30;
+ out[currentPos++] = (val << 12) >>> 30;
+ out[currentPos++] = (val << 14) >>> 30;
+ out[currentPos++] = (val << 16) >>> 30;
+ out[currentPos++] = (val << 18) >>> 30;
+ out[currentPos++] = (val << 20) >>> 30;
+ out[currentPos++] = (val << 22) >>> 30; // 10
+ out[currentPos++] = (val << 24) >>> 30;
+ out[currentPos++] = (val << 26) >>> 30;
+ out[currentPos++] = (val << 28) >>> 30;
+ out[currentPos++] = (val << 30) >>> 30;
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ out[currentPos++] = (val << 5) >>> 29;
+ out[currentPos++] = (val << 8) >>> 29;
+ out[currentPos++] = (val << 11) >>> 29;
+ out[currentPos++] = (val << 14) >>> 29;
+ out[currentPos++] = (val << 17) >>> 29;
+ out[currentPos++] = (val << 20) >>> 29;
+ out[currentPos++] = (val << 23) >>> 29;
+ out[currentPos++] = (val << 26) >>> 29;
+ out[currentPos++] = (val << 29) >>> 29;
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ out[currentPos++] = (val << 4) >>> 28;
+ out[currentPos++] = (val << 8) >>> 28;
+ out[currentPos++] = (val << 12) >>> 28;
+ out[currentPos++] = (val << 16) >>> 28;
+ out[currentPos++] = (val << 20) >>> 28;
+ out[currentPos++] = (val << 24) >>> 28;
+ out[currentPos++] = (val << 28) >>> 28;
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ out[currentPos++] = (val << 7) >>> 27;
+ out[currentPos++] = (val << 12) >>> 27;
+ out[currentPos++] = (val << 17) >>> 27;
+ out[currentPos++] = (val << 22) >>> 27;
+ out[currentPos++] = (val << 27) >>> 27;
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ out[currentPos++] = (val << 4) >>> 25;
+ out[currentPos++] = (val << 11) >>> 25;
+ out[currentPos++] = (val << 18) >>> 25;
+ out[currentPos++] = (val << 25) >>> 25;
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ out[currentPos++] = (val << 5) >>> 23;
+ out[currentPos++] = (val << 14) >>> 23;
+ out[currentPos++] = (val << 23) >>> 23;
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ out[currentPos++] = (val << 4) >>> 18;
+ out[currentPos++] = (val << 18) >>> 18;
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen: limited to 28-bit integers");
+ }
+ }
}
+ while (currentPos < finalout) {
+ int val = in[tmpinpos++];
+ int header = val >>> 28;
+ switch (header) {
+ case 0: { // number : 28, bitwidth : 1
+ final int howmany = finalout - currentPos;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (k + 4)) >>> 31;
+ }
+ break;
+ }
+ case 1: { // number : 14, bitwidth : 2
+ final int howmany = finalout - currentPos < 14 ? finalout - currentPos : 14;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (2 * k + 4)) >>> 30;
+ }
+ break;
+ }
+ case 2: { // number : 9, bitwidth : 3
+ final int howmany = finalout - currentPos < 9 ? finalout - currentPos : 9;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (3 * k + 5)) >>> 29;
+ }
+ break;
+ }
+ case 3: { // number : 7, bitwidth : 4
+ final int howmany = finalout - currentPos < 7 ? finalout - currentPos : 7;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (4 * k + 4)) >>> 28;
+ }
+ break;
+ }
+ case 4: { // number : 5, bitwidth : 5
+ final int howmany = finalout - currentPos < 5 ? finalout - currentPos : 5;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (5 * k + 7)) >>> 27;
+ }
+ break;
+ }
+ case 5: { // number : 4, bitwidth : 7
+ final int howmany = finalout - currentPos < 4 ? finalout - currentPos : 4;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (7 * k + 4)) >>> 25;
+ }
+ break;
+ }
+ case 6: { // number : 3, bitwidth : 9
+ final int howmany = finalout - currentPos < 3 ? finalout - currentPos : 3;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (9 * k + 5)) >>> 23;
+ }
+ break;
+ }
+ case 7: { // number : 2, bitwidth : 14
+ final int howmany = finalout - currentPos < 2 ? finalout - currentPos : 2;
+ for (int k = 0; k < howmany; ++k) {
+ out[currentPos++] = (val << (14 * k + 4)) >>> 18;
+ }
+ break;
+ }
+ case 8: { // number : 1, bitwidth : 28
+ out[currentPos++] = (val << 4) >>> 4;
+ break;
+ }
+ default: {
+ throw new RuntimeException("shouldn't happen");
+ }
+ }
+ }
+ outpos.set(currentPos);
+ inpos.set(tmpinpos);
- @Override
- public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
- IntWrapper outpos) {
- if (inlength == 0)
- return;
- final int outlength = in[inpos.get()];
- inpos.increment();
- headlessUncompress(in, inpos, inlength, out, outpos, outlength);
+ }
- }
- private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ compressedPositions.add(inlength);
+ return inlength;
+ }
- private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+ @Override
+ public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+ if (inlength == 0)
+ return;
+ out[outpos.get()] = inlength;
+ outpos.increment();
+ headlessCompress(in, inpos, inlength, out, outpos);
+ }
- @Override
- public String toString() {
- return this.getClass().getSimpleName();
- }
+ @Override
+ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
+ if (inlength == 0)
+ return;
+ final int outlength = in[inpos.get()];
+ inpos.increment();
+ headlessUncompress(in, inpos, inlength, out, outpos, outlength);
+
+ }
+
+ private final static int bitLength[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+
+ private final static int codeNum[] = { 28, 14, 9, 7, 5, 4, 3, 2, 1 };
+
+ @Override
+ public String toString() {
+ return this.getClass().getSimpleName();
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/SkippableComposition.java b/src/main/java/me/lemire/integercompression/SkippableComposition.java
index ed0f0de..fc3c18e 100644
--- a/src/main/java/me/lemire/integercompression/SkippableComposition.java
+++ b/src/main/java/me/lemire/integercompression/SkippableComposition.java
@@ -38,7 +38,12 @@ public SkippableComposition(SkippableIntegerCODEC f1,
public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos) {
int init = inpos.get();
+ int outposInit = outpos.get();
F1.headlessCompress(in, inpos, inlength, out, outpos);
+ if (outpos.get() == outposInit) {
+ out[outposInit] = 0;
+ outpos.increment();
+ }
inlength -= inpos.get() - init;
F2.headlessCompress(in, inpos, inlength, out, outpos);
}
@@ -47,12 +52,27 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num) {
int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessUncompress(in, inpos, inlength, out, outpos, num);
+ if (inpos.get() == init) {
+ inpos.increment();
+ }
inlength -= inpos.get() - init;
- num -= outpos.get();
+ num -= outpos.get() - outposInit;
F2.headlessUncompress(in, inpos, inlength, out, outpos, num);
}
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ int init = compressedPositions.get();
+ int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+ maxLength += 1; // Add +1 for the potential F2 header. Question: is this header actually needed in the headless version?
+ inlength -= compressedPositions.get() - init;
+ maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength);
+ return maxLength;
+ }
+
@Override
public String toString() {
return F1.toString() + "+" + F2.toString();
diff --git a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
index c10d2f0..b9bdc04 100644
--- a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java
@@ -10,10 +10,11 @@
/**
* Interface describing a standard CODEC to compress integers. This is a
- * variation on the IntegerCODEC interface meant to be used for random access.
+ * variation on the IntegerCODEC interface meant to be used for random access
+ * (i.e., given a large array, you can segment it and decode just the subarray you need).
*
- * The main difference is that we must specify the number of integers we wish to
- * decode. This information should be stored elsewhere.
+ * The main difference is that you must specify the number of integers you wish to
+ * uncompress. This information should be stored elsewhere.
*
* This interface was designed by the Terrier team for their search engine.
*
@@ -25,14 +26,17 @@ public interface SkippableIntegerCODEC {
* Compress data from an array to another array.
*
* Both inpos and outpos are modified to represent how much data was read
- * and written to if 12 ints (inlength = 12) are compressed to 3 ints, then
- * inpos will be incremented by 12 while outpos will be incremented by 3 we
+ * and written to. If 12 ints (inlength = 12) are compressed to 3 ints, then
+ * inpos will be incremented by 12 while outpos will be incremented by 3. We
* use IntWrapper to pass the values by reference.
*
+ * Implementation note: contrary to {@link IntegerCODEC#compress},
+ * this may skip writing information about the number of encoded integers.
+ *
* @param in
* input array
* @param inpos
- * location in the input array
+ * where to start reading in the array
* @param inlength
* how many integers to compress
* @param out
@@ -56,13 +60,30 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
* @param inlength
* length of the compressed data (ignored by some schemes)
* @param out
- * array where to write the compressed output
+ * array where to write the uncompressed output
* @param outpos
- * where to write the compressed output in out
+ * where to start writing the uncompressed output in out
* @param num
- * number of integers we want to decode, the actual number of integers decoded can be less
+ * number of integers we want to decode. May be less than the actual number of compressed integers
*/
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num);
+ /**
+ * Compute the maximum number of integers that might be required to store
+ * the compressed form of a given input array segment, without headers.
+ *
+ * This is useful to pre-allocate the output buffer before calling
+ * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper)}.
+ *
+ *
+ * @param compressedPositions
+ * since not all schemes compress every input integer, this parameter
+ * returns how many input integers will actually be compressed.
+ * This is useful when composing multiple schemes.
+ * @param inlength
+ * number of integers to be compressed
+ * @return the maximum number of integers needed in the output array
+ */
+ int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
}
diff --git a/src/main/java/me/lemire/integercompression/Util.java b/src/main/java/me/lemire/integercompression/Util.java
index 70e46b7..63fc918 100644
--- a/src/main/java/me/lemire/integercompression/Util.java
+++ b/src/main/java/me/lemire/integercompression/Util.java
@@ -13,6 +13,14 @@
*
*/
public final class Util {
+
+
+
+ // check whether x is smaller than or equal to y as unsigned ints (Java 8+ offers Integer.compareUnsigned natively)
+ protected static final boolean smallerorequalthan(int x, int y) {
+ return (x + Integer.MIN_VALUE) <= (y + Integer.MIN_VALUE);
+ }
+
/**
* Compute the maximum of the integer logarithms (ceil(log(x+1)) of a range
* of value
diff --git a/src/main/java/me/lemire/integercompression/VariableByte.java b/src/main/java/me/lemire/integercompression/VariableByte.java
index 8e3ce12..c9b04d0 100644
--- a/src/main/java/me/lemire/integercompression/VariableByte.java
+++ b/src/main/java/me/lemire/integercompression/VariableByte.java
@@ -21,6 +21,8 @@
*/
public class VariableByte implements IntegerCODEC, ByteIntegerCODEC, SkippableIntegerCODEC {
+ private static final int MAX_BYTES_PER_INT = 5;
+
private static byte extract7bits(int i, long val) {
return (byte) ((val >> (7 * i)) & ((1 << 7) - 1));
}
@@ -39,7 +41,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
IntWrapper outpos) {
if (inlength == 0)
return;
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = in[k] & 0xFFFFFFFFL; // To be consistent with
@@ -122,8 +124,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
for (int v = 0, shift = 0; p < finalp;) {
val = in[p];
int c = (byte) (val >>> s);
+ // Shift to next byte
s += 8;
+ // Shift to next integer if s==32
p += s>>5;
+ // cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
@@ -187,8 +192,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
val = in[p];
int c = val >>> s;
+ // Shift to next byte
s += 8;
+ // Shift to next integer if s==32
p += s>>5;
+ // cycle from 31 to 0
s = s & 31;
v += ((c & 127) << shift);
if ((c & 128) == 128) {
@@ -202,4 +210,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
inpos.set(p + (s!=0 ? 1 : 0));
}
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ int maxLengthInBytes = inlength * MAX_BYTES_PER_INT;
+ int maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES;
+ compressedPositions.add(inlength);
+ return maxLengthInInts;
+ }
+
+ /**
+ * Creates a new buffer of the requested size.
+ *
+ * In case you need a different way to allocate buffers, you can override this method
+ * with a custom behavior. The default implementation allocates a new Java direct
+ * {@link ByteBuffer} on each invocation.
+ *
+ * @param sizeInBytes
+ * @return
+ */
+ protected ByteBuffer makeBuffer(int sizeInBytes) {
+ return ByteBuffer.allocateDirect(sizeInBytes);
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
index 847a28a..ef4a386 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/Benchmark.java
@@ -15,6 +15,7 @@
import me.lemire.integercompression.DeltaZigzagVariableByte;
import me.lemire.integercompression.FastPFOR;
import me.lemire.integercompression.FastPFOR128;
+import me.lemire.integercompression.GroupSimple9;
import me.lemire.integercompression.IntWrapper;
import me.lemire.integercompression.IntegerCODEC;
import me.lemire.integercompression.JustCopy;
@@ -24,6 +25,7 @@
import me.lemire.integercompression.OptPFD;
import me.lemire.integercompression.OptPFDS16;
import me.lemire.integercompression.OptPFDS9;
+import me.lemire.integercompression.Simple16;
import me.lemire.integercompression.Simple9;
import me.lemire.integercompression.VariableByte;
import me.lemire.integercompression.differential.Delta;
@@ -153,7 +155,7 @@ private static void testCodec(PrintWriter csvLog, int sparsity,
+ data[k][m]
+ " found "
+ decompressBuffer[m]
- + " at " + m);
+ + " at " + m + " out of " + outpos.get());
}
}
}
@@ -306,10 +308,10 @@ private static void testByteCodec(PrintWriter csvLog, int sparsity,
public static void main(String args[]) throws FileNotFoundException {
System.out
.println("# benchmark based on the ClusterData model from:");
- System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
- System.out.println("# Index compression using 64-bit words.");
+ System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
+ System.out.println("# Index compression using 64-bit words.");
System.out
- .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+ .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
System.out.println();
PrintWriter writer = null;
@@ -487,7 +489,6 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
int[][] data = generateTestData(cdg, N, nbr, sparsity);
System.out.println("# generating random data... ok.");
-
testCodec(csvLog, sparsity, new Composition(
new FastPFOR128(), new VariableByte()), data,
repeat, false);
@@ -635,6 +636,14 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
System.out.println();
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new Simple16(), data,
+ repeat, true);
+ System.out.println();
+
testCodec(csvLog, sparsity, new Simple9(), data,
repeat, false);
testCodec(csvLog, sparsity, new Simple9(), data,
@@ -643,6 +652,14 @@ private static void test(PrintWriter csvLog, int N, int nbr, int repeat) {
repeat, true);
System.out.println();
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, false);
+ testCodec(csvLog, sparsity, new GroupSimple9(), data,
+ repeat, true);
+ System.out.println();
+
{
IntegerCODEC c = new Composition(
new XorBinaryPacking(),
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
index d9243bd..c31411d 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkOffsettedSeries.java
@@ -88,7 +88,7 @@ private static void benchmarkSine(final PrintWriter csvWriter,
final IntegerCODEC[] codecs, final int count, final int length, final int mean,
final int range, final int freq) {
String dataProp = String.format(
- "(mean=%1$d range=%2$d freq=%2$d)", mean, range, freq);
+ "(mean=%1$d range=%2$d freq=%3$d)", mean, range, freq);
int[][] data = generateSineDataChunks(0, count, length, mean,
range, freq);
benchmark(csvWriter, "Sine " + dataProp, codecs, data,
diff --git a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
index 58bbc4a..b930568 100644
--- a/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
+++ b/src/main/java/me/lemire/integercompression/benchmarktools/BenchmarkSkippable.java
@@ -83,7 +83,6 @@ private static int decompressFromSkipTable(Object c, int[] compressed,
if (num > length - uncomppos.get())
num = length - uncomppos.get();
int location = metadata[metapos++];
- // System.out.println("location = "+location);
int initvalue = metadata[metapos++];
int outputlocation = uncomppos.get();
if (location != compressedpos.get())
@@ -242,10 +241,10 @@ private static void testCodec(PrintWriter csvLog, int sparsity, Object c,
*/
public static void main(String args[]) throws FileNotFoundException {
System.out.println("# benchmark based on the ClusterData model from:");
- System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
- System.out.println("# Index compression using 64-bit words.");
+ System.out.println("# Vo Ngoc Anh and Alistair Moffat. ");
+ System.out.println("# Index compression using 64-bit words.");
System.out
- .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
+ .println("# Softw. Pract. Exper.40, 2 (February 2010), 131-147. ");
System.out.println();
PrintWriter writer = null;
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
index 0dd2a96..f50a367 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedBinaryPacking.java
@@ -7,6 +7,7 @@
package me.lemire.integercompression.differential;
+import me.lemire.integercompression.BitPacking;
import me.lemire.integercompression.IntWrapper;
import me.lemire.integercompression.Util;
@@ -48,7 +49,8 @@
public class IntegratedBinaryPacking implements IntegratedIntegerCODEC,
SkippableIntegratedIntegerCODEC {
- static final int BLOCK_SIZE = 32;
+ public static final int BLOCK_SIZE = 32;
+ private static final int MAX_BIT_WIDTH = Integer.SIZE;
@Override
public void compress(int[] in, IntWrapper inpos, int inlength, int[] out,
@@ -83,6 +85,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int tmpoutpos = outpos.get();
+
int initoffset = initvalue.get();
initvalue.set(in[inpos.get()+inlength -1]);
int s = inpos.get();
@@ -168,4 +171,13 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
initvalue.set(initoffset);
inpos.set(tmpinpos);
}
+
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ int blockCount = inlength / BLOCK_SIZE;
+ int headersSizeInInts = blockCount / Integer.BYTES + (blockCount % Integer.BYTES);
+ int blocksSizeInInts = blockCount * MAX_BIT_WIDTH;
+ compressedPositions.add(blockCount * BLOCK_SIZE);
+ return headersSizeInInts + blocksSizeInInts;
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
index 652c018..1d935c4 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedIntCompressor.java
@@ -6,7 +6,9 @@
/**
* This is a convenience class that wraps a codec to provide
- * a "friendly" API.
+ * a "friendly" API. It is useful to compress sorted integers.
+ * If your integers are not sorted (not even nearly so), please
+ * consider the IntCompressor class instead.
*
*/
public class IntegratedIntCompressor {
@@ -35,12 +37,12 @@ public IntegratedIntCompressor() {
* @return compressed array
*/
public int[] compress(int[] input) {
- int [] compressed = new int[input.length+1024];
+ int maxCompressedLength = codec.maxHeadlessCompressedLength(new IntWrapper(0), input.length);
+ int [] compressed = new int[maxCompressedLength + 1]; // +1 to store the length of the input
compressed[0] = input.length;
IntWrapper outpos = new IntWrapper(1);
IntWrapper initvalue = new IntWrapper(0);
- codec.headlessCompress(input, new IntWrapper(0),
- input.length, compressed, outpos, initvalue);
+ codec.headlessCompress(input, new IntWrapper(0), input.length, compressed, outpos, initvalue);
compressed = Arrays.copyOf(compressed,outpos.intValue());
return compressed;
}
diff --git a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
index 4352ebb..a577031 100644
--- a/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
+++ b/src/main/java/me/lemire/integercompression/differential/IntegratedVariableByte.java
@@ -24,6 +24,8 @@
public class IntegratedVariableByte implements IntegratedIntegerCODEC, IntegratedByteIntegerCODEC,
SkippableIntegratedIntegerCODEC {
+ private static final int MAX_BYTES_PER_INT = 5;
+
private static byte extract7bits(int i, long val) {
return (byte)((val >> (7 * i)) & ((1 << 7) - 1));
}
@@ -38,7 +40,7 @@ public void compress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int initoffset = 0;
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = (in[k] - initoffset) & 0xFFFFFFFFL; // To be consistent with unsigned integers in C/C++
@@ -187,7 +189,7 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
return;
int initoffset = initvalue.get();
initvalue.set(in[inpos.get()+inlength -1]);
- ByteBuffer buf = ByteBuffer.allocateDirect(inlength * 8);
+ ByteBuffer buf = makeBuffer(inlength * 8);
buf.order(ByteOrder.LITTLE_ENDIAN);
for (int k = inpos.get(); k < inpos.get() + inlength; ++k) {
final long val = (in[k] - initoffset) & 0xFFFFFFFFL; // To be consistent with unsigned integers in C/C++
@@ -229,18 +231,22 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
int[] out, IntWrapper outpos, int num, IntWrapper initvalue) {
int s = 0;
int val = 0;
+
int p = inpos.get();
int initoffset = initvalue.get();
int tmpoutpos = outpos.get();
int finaloutpos = num + tmpoutpos;
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
+
val = in[p];
- int c = val >>> s;
+ int c = (byte) (val >>> s);
s += 8;
p += s>>5;
s = s & 31;
v += ((c & 127) << shift);
+
if ((c & 128) == 128) {
+
out[tmpoutpos++] = (initoffset = initoffset + v);
v = 0;
shift = 0;
@@ -253,4 +259,22 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
inpos.set(p + (s!=0 ? 1 : 0));
}
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ int maxLengthInBytes = inlength * MAX_BYTES_PER_INT;
+ int maxLengthInInts = (maxLengthInBytes + Integer.BYTES - 1) / Integer.BYTES;
+ compressedPositions.add(inlength);
+ return maxLengthInInts;
+ }
+
+ /**
+ * Creates a new buffer of the requested size.
+ *
+ * In case you need a different way to allocate buffers, you can override this method
+ * with a custom behavior. The default implementation allocates a new Java direct
+ * {@link ByteBuffer} on each invocation.
+ */
+ protected ByteBuffer makeBuffer(int sizeInBytes) {
+ return ByteBuffer.allocateDirect(sizeInBytes);
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
index 2dd79a4..4786ec5 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedComposition.java
@@ -49,9 +49,11 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
final int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessCompress(in, inpos, inlength, out, outpos, initvalue);
- if (outpos.get() == 0) {
- out[0] = 0;
+ if (outpos.get() == outposInit) {
+ out[outposInit] = 0;
outpos.increment();
}
inlength -= inpos.get() - init;
@@ -64,10 +66,25 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength,
if (inlength == 0)
return;
int init = inpos.get();
+ int outposInit = outpos.get();
+
F1.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
+ if (inpos.get() == init) {
+ inpos.increment();
+ }
inlength -= inpos.get() - init;
- num -= outpos.get();
+
+ num -= outpos.get() - outposInit;
F2.headlessUncompress(in, inpos, inlength, out, outpos,num,initvalue);
}
+ @Override
+ public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
+ int init = compressedPositions.get();
+ int maxLength = F1.maxHeadlessCompressedLength(compressedPositions, inlength);
+ maxLength += 1; // Add +1 for the potential F2 header. Question: is this header actually needed in the headless version?
+ inlength -= compressedPositions.get() - init;
+ maxLength += F2.maxHeadlessCompressedLength(compressedPositions, inlength);
+ return maxLength;
+ }
}
diff --git a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
index 8b7fd4b..e2df754 100644
--- a/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
+++ b/src/main/java/me/lemire/integercompression/differential/SkippableIntegratedIntegerCODEC.java
@@ -71,4 +71,21 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
IntWrapper outpos, int num, IntWrapper initvalue);
+ /**
+ * Compute the maximum number of integers that might be required to store
+ * the compressed form of a given input array segment, without headers.
+ *
+ * This is useful to pre-allocate the output buffer before calling
+ * {@link #headlessCompress(int[], IntWrapper, int, int[], IntWrapper, IntWrapper)}.
+ *
+ *
+ * @param compressedPositions
+ * since not all schemes compress every input integer, this parameter
+ * returns how many input integers will actually be compressed.
+ * This is useful when composing multiple schemes.
+ * @param inlength
+ * number of integers to be compressed
+ * @return the maximum number of integers needed in the output array
+ */
+ int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength);
}
diff --git a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
index bbd386a..a50497c 100644
--- a/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
+++ b/src/main/java/me/lemire/integercompression/synth/UniformDataGenerator.java
@@ -42,7 +42,7 @@ int[] generateUniformHash(int N, int Max) {
int[] ans = new int[N];
HashSet s = new HashSet();
while (s.size() < N)
- s.add(new Integer(this.rand.nextInt(Max)));
+ s.add(this.rand.nextInt(Max));
Iterator i = s.iterator();
for (int k = 0; k < N; ++k)
ans[k] = i.next().intValue();
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
new file mode 100644
index 0000000..9b2e1ca
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java
@@ -0,0 +1,12790 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * Vectorized bitpacking routines. This class is a version of the
+ * VectorBitPackerTerse class that uses fewer branch instructions.
+ *
+ * The code is machine generated from VectorBitPackerTerse.java using helper
+ * classes.
+ *
+ */
+public class VectorBitPacker {
+ private static final VectorSpecies SPECIES_512 =
+ IntVector.SPECIES_512; // used for even bit widths (see MASK_2, fastpack2, ...)
+ private static final VectorSpecies SPECIES_256 =
+ IntVector.SPECIES_256; // used for odd bit widths (see MASK_1, fastpack1, ...)
+ private static final int VLEN_512 = 16; // int lanes in a 512-bit vector (512 / 32)
+ private static final int VLEN_256 = 8; // int lanes in a 256-bit vector (256 / 32)
+ private static final int BLOCK_SIZE = 256; // integers handled per fastpack/fastunpack call
+
+ private static final IntVector MASK_1 =
+ IntVector.broadcast(SPECIES_256, (1 << 1) - 1); // MASK_b: every lane holds the low-b-bits mask
+ private static final IntVector MASK_2 =
+ IntVector.broadcast(SPECIES_512, (1 << 2) - 1); // even b -> 512-bit species, odd b -> 256-bit
+ private static final IntVector MASK_3 =
+ IntVector.broadcast(SPECIES_256, (1 << 3) - 1);
+ private static final IntVector MASK_4 =
+ IntVector.broadcast(SPECIES_512, (1 << 4) - 1);
+ private static final IntVector MASK_5 =
+ IntVector.broadcast(SPECIES_256, (1 << 5) - 1);
+ private static final IntVector MASK_6 =
+ IntVector.broadcast(SPECIES_512, (1 << 6) - 1);
+ private static final IntVector MASK_7 =
+ IntVector.broadcast(SPECIES_256, (1 << 7) - 1);
+ private static final IntVector MASK_8 =
+ IntVector.broadcast(SPECIES_512, (1 << 8) - 1);
+ private static final IntVector MASK_9 =
+ IntVector.broadcast(SPECIES_256, (1 << 9) - 1);
+ private static final IntVector MASK_10 =
+ IntVector.broadcast(SPECIES_512, (1 << 10) - 1);
+ private static final IntVector MASK_11 =
+ IntVector.broadcast(SPECIES_256, (1 << 11) - 1);
+ private static final IntVector MASK_12 =
+ IntVector.broadcast(SPECIES_512, (1 << 12) - 1);
+ private static final IntVector MASK_13 =
+ IntVector.broadcast(SPECIES_256, (1 << 13) - 1);
+ private static final IntVector MASK_14 =
+ IntVector.broadcast(SPECIES_512, (1 << 14) - 1);
+ private static final IntVector MASK_15 =
+ IntVector.broadcast(SPECIES_256, (1 << 15) - 1);
+ private static final IntVector MASK_16 =
+ IntVector.broadcast(SPECIES_512, (1 << 16) - 1);
+ private static final IntVector MASK_17 =
+ IntVector.broadcast(SPECIES_256, (1 << 17) - 1);
+ private static final IntVector MASK_18 =
+ IntVector.broadcast(SPECIES_512, (1 << 18) - 1);
+ private static final IntVector MASK_19 =
+ IntVector.broadcast(SPECIES_256, (1 << 19) - 1);
+ private static final IntVector MASK_20 =
+ IntVector.broadcast(SPECIES_512, (1 << 20) - 1);
+ private static final IntVector MASK_21 =
+ IntVector.broadcast(SPECIES_256, (1 << 21) - 1);
+ private static final IntVector MASK_22 =
+ IntVector.broadcast(SPECIES_512, (1 << 22) - 1);
+ private static final IntVector MASK_23 =
+ IntVector.broadcast(SPECIES_256, (1 << 23) - 1);
+ private static final IntVector MASK_24 =
+ IntVector.broadcast(SPECIES_512, (1 << 24) - 1);
+ private static final IntVector MASK_25 =
+ IntVector.broadcast(SPECIES_256, (1 << 25) - 1);
+ private static final IntVector MASK_26 =
+ IntVector.broadcast(SPECIES_512, (1 << 26) - 1);
+ private static final IntVector MASK_27 =
+ IntVector.broadcast(SPECIES_256, (1 << 27) - 1);
+ private static final IntVector MASK_28 =
+ IntVector.broadcast(SPECIES_512, (1 << 28) - 1);
+ private static final IntVector MASK_29 =
+ IntVector.broadcast(SPECIES_256, (1 << 29) - 1);
+ private static final IntVector MASK_30 =
+ IntVector.broadcast(SPECIES_512, (1 << 30) - 1);
+ private static final IntVector MASK_31 =
+ IntVector.broadcast(SPECIES_256, (1 << 31) - 1); // 1 << 31 is Integer.MIN_VALUE; minus 1 wraps to 0x7FFFFFFF
+
+ /**
+ * Pack one block of 256 integers ({@code BLOCK_SIZE}) at {@code b} bits each.
+ * Widths outside 0..32 are silently ignored (the switch has no default).
+ * @param in
+ * source array
+ * @param inpos
+ * position in source array
+ * @param out
+ * output array
+ * @param outpos
+ * position in output array
+ * @param b
+ * number of bits to use per integer
+ */
+ public static void fastpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ break; // zero-bit values need no storage: nothing is written
+ case 1:
+ fastpack1(in, inpos, out, outpos);
+ break;
+ case 2:
+ fastpack2(in, inpos, out, outpos);
+ break;
+ case 3:
+ fastpack3(in, inpos, out, outpos);
+ break;
+ case 4:
+ fastpack4(in, inpos, out, outpos);
+ break;
+ case 5:
+ fastpack5(in, inpos, out, outpos);
+ break;
+ case 6:
+ fastpack6(in, inpos, out, outpos);
+ break;
+ case 7:
+ fastpack7(in, inpos, out, outpos);
+ break;
+ case 8:
+ fastpack8(in, inpos, out, outpos);
+ break;
+ case 9:
+ fastpack9(in, inpos, out, outpos);
+ break;
+ case 10:
+ fastpack10(in, inpos, out, outpos);
+ break;
+ case 11:
+ fastpack11(in, inpos, out, outpos);
+ break;
+ case 12:
+ fastpack12(in, inpos, out, outpos);
+ break;
+ case 13:
+ fastpack13(in, inpos, out, outpos);
+ break;
+ case 14:
+ fastpack14(in, inpos, out, outpos);
+ break;
+ case 15:
+ fastpack15(in, inpos, out, outpos);
+ break;
+ case 16:
+ fastpack16(in, inpos, out, outpos);
+ break;
+ case 17:
+ fastpack17(in, inpos, out, outpos);
+ break;
+ case 18:
+ fastpack18(in, inpos, out, outpos);
+ break;
+ case 19:
+ fastpack19(in, inpos, out, outpos);
+ break;
+ case 20:
+ fastpack20(in, inpos, out, outpos);
+ break;
+ case 21:
+ fastpack21(in, inpos, out, outpos);
+ break;
+ case 22:
+ fastpack22(in, inpos, out, outpos);
+ break;
+ case 23:
+ fastpack23(in, inpos, out, outpos);
+ break;
+ case 24:
+ fastpack24(in, inpos, out, outpos);
+ break;
+ case 25:
+ fastpack25(in, inpos, out, outpos);
+ break;
+ case 26:
+ fastpack26(in, inpos, out, outpos);
+ break;
+ case 27:
+ fastpack27(in, inpos, out, outpos);
+ break;
+ case 28:
+ fastpack28(in, inpos, out, outpos);
+ break;
+ case 29:
+ fastpack29(in, inpos, out, outpos);
+ break;
+ case 30:
+ fastpack30(in, inpos, out, outpos);
+ break;
+ case 31:
+ fastpack31(in, inpos, out, outpos);
+ break;
+ case 32:
+ System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full-width values: plain copy of the block
+ break;
+ }
+ }
+
+ static void fastpackNoMask(final int[] in, int inpos, final int[] out,
+ int outpos, int b) { // same dispatch shape as fastpack, onto the fastpackNoMask* routines
+ switch (b) { // NOTE(review): callees are outside this view; presumably they skip the and(MASK_b) step - confirm
+ case 0:
+ break; // zero-bit values need no storage: nothing is written
+ case 1:
+ fastpackNoMask1(in, inpos, out, outpos);
+ break;
+ case 2:
+ fastpackNoMask2(in, inpos, out, outpos);
+ break;
+ case 3:
+ fastpackNoMask3(in, inpos, out, outpos);
+ break;
+ case 4:
+ fastpackNoMask4(in, inpos, out, outpos);
+ break;
+ case 5:
+ fastpackNoMask5(in, inpos, out, outpos);
+ break;
+ case 6:
+ fastpackNoMask6(in, inpos, out, outpos);
+ break;
+ case 7:
+ fastpackNoMask7(in, inpos, out, outpos);
+ break;
+ case 8:
+ fastpackNoMask8(in, inpos, out, outpos);
+ break;
+ case 9:
+ fastpackNoMask9(in, inpos, out, outpos);
+ break;
+ case 10:
+ fastpackNoMask10(in, inpos, out, outpos);
+ break;
+ case 11:
+ fastpackNoMask11(in, inpos, out, outpos);
+ break;
+ case 12:
+ fastpackNoMask12(in, inpos, out, outpos);
+ break;
+ case 13:
+ fastpackNoMask13(in, inpos, out, outpos);
+ break;
+ case 14:
+ fastpackNoMask14(in, inpos, out, outpos);
+ break;
+ case 15:
+ fastpackNoMask15(in, inpos, out, outpos);
+ break;
+ case 16:
+ fastpackNoMask16(in, inpos, out, outpos);
+ break;
+ case 17:
+ fastpackNoMask17(in, inpos, out, outpos);
+ break;
+ case 18:
+ fastpackNoMask18(in, inpos, out, outpos);
+ break;
+ case 19:
+ fastpackNoMask19(in, inpos, out, outpos);
+ break;
+ case 20:
+ fastpackNoMask20(in, inpos, out, outpos);
+ break;
+ case 21:
+ fastpackNoMask21(in, inpos, out, outpos);
+ break;
+ case 22:
+ fastpackNoMask22(in, inpos, out, outpos);
+ break;
+ case 23:
+ fastpackNoMask23(in, inpos, out, outpos);
+ break;
+ case 24:
+ fastpackNoMask24(in, inpos, out, outpos);
+ break;
+ case 25:
+ fastpackNoMask25(in, inpos, out, outpos);
+ break;
+ case 26:
+ fastpackNoMask26(in, inpos, out, outpos);
+ break;
+ case 27:
+ fastpackNoMask27(in, inpos, out, outpos);
+ break;
+ case 28:
+ fastpackNoMask28(in, inpos, out, outpos);
+ break;
+ case 29:
+ fastpackNoMask29(in, inpos, out, outpos);
+ break;
+ case 30:
+ fastpackNoMask30(in, inpos, out, outpos);
+ break;
+ case 31:
+ fastpackNoMask31(in, inpos, out, outpos);
+ break;
+ case 32:
+ System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full-width values: plain copy of the block
+ break;
+ } // no default branch: any other b is silently ignored
+ }
+
+ /**
+ * Unpack one block of 256 integers ({@code BLOCK_SIZE}) stored at {@code b}
+ * bits each. Widths outside 0..32 are silently ignored (no default branch).
+ * @param in
+ * source array
+ * @param inpos
+ * position in source array
+ * @param out
+ * output array
+ * @param outpos
+ * position in output array
+ * @param b
+ * number of bits to use per integer
+ */
+ public static void fastunpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ Arrays.fill(out, outpos, outpos + 256, 0); // zero-bit values decode to zeros; 256 == BLOCK_SIZE
+ break;
+ case 1:
+ fastunpack1(in, inpos, out, outpos);
+ break;
+ case 2:
+ fastunpack2(in, inpos, out, outpos);
+ break;
+ case 3:
+ fastunpack3(in, inpos, out, outpos);
+ break;
+ case 4:
+ fastunpack4(in, inpos, out, outpos);
+ break;
+ case 5:
+ fastunpack5(in, inpos, out, outpos);
+ break;
+ case 6:
+ fastunpack6(in, inpos, out, outpos);
+ break;
+ case 7:
+ fastunpack7(in, inpos, out, outpos);
+ break;
+ case 8:
+ fastunpack8(in, inpos, out, outpos);
+ break;
+ case 9:
+ fastunpack9(in, inpos, out, outpos);
+ break;
+ case 10:
+ fastunpack10(in, inpos, out, outpos);
+ break;
+ case 11:
+ fastunpack11(in, inpos, out, outpos);
+ break;
+ case 12:
+ fastunpack12(in, inpos, out, outpos);
+ break;
+ case 13:
+ fastunpack13(in, inpos, out, outpos);
+ break;
+ case 14:
+ fastunpack14(in, inpos, out, outpos);
+ break;
+ case 15:
+ fastunpack15(in, inpos, out, outpos);
+ break;
+ case 16:
+ fastunpack16(in, inpos, out, outpos);
+ break;
+ case 17:
+ fastunpack17(in, inpos, out, outpos);
+ break;
+ case 18:
+ fastunpack18(in, inpos, out, outpos);
+ break;
+ case 19:
+ fastunpack19(in, inpos, out, outpos);
+ break;
+ case 20:
+ fastunpack20(in, inpos, out, outpos);
+ break;
+ case 21:
+ fastunpack21(in, inpos, out, outpos);
+ break;
+ case 22:
+ fastunpack22(in, inpos, out, outpos);
+ break;
+ case 23:
+ fastunpack23(in, inpos, out, outpos);
+ break;
+ case 24:
+ fastunpack24(in, inpos, out, outpos);
+ break;
+ case 25:
+ fastunpack25(in, inpos, out, outpos);
+ break;
+ case 26:
+ fastunpack26(in, inpos, out, outpos);
+ break;
+ case 27:
+ fastunpack27(in, inpos, out, outpos);
+ break;
+ case 28:
+ fastunpack28(in, inpos, out, outpos);
+ break;
+ case 29:
+ fastunpack29(in, inpos, out, outpos);
+ break;
+ case 30:
+ fastunpack30(in, inpos, out, outpos);
+ break;
+ case 31:
+ fastunpack31(in, inpos, out, outpos);
+ break;
+ case 32:
+ System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); // full-width values: plain copy of the block
+ break;
+ }
+ }
+
+ public static int slowpack(final int[] in, int inpos, int inlen,
+ final int[] out, int outpos, int b) { // scalar packer: inlen values at b bits each
+ if (inlen == 0)
+ return outpos;
+ if (b == 32) {
+ System.arraycopy(in, inpos, out, outpos, inlen);
+ return outpos + inlen; // this shortcut returns one past the last word written
+ }
+ int mask = (1 << b) - 1; // low b bits set
+ int c = 0; // bits placed in the current word since it was started
+ int l = 0; // bits left over in the current word once no whole value fits
+ int r = 0; // low bits of the current word taken by the tail of a straddling value
+ int val = 0;
+ for (int i = 0; i < inlen; i++) {
+ val = in[inpos + i] & mask;
+ out[outpos] |= val << (c + r); // ORs in; NOTE(review): presumably a fresh word must be zero on entry - confirm
+ c += b;
+ l = (32 - r) % b; // NOTE(review): throws ArithmeticException when b == 0
+ if (c + r >= 32) { // the current word is full
+ if (i < inlen - 1 || l != 0)
+ outpos++; // more input follows, or val straddles into the next word
+ r = l == 0 ? 0 : b - l; // bits of val that did not fit
+ if (l != 0)
+ out[outpos] = val >> (b - r); // next word starts with the high r bits of val
+ c = 0;
+ }
+ }
+ return outpos; // index of the word written last, not one past it
+ }
+
+ public static int slowunpack(final int[] in, int inpos, final int[] out,
+ int outpos, int outlen, int b) { // scalar unpacker: outlen values at b bits each
+ if (outlen == 0) {
+ return inpos;
+ }
+ if (b == 32) {
+ System.arraycopy(in, inpos, out, outpos, outlen);
+ return inpos + outlen;
+ }
+ int mask = (1 << b) - 1; // low b bits set
+ int limit = outpos + outlen; // one past the last output slot to fill
+ int r = 0; // low bits of the current word that finish a straddling value
+ int val = 0;
+ int i = 0; // input words consumed so far
+ for (; outpos < limit; i++) {
+ if (r > 0) // stitch the straddler: top b - r bits of the previous word, low r bits of this one
+ out[outpos++] =
+ (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask);
+ val = in[inpos + i];
+ int j = 0;
+ int l = 32 - r; // bits of this word after the carried-over tail
+ int ll = l % b == 0 ? l : l - b; // bound so only whole values are read; NOTE(review): b == 0 divides by zero
+ while (j < ll && outpos < limit) {
+ out[outpos++] = (val >> (j + r)) & mask;
+ j += b;
+ }
+ r = l % b == 0 ? 0 : b - (l % b); // bits the next word must supply for the straddler
+ }
+ return inpos + i; // one past the last input word read
+ }
+
+ public static int numCompressedInts(int n, int b) {
+ int width = b % 2 == 0 ? VLEN_512 : VLEN_256; // even b -> 16 lanes, odd b -> 8 lanes
+ if (n <= width)
+ return n; // NOTE(review): short inputs return n itself, whatever b is - confirm intent
+ int intsPerVec = (32 / b) * width; // whole b-bit values per lane times lanes; NOTE(review): b == 0 divides by zero
+ int q = (n + intsPerVec - 1) / intsPerVec; // ceil(n / intsPerVec)
+ return q * width;
+ }
+
+ private static void fastpack1(final int[] in, int inpos, final int[] out,
+ int outpos) { // 32 loads x 8 lanes = 256 inputs, one 8-int vector out
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_1); // first load occupies bit 0 of every lane
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); // load j lands at bit j
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos); // the only store: out[outpos .. outpos + 7]
+ }
+
+ private static void fastpack2(final int[] in, int inpos, final int[] out,
+ int outpos) { // 16 loads x 16 lanes = 256 inputs, one 16-int vector out
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_2); // first load occupies bits 0-1 of every lane
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); // load j lands at bit 2 * j
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos); // the only store: out[outpos .. outpos + 15]
+ }
+
+ private static void fastpack3(final int[] in, int inpos, final int[] out,
+ int outpos) { // 32 loads x 8 lanes = 256 inputs, three 8-int vectors out
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_3);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 30).or(oV); // only the low 2 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 2); // carry the remaining high bit into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 31).or(oV); // only the low bit fits below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 1); // carry the remaining 2 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 29).or(oV); // 29 + 3 == 32: the block ends on a word boundary
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack4(final int[] in, int inpos, final int[] out,
+ int outpos) { // 16 loads x 16 lanes = 256 inputs, two 16-int vectors out
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_4);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); // 28 + 4 == 32: no value straddles
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = oV.zero(SPECIES_512); // nothing to carry over, so start from an all-zero accumulator
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack5(final int[] in, int inpos, final int[] out,
+ int outpos) { // 32 loads x 8 lanes = 256 inputs, five 8-int vectors out
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_5);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 30).or(oV); // only the low 2 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 2); // carry the remaining 3 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 28).or(oV); // only the low 4 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 4); // carry the remaining high bit into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 31).or(oV); // only the low bit fits below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 1); // carry the remaining 4 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 29).or(oV); // only the low 3 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 3); // carry the remaining 2 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 27).or(oV); // 27 + 5 == 32: the block ends on a word boundary
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack6(final int[] in, int inpos, final int[] out,
+ int outpos) { // 16 loads x 16 lanes = 256 inputs, three 16-int vectors out
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_6);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); // only the low 2 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); // carry the remaining 4 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); // only the low 4 bits fit below bit 32
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); // carry the remaining 2 high bits into the next vector
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); // 26 + 6 == 32: the block ends on a word boundary
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack7(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_7);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack8(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_8);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack9(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_9);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack10(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_10);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack11(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_11);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack12(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_12);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack13(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_13);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack14(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_14);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack15(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_15);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs one 16-bit field from each of 256 consecutive integers.
+  *
+  * <p>Sixteen vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240}. Every vector is masked with
+  * {@code MASK_16} (presumably the low 16 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in. Eight vectors are stored, starting at {@code outpos} and
+  * spaced {@code VLEN_512} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack16(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 16;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_512);
+   for (int offset = 0; offset < 256; offset += 16) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_512, in, inpos + offset).and(MASK_16);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_512;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_512)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 17-bit field from each of 256 consecutive integers.
+  *
+  * <p>Thirty-two vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 8}, ..., {@code inpos + 248}. Every vector is masked with
+  * {@code MASK_17} (presumably the low 17 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Seventeen vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_256} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack17(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 17;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_256);
+   for (int offset = 0; offset < 256; offset += 8) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_256, in, inpos + offset).and(MASK_17);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_256;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_256)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 18-bit field from each of 256 consecutive integers.
+  *
+  * <p>Sixteen vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240}. Every vector is masked with
+  * {@code MASK_18} (presumably the low 18 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Nine vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_512} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack18(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 18;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_512);
+   for (int offset = 0; offset < 256; offset += 16) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_512, in, inpos + offset).and(MASK_18);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_512;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_512)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 19-bit field from each of 256 consecutive integers.
+  *
+  * <p>Thirty-two vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 8}, ..., {@code inpos + 248}. Every vector is masked with
+  * {@code MASK_19} (presumably the low 19 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Nineteen vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_256} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack19(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 19;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_256);
+   for (int offset = 0; offset < 256; offset += 8) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_256, in, inpos + offset).and(MASK_19);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_256;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_256)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 20-bit field from each of 256 consecutive integers.
+  *
+  * <p>Sixteen vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240}. Every vector is masked with
+  * {@code MASK_20} (presumably the low 20 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Ten vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_512} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack20(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 20;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_512);
+   for (int offset = 0; offset < 256; offset += 16) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_512, in, inpos + offset).and(MASK_20);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_512;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word;
+       // after every eighth field nothing spills and the word starts empty.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_512)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 21-bit field from each of 256 consecutive integers.
+  *
+  * <p>Thirty-two vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 8}, ..., {@code inpos + 248}. Every vector is masked with
+  * {@code MASK_21} (presumably the low 21 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Twenty-one vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_256} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack21(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 21;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_256);
+   for (int offset = 0; offset < 256; offset += 8) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_256, in, inpos + offset).and(MASK_21);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_256;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_256)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ /**
+  * Packs one 22-bit field from each of 256 consecutive integers.
+  *
+  * <p>Sixteen vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240}. Every vector is masked with
+  * {@code MASK_22} (presumably the low 22 bits; confirm against its
+  * definition). Lanes are packed independently: each field is shifted left by
+  * the number of bits already used in that lane's current 32-bit output word
+  * and OR-ed in; bits that do not fit are carried into the next output word.
+  * Eleven vectors are stored, starting at {@code outpos} and spaced
+  * {@code VLEN_512} apart.
+  *
+  * @param in     source array
+  * @param inpos  index of the first integer to read
+  * @param out    destination array
+  * @param outpos index of the first integer to write
+  */
+ private static void fastpack22(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   final int width = 22;
+   // Bits already occupied in the output word that is being assembled.
+   int filled = 0;
+   IntVector packed = IntVector.zero(SPECIES_512);
+   for (int offset = 0; offset < 256; offset += 16) {
+     final IntVector field =
+         IntVector.fromArray(SPECIES_512, in, inpos + offset).and(MASK_22);
+     packed = field.lanewise(VectorOperators.LSHL, filled).or(packed);
+     filled += width;
+     if (filled >= 32) {
+       // The current output word is complete: flush it.
+       packed.intoArray(out, outpos);
+       outpos += VLEN_512;
+       filled -= 32;
+       // Bits of the last field that spilled past bit 31 open the next word.
+       packed = (filled == 0)
+           ? IntVector.zero(SPECIES_512)
+           : field.lanewise(VectorOperators.LSHR, width - filled);
+     }
+   }
+ }
+
+ private static void fastpack23(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_23);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs values from {@code in} into {@code out} using 24 bits per value.
+  *
+  * Sixteen vectors are loaded from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240} and masked with
+  * {@code MASK_24} (presumably the low 24 bits; confirm against its
+  * definition). Four 24-bit values fill exactly three 32-bit words, so the
+  * input is handled as four independent groups of four vectors, each group
+  * producing three vector stores (12 stores in total). The output index
+  * advances by {@code VLEN_512} after each store.
+  *
+  * NOTE(review): hot path; confirm with a benchmark that this
+  * constant-trip loop is as fast as straight-line code.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value to pack
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first packed word
+  */
+ private static void fastpack24(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   for (int group = 0; group < 256; group += 64) {
+     // Every group starts on a word boundary, so nothing is carried over
+     // from the previous group.
+     var carry = IntVector.fromArray(SPECIES_512, in, inpos + group).and(MASK_24);
+     for (int k = 1; k < 4; k++) {
+       var next = IntVector.fromArray(SPECIES_512, in, inpos + group + 16 * k)
+           .and(MASK_24);
+       // 'carry' occupies the low 32 - 8k bits; 'next' supplies the rest.
+       next.lanewise(VectorOperators.LSHL, 32 - 8 * k).or(carry)
+           .intoArray(out, outpos);
+       outpos += VLEN_512;
+       // Bits of 'next' that did not fit open the following word.
+       carry = next.lanewise(VectorOperators.LSHR, 8 * k);
+     }
+   }
+ }
+
+ private static void fastpack25(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_25);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack26(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_26);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpack27(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV.and(MASK_27);
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Packs values from {@code in} into {@code out} using 28 bits per value.
+  *
+  * Sixteen vectors are loaded from {@code in} at {@code inpos},
+  * {@code inpos + 16}, ..., {@code inpos + 240} and masked with
+  * {@code MASK_28} (presumably the low 28 bits; confirm against its
+  * definition). Eight 28-bit values fill exactly seven 32-bit words, so the
+  * input is handled as two independent groups of eight vectors, each group
+  * producing seven vector stores (14 stores in total). The output index
+  * advances by {@code VLEN_512} after each store.
+  *
+  * NOTE(review): hot path; confirm with a benchmark that this
+  * constant-trip loop is as fast as straight-line code.
+  *
+  * @param in     source array
+  * @param inpos  index in {@code in} of the first value to pack
+  * @param out    destination array
+  * @param outpos index in {@code out} of the first packed word
+  */
+ private static void fastpack28(final int[] in, int inpos, final int[] out,
+     int outpos) {
+   for (int group = 0; group < 256; group += 128) {
+     // Every group starts on a word boundary, so nothing is carried over
+     // from the previous group.
+     var carry = IntVector.fromArray(SPECIES_512, in, inpos + group).and(MASK_28);
+     for (int k = 1; k < 8; k++) {
+       var next = IntVector.fromArray(SPECIES_512, in, inpos + group + 16 * k)
+           .and(MASK_28);
+       // 'carry' occupies the low 32 - 4k bits; 'next' supplies the rest.
+       next.lanewise(VectorOperators.LSHL, 32 - 4 * k).or(carry)
+           .intoArray(out, outpos);
+       outpos += VLEN_512;
+       // Bits of 'next' that did not fit open the following word.
+       carry = next.lanewise(VectorOperators.LSHR, 4 * k);
+     }
+   }
+ }
+
+ /**
+  * Bit-packs 32 input vectors into 29 output vectors, keeping the bits
+  * selected by {@code MASK_29} of every value (presumably the low 29 bits;
+  * the mask is defined elsewhere in this class).
+  *
+  * <p>Input vectors are read at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 248}; output vectors are written at {@code outpos},
+  * advancing by {@code VLEN_256} after each store. Packing is lane-wise:
+  * lane {@code i} of every input vector ends up in lane {@code i} of the
+  * output vectors.
+  *
+  * @param in
+  *            source array
+  * @param inpos
+  *            index of the first value to read
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first packed word to write
+  */
+ private static void fastpack29(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Bit offset, inside the output word under construction, at which the
+     // next input group is inserted.
+     int bit = 29;
+     var word = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_29);
+
+     for (int k = 1; k < 32; k++) {
+         var group = IntVector.fromArray(SPECIES_256, in, inpos + 8 * k).and(MASK_29);
+         word = group.lanewise(VectorOperators.LSHL, bit).or(word);
+         bit += 29;
+         if (bit < 32) {
+             // The current word still has free bits; keep filling it.
+             continue;
+         }
+
+         word.intoArray(out, outpos);
+         bit -= 32;
+         if (bit == 0) {
+             // Happens only for the last group (k == 31): it ends exactly on
+             // a word boundary, so nothing is carried over.
+             break;
+         }
+         outpos += VLEN_256;
+         // Start the next word with the high bits of this group that did not fit.
+         word = group.lanewise(VectorOperators.LSHR, 29 - bit);
+     }
+ }
+
+ private static void fastpack30(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV.and(MASK_30);
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+  * Bit-packs 32 input vectors into 31 output vectors, keeping the bits
+  * selected by {@code MASK_31} of every value (presumably the low 31 bits;
+  * the mask is defined elsewhere in this class).
+  *
+  * <p>Input vectors are read at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 248}; output vectors are written at {@code outpos},
+  * advancing by {@code VLEN_256} after each store. Packing is lane-wise.
+  *
+  * @param in
+  *            source array
+  * @param inpos
+  *            index of the first value to read
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first packed word to write
+  */
+ private static void fastpack31(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var word = IntVector.fromArray(SPECIES_256, in, inpos).and(MASK_31);
+
+     // Every further group completes one output word: group k contributes
+     // its low k bits to the current word and the rest to the next one.
+     for (int k = 1; k < 32; k++) {
+         var group = IntVector.fromArray(SPECIES_256, in, inpos + 8 * k).and(MASK_31);
+
+         word = group.lanewise(VectorOperators.LSHL, 32 - k).or(word);
+         word.intoArray(out, outpos);
+
+         if (k < 31) {
+             outpos += VLEN_256;
+             // Bits of this group that spilled past the word boundary.
+             word = group.lanewise(VectorOperators.LSHR, k);
+         }
+     }
+ }
+
+ /**
+  * Bit-packs 32 input vectors into a single output vector, one bit per
+  * value, without masking the inputs first.
+  *
+  * <p>The input vector read at {@code inpos + 8 * k} is shifted left by
+  * {@code k} bits and OR-ed into the result, which is then stored at
+  * {@code outpos}. Because no mask is applied, any input bit above bit 0
+  * leaks into the positions of the following values.
+  *
+  * @param in
+  *            source array
+  * @param inpos
+  *            index of the first value to read
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index at which the packed word is written
+  */
+ private static void fastpackNoMask1(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+     for (int k = 1; k < 32; k++) {
+         var group = IntVector.fromArray(SPECIES_256, in, inpos + 8 * k);
+         word = group.lanewise(VectorOperators.LSHL, k).or(word);
+     }
+     word.intoArray(out, outpos);
+ }
+
+ /**
+  * Bit-packs 16 input vectors into a single output vector, two bits per
+  * value, without masking the inputs first.
+  *
+  * <p>The input vector read at {@code inpos + 16 * k} is shifted left by
+  * {@code 2 * k} bits and OR-ed into the result, which is then stored at
+  * {@code outpos}. Because no mask is applied, any input bit above the low
+  * two leaks into the positions of the following values.
+  *
+  * @param in
+  *            source array
+  * @param inpos
+  *            index of the first value to read
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index at which the packed word is written
+  */
+ private static void fastpackNoMask2(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     var word = IntVector.fromArray(SPECIES_512, in, inpos);
+     for (int k = 1; k < 16; k++) {
+         var group = IntVector.fromArray(SPECIES_512, in, inpos + 16 * k);
+         word = group.lanewise(VectorOperators.LSHL, 2 * k).or(word);
+     }
+     word.intoArray(out, outpos);
+ }
+
+ /**
+  * Bit-packs 32 input vectors into 3 output vectors, three bits per value,
+  * without masking the inputs first.
+  *
+  * <p>Input vectors are read at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 248}; output vectors are written at {@code outpos},
+  * advancing by {@code VLEN_256} after each store. Packing is lane-wise.
+  * Because no mask is applied, any input bit above the low three leaks into
+  * the positions of the following values.
+  *
+  * @param in
+  *            source array
+  * @param inpos
+  *            index of the first value to read
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first packed word to write
+  */
+ private static void fastpackNoMask3(final int[] in, int inpos,
+         final int[] out, int outpos) {
+     // Bit offset, inside the output word under construction, at which the
+     // next input group is inserted.
+     int bit = 3;
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+
+     for (int k = 1; k < 32; k++) {
+         var group = IntVector.fromArray(SPECIES_256, in, inpos + 8 * k);
+         word = group.lanewise(VectorOperators.LSHL, bit).or(word);
+         bit += 3;
+         if (bit < 32) {
+             // The current word still has free bits; keep filling it.
+             continue;
+         }
+
+         word.intoArray(out, outpos);
+         bit -= 32;
+         if (bit == 0) {
+             // Happens only for the last group (k == 31): it ends exactly on
+             // a word boundary, so nothing is carried over.
+             break;
+         }
+         outpos += VLEN_256;
+         // Start the next word with the high bits of this group that did not fit.
+         word = group.lanewise(VectorOperators.LSHR, 3 - bit);
+     }
+ }
+
+ private static void fastpackNoMask4(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask5(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask6(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask7(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask8(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask9(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask10(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask11(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask12(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask13(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask14(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ /**
+ * Bit-packs 32 input vectors of 15-bit values into 15 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 8 * k]} (k = 0..31) and
+ * output word w is written to {@code out[outpos + w * VLEN_256]}
+ * (w = 0..14). Within each lane the values are laid end to end, 15 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 15 bits. The stride of 8 presumes that
+ * SPECIES_256 has eight int lanes and that VLEN_256 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask15(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..2.
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 15));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 2..4.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 13));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + VLEN_256);
+
+ // Word 2: input vectors 4..6.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 11));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 2 * VLEN_256);
+
+ // Word 3: input vectors 6..8.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 9));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 3 * VLEN_256);
+
+ // Word 4: input vectors 8..10.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 7));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos + 4 * VLEN_256);
+
+ // Word 5: input vectors 10..12.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 5));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 5 * VLEN_256);
+
+ // Word 6: input vectors 12..14.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 3));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos + 6 * VLEN_256);
+
+ // Word 7: input vectors 14..17.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 1));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 31));
+ word.intoArray(out, outpos + 7 * VLEN_256);
+
+ // Word 8: input vectors 17..19.
+ word = lanes.lanewise(VectorOperators.LSHR, 1);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 29));
+ word.intoArray(out, outpos + 8 * VLEN_256);
+
+ // Word 9: input vectors 19..21.
+ word = lanes.lanewise(VectorOperators.LSHR, 3);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 27));
+ word.intoArray(out, outpos + 9 * VLEN_256);
+
+ // Word 10: input vectors 21..23.
+ word = lanes.lanewise(VectorOperators.LSHR, 5);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 25));
+ word.intoArray(out, outpos + 10 * VLEN_256);
+
+ // Word 11: input vectors 23..25.
+ word = lanes.lanewise(VectorOperators.LSHR, 7);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 23));
+ word.intoArray(out, outpos + 11 * VLEN_256);
+
+ // Word 12: input vectors 25..27.
+ word = lanes.lanewise(VectorOperators.LSHR, 9);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 21));
+ word.intoArray(out, outpos + 12 * VLEN_256);
+
+ // Word 13: input vectors 27..29.
+ word = lanes.lanewise(VectorOperators.LSHR, 11);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 19));
+ word.intoArray(out, outpos + 13 * VLEN_256);
+
+ // Word 14: input vectors 29..31.
+ word = lanes.lanewise(VectorOperators.LSHR, 13);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 17));
+ word.intoArray(out, outpos + 14 * VLEN_256);
+ }
+
+ /**
+ * Bit-packs 16 input vectors of 16-bit values into 8 output vectors.
+ *
+ * <p>Output word w (w = 0..7), written to
+ * {@code out[outpos + w * VLEN_512]}, holds input vector 2w in its low
+ * 16 bits and input vector 2w + 1 in its high 16 bits, where input vector
+ * k is read from {@code in[inpos + 16 * k]}. Because 16 divides 32, no
+ * value ever straddles a word boundary, so each word is built directly
+ * from its two loads with no carry and no zeroed accumulator.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 16 bits. The stride of 16 presumes that
+ * SPECIES_512 has sixteen int lanes and that VLEN_512 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask16(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector low = IntVector.fromArray(SPECIES_512, in, inpos);
+ IntVector high = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos);
+
+ // Word 1: input vectors 2..3.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + VLEN_512);
+
+ // Word 2: input vectors 4..5.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 2 * VLEN_512);
+
+ // Word 3: input vectors 6..7.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 3 * VLEN_512);
+
+ // Word 4: input vectors 8..9.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 4 * VLEN_512);
+
+ // Word 5: input vectors 10..11.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 5 * VLEN_512);
+
+ // Word 6: input vectors 12..13.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 6 * VLEN_512);
+
+ // Word 7: input vectors 14..15.
+ low = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ high = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ low.or(high.lanewise(VectorOperators.LSHL, 16)).intoArray(out, outpos + 7 * VLEN_512);
+ }
+
+ /**
+ * Bit-packs 32 input vectors of 17-bit values into 17 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 8 * k]} (k = 0..31) and
+ * output word w is written to {@code out[outpos + w * VLEN_256]}
+ * (w = 0..16). Within each lane the values are laid end to end, 17 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 17 bits. The stride of 8 presumes that
+ * SPECIES_256 has eight int lanes and that VLEN_256 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask17(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 17));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..3.
+ word = lanes.lanewise(VectorOperators.LSHR, 15);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 19));
+ word.intoArray(out, outpos + VLEN_256);
+
+ // Word 2: input vectors 3..5.
+ word = lanes.lanewise(VectorOperators.LSHR, 13);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 21));
+ word.intoArray(out, outpos + 2 * VLEN_256);
+
+ // Word 3: input vectors 5..7.
+ word = lanes.lanewise(VectorOperators.LSHR, 11);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 23));
+ word.intoArray(out, outpos + 3 * VLEN_256);
+
+ // Word 4: input vectors 7..9.
+ word = lanes.lanewise(VectorOperators.LSHR, 9);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 25));
+ word.intoArray(out, outpos + 4 * VLEN_256);
+
+ // Word 5: input vectors 9..11.
+ word = lanes.lanewise(VectorOperators.LSHR, 7);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 27));
+ word.intoArray(out, outpos + 5 * VLEN_256);
+
+ // Word 6: input vectors 11..13.
+ word = lanes.lanewise(VectorOperators.LSHR, 5);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 29));
+ word.intoArray(out, outpos + 6 * VLEN_256);
+
+ // Word 7: input vectors 13..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 3);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 31));
+ word.intoArray(out, outpos + 7 * VLEN_256);
+
+ // Word 8: input vectors 15..16.
+ word = lanes.lanewise(VectorOperators.LSHR, 1);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 8 * VLEN_256);
+
+ // Word 9: input vectors 16..18.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 1));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos + 9 * VLEN_256);
+
+ // Word 10: input vectors 18..20.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 3));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 10 * VLEN_256);
+
+ // Word 11: input vectors 20..22.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 5));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos + 11 * VLEN_256);
+
+ // Word 12: input vectors 22..24.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 7));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 12 * VLEN_256);
+
+ // Word 13: input vectors 24..26.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 9));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 13 * VLEN_256);
+
+ // Word 14: input vectors 26..28.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 11));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 14 * VLEN_256);
+
+ // Word 15: input vectors 28..30.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 13));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos + 15 * VLEN_256);
+
+ // Word 16: input vectors 30..31.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 15));
+ word.intoArray(out, outpos + 16 * VLEN_256);
+ }
+
+ /**
+ * Bit-packs 16 input vectors of 18-bit values into 9 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 16 * k]} (k = 0..15)
+ * and output word w is written to {@code out[outpos + w * VLEN_512]}
+ * (w = 0..8). Within each lane the values are laid end to end, 18 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 18 bits. The stride of 16 presumes that
+ * SPECIES_512 has sixteen int lanes and that VLEN_512 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask18(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..3.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos + VLEN_512);
+
+ // Word 2: input vectors 3..5.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 2 * VLEN_512);
+
+ // Word 3: input vectors 5..7.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos + 3 * VLEN_512);
+
+ // Word 4: input vectors 7..8.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 4 * VLEN_512);
+
+ // Word 5: input vectors 8..10.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 5 * VLEN_512);
+
+ // Word 6: input vectors 10..12.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 6 * VLEN_512);
+
+ // Word 7: input vectors 12..14.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 7 * VLEN_512);
+
+ // Word 8: input vectors 14..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ word.intoArray(out, outpos + 8 * VLEN_512);
+ }
+
+ /**
+ * Bit-packs 32 input vectors of 19-bit values into 19 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 8 * k]} (k = 0..31) and
+ * output word w is written to {@code out[outpos + w * VLEN_256]}
+ * (w = 0..18). Within each lane the values are laid end to end, 19 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 19 bits. The stride of 8 presumes that
+ * SPECIES_256 has eight int lanes and that VLEN_256 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask19(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 19));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..3.
+ word = lanes.lanewise(VectorOperators.LSHR, 13);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 25));
+ word.intoArray(out, outpos + VLEN_256);
+
+ // Word 2: input vectors 3..5.
+ word = lanes.lanewise(VectorOperators.LSHR, 7);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 31));
+ word.intoArray(out, outpos + 2 * VLEN_256);
+
+ // Word 3: input vectors 5..6.
+ word = lanes.lanewise(VectorOperators.LSHR, 1);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos + 3 * VLEN_256);
+
+ // Word 4: input vectors 6..8.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 5));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 4 * VLEN_256);
+
+ // Word 5: input vectors 8..10.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 11));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos + 5 * VLEN_256);
+
+ // Word 6: input vectors 10..11.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 17));
+ word.intoArray(out, outpos + 6 * VLEN_256);
+
+ // Word 7: input vectors 11..13.
+ word = lanes.lanewise(VectorOperators.LSHR, 15);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 23));
+ word.intoArray(out, outpos + 7 * VLEN_256);
+
+ // Word 8: input vectors 13..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 9);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 29));
+ word.intoArray(out, outpos + 8 * VLEN_256);
+
+ // Word 9: input vectors 15..16.
+ word = lanes.lanewise(VectorOperators.LSHR, 3);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 9 * VLEN_256);
+
+ // Word 10: input vectors 16..18.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 3));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos + 10 * VLEN_256);
+
+ // Word 11: input vectors 18..20.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 9));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 11 * VLEN_256);
+
+ // Word 12: input vectors 20..21.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 15));
+ word.intoArray(out, outpos + 12 * VLEN_256);
+
+ // Word 13: input vectors 21..23.
+ word = lanes.lanewise(VectorOperators.LSHR, 17);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 21));
+ word.intoArray(out, outpos + 13 * VLEN_256);
+
+ // Word 14: input vectors 23..25.
+ word = lanes.lanewise(VectorOperators.LSHR, 11);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 27));
+ word.intoArray(out, outpos + 14 * VLEN_256);
+
+ // Word 15: input vectors 25..26.
+ word = lanes.lanewise(VectorOperators.LSHR, 5);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ word.intoArray(out, outpos + 15 * VLEN_256);
+
+ // Word 16: input vectors 26..28.
+ word = lanes.lanewise(VectorOperators.LSHR, 18);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 1));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 16 * VLEN_256);
+
+ // Word 17: input vectors 28..30.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 7));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 17 * VLEN_256);
+
+ // Word 18: input vectors 30..31.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 13));
+ word.intoArray(out, outpos + 18 * VLEN_256);
+ }
+
+ /**
+ * Bit-packs 16 input vectors of 20-bit values into 10 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 16 * k]} (k = 0..15)
+ * and output word w is written to {@code out[outpos + w * VLEN_512]}
+ * (w = 0..9). Within each lane the values are laid end to end, 20 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ * Eight values fill exactly five words, so word 5 starts on a value
+ * boundary and is seeded directly from input vector 8 with no carry.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 20 bits. The stride of 16 presumes that
+ * SPECIES_512 has sixteen int lanes and that VLEN_512 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask20(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..3.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + VLEN_512);
+
+ // Word 2: input vectors 3..4.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 2 * VLEN_512);
+
+ // Word 3: input vectors 4..6.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 3 * VLEN_512);
+
+ // Word 4: input vectors 6..7.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ word.intoArray(out, outpos + 4 * VLEN_512);
+
+ // Word 5: input vectors 8..9. Value 7 ended exactly on the word
+ // boundary, so the accumulator starts from the next load itself.
+ word = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 5 * VLEN_512);
+
+ // Word 6: input vectors 9..11.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 6 * VLEN_512);
+
+ // Word 7: input vectors 11..12.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 7 * VLEN_512);
+
+ // Word 8: input vectors 12..14.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 8 * VLEN_512);
+
+ // Word 9: input vectors 14..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ word.intoArray(out, outpos + 9 * VLEN_512);
+ }
+
+ /**
+ * Bit-packs 32 input vectors of 21-bit values into 21 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 8 * k]} (k = 0..31) and
+ * output word w is written to {@code out[outpos + w * VLEN_256]}
+ * (w = 0..20). Within each lane the values are laid end to end, 21 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 21 bits. The stride of 8 presumes that
+ * SPECIES_256 has eight int lanes and that VLEN_256 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask21(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 21));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..3.
+ word = lanes.lanewise(VectorOperators.LSHR, 11);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 31));
+ word.intoArray(out, outpos + VLEN_256);
+
+ // Word 2: input vectors 3..4.
+ word = lanes.lanewise(VectorOperators.LSHR, 1);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 2 * VLEN_256);
+
+ // Word 3: input vectors 4..6.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 9));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos + 3 * VLEN_256);
+
+ // Word 4: input vectors 6..7.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 19));
+ word.intoArray(out, outpos + 4 * VLEN_256);
+
+ // Word 5: input vectors 7..9.
+ word = lanes.lanewise(VectorOperators.LSHR, 13);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 29));
+ word.intoArray(out, outpos + 5 * VLEN_256);
+
+ // Word 6: input vectors 9..10.
+ word = lanes.lanewise(VectorOperators.LSHR, 3);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos + 6 * VLEN_256);
+
+ // Word 7: input vectors 10..12.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 7));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 7 * VLEN_256);
+
+ // Word 8: input vectors 12..13.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 17));
+ word.intoArray(out, outpos + 8 * VLEN_256);
+
+ // Word 9: input vectors 13..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 15);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 27));
+ word.intoArray(out, outpos + 9 * VLEN_256);
+
+ // Word 10: input vectors 15..16.
+ word = lanes.lanewise(VectorOperators.LSHR, 5);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 10 * VLEN_256);
+
+ // Word 11: input vectors 16..18.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 5));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 11 * VLEN_256);
+
+ // Word 12: input vectors 18..19.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 15));
+ word.intoArray(out, outpos + 12 * VLEN_256);
+
+ // Word 13: input vectors 19..21.
+ word = lanes.lanewise(VectorOperators.LSHR, 17);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 25));
+ word.intoArray(out, outpos + 13 * VLEN_256);
+
+ // Word 14: input vectors 21..22.
+ word = lanes.lanewise(VectorOperators.LSHR, 7);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ word.intoArray(out, outpos + 14 * VLEN_256);
+
+ // Word 15: input vectors 22..24.
+ word = lanes.lanewise(VectorOperators.LSHR, 18);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 3));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 15 * VLEN_256);
+
+ // Word 16: input vectors 24..25.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 13));
+ word.intoArray(out, outpos + 16 * VLEN_256);
+
+ // Word 17: input vectors 25..27.
+ word = lanes.lanewise(VectorOperators.LSHR, 19);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 23));
+ word.intoArray(out, outpos + 17 * VLEN_256);
+
+ // Word 18: input vectors 27..28.
+ word = lanes.lanewise(VectorOperators.LSHR, 9);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ word.intoArray(out, outpos + 18 * VLEN_256);
+
+ // Word 19: input vectors 28..30.
+ word = lanes.lanewise(VectorOperators.LSHR, 20);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 1));
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos + 19 * VLEN_256);
+
+ // Word 20: input vectors 30..31.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 11));
+ word.intoArray(out, outpos + 20 * VLEN_256);
+ }
+
+ /**
+ * Bit-packs 16 input vectors of 22-bit values into 11 output vectors.
+ *
+ * <p>Input vector k is read from {@code in[inpos + 16 * k]} (k = 0..15)
+ * and output word w is written to {@code out[outpos + w * VLEN_512]}
+ * (w = 0..10). Within each lane the values are laid end to end, 22 bits
+ * apiece, lowest bits first; a value that straddles a 32-bit boundary puts
+ * its low bits at the top of one word and the rest at the bottom of the next.
+ *
+ * <p>No mask is applied, so the output is a faithful packing only when
+ * every input already fits in 22 bits. The stride of 16 presumes that
+ * SPECIES_512 has sixteen int lanes and that VLEN_512 is that lane count
+ * (both are declared elsewhere in this class).
+ *
+ * @param in source array
+ * @param inpos index of the first value to read
+ * @param out destination array
+ * @param outpos index of the first packed word to write
+ */
+ private static void fastpackNoMask22(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ // Word 0: input vectors 0..1.
+ IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+ IntVector lanes = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 22));
+ word.intoArray(out, outpos);
+
+ // Word 1: input vectors 1..2.
+ word = lanes.lanewise(VectorOperators.LSHR, 10);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 12));
+ word.intoArray(out, outpos + VLEN_512);
+
+ // Word 2: input vectors 2..4.
+ word = lanes.lanewise(VectorOperators.LSHR, 20);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 2));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 24));
+ word.intoArray(out, outpos + 2 * VLEN_512);
+
+ // Word 3: input vectors 4..5.
+ word = lanes.lanewise(VectorOperators.LSHR, 8);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 14));
+ word.intoArray(out, outpos + 3 * VLEN_512);
+
+ // Word 4: input vectors 5..7.
+ word = lanes.lanewise(VectorOperators.LSHR, 18);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 4));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 26));
+ word.intoArray(out, outpos + 4 * VLEN_512);
+
+ // Word 5: input vectors 7..8.
+ word = lanes.lanewise(VectorOperators.LSHR, 6);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 16));
+ word.intoArray(out, outpos + 5 * VLEN_512);
+
+ // Word 6: input vectors 8..10.
+ word = lanes.lanewise(VectorOperators.LSHR, 16);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 6));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 28));
+ word.intoArray(out, outpos + 6 * VLEN_512);
+
+ // Word 7: input vectors 10..11.
+ word = lanes.lanewise(VectorOperators.LSHR, 4);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 18));
+ word.intoArray(out, outpos + 7 * VLEN_512);
+
+ // Word 8: input vectors 11..13.
+ word = lanes.lanewise(VectorOperators.LSHR, 14);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 8));
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 30));
+ word.intoArray(out, outpos + 8 * VLEN_512);
+
+ // Word 9: input vectors 13..14.
+ word = lanes.lanewise(VectorOperators.LSHR, 2);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 20));
+ word.intoArray(out, outpos + 9 * VLEN_512);
+
+ // Word 10: input vectors 14..15.
+ word = lanes.lanewise(VectorOperators.LSHR, 12);
+ lanes = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ word = word.or(lanes.lanewise(VectorOperators.LSHL, 10));
+ word.intoArray(out, outpos + 10 * VLEN_512);
+ }
+
+ private static void fastpackNoMask23(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask24(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask25(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask26(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask27(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask28(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.or(oV);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask29(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask30(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastpackNoMask31(final int[] in, int inpos,
+ final int[] out, int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ var oV = iV;
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 1);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 248);
+ oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ }
+
+ private static void fastunpack1(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack2(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ /**
+  * Unpacks three vectors of packed words into 32 vectors of 3-bit values.
+  *
+  * Packed vectors are read from {@code in} at {@code inpos}, {@code inpos + 8}
+  * and {@code inpos + 16}. Fields are extracted per lane, lowest bits first.
+  * A field that straddles two packed words is rebuilt from the high bits of
+  * the current word and the low bits of the next one. The output index
+  * advances by {@code VLEN_256} after each store.
+  *
+  * @param in
+  *            packed source array
+  * @param inpos
+  *            index of the first packed word in {@code in}
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first unpacked value to write in {@code out}
+  */
+ private static void fastunpack3(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Word 0: fields at bit offsets 0, 3, ..., 27.
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+     for (int shift = 0; shift <= 27; shift += 3) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_3)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 2 bits from word 0, 1 bit from word 1.
+     var carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_3);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(1).lanewise(VectorOperators.LSHL, 2).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 1: fields at bit offsets 1, 4, ..., 28.
+     for (int shift = 1; shift <= 28; shift += 3) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_3)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 1 bit from word 1, 2 bits from word 2.
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_3);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(3).lanewise(VectorOperators.LSHL, 1).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 2: fields at bit offsets 2, 5, ..., 29.
+     for (int shift = 2; shift <= 29; shift += 3) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_3)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+ }
+
+ private static void fastunpack4(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ var oV = iV.and(MASK_4);
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xf).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ /**
+  * Unpacks five vectors of packed words into 32 vectors of 5-bit values.
+  *
+  * Packed vectors are read from {@code in} at {@code inpos}, {@code inpos + 8},
+  * ..., {@code inpos + 32}. Fields are extracted per lane, lowest bits first.
+  * A field that straddles two packed words is rebuilt from the high bits of
+  * the current word and the low bits of the next one. The output index
+  * advances by {@code VLEN_256} after each store.
+  *
+  * @param in
+  *            packed source array
+  * @param inpos
+  *            index of the first packed word in {@code in}
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first unpacked value to write in {@code out}
+  */
+ private static void fastunpack5(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Word 0: fields at bit offsets 0, 5, ..., 25.
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+     for (int shift = 0; shift <= 25; shift += 5) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_5)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 2 bits from word 0, 3 bits from word 1.
+     var carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_5);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(7).lanewise(VectorOperators.LSHL, 2).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 1: fields at bit offsets 3, 8, ..., 23.
+     for (int shift = 3; shift <= 23; shift += 5) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_5)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 4 bits from word 1, 1 bit from word 2.
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_5);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(1).lanewise(VectorOperators.LSHL, 4).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 2: fields at bit offsets 1, 6, ..., 26.
+     for (int shift = 1; shift <= 26; shift += 5) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_5)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 1 bit from word 2, 4 bits from word 3.
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_5);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 1).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 3: fields at bit offsets 4, 9, ..., 24.
+     for (int shift = 4; shift <= 24; shift += 5) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_5)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 3 bits from word 3, 2 bits from word 4.
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_5);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(3).lanewise(VectorOperators.LSHL, 3).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 4: fields at bit offsets 2, 7, ..., 27.
+     for (int shift = 2; shift <= 27; shift += 5) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_5)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+ }
+
+ /**
+  * Unpacks three vectors of packed words into 16 vectors of 6-bit values.
+  *
+  * Packed vectors are read from {@code in} at {@code inpos},
+  * {@code inpos + 16} and {@code inpos + 32}. Fields are extracted per lane,
+  * lowest bits first. A field that straddles two packed words is rebuilt from
+  * the high bits of the current word and the low bits of the next one. The
+  * output index advances by {@code VLEN_512} after each store.
+  *
+  * @param in
+  *            packed source array
+  * @param inpos
+  *            index of the first packed word in {@code in}
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first unpacked value to write in {@code out}
+  */
+ private static void fastunpack6(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Word 0: fields at bit offsets 0, 6, ..., 24.
+     var word = IntVector.fromArray(SPECIES_512, in, inpos);
+     for (int shift = 0; shift <= 24; shift += 6) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_6)
+                 .intoArray(out, outpos);
+         outpos += VLEN_512;
+     }
+
+     // Straddling field: 2 bits from word 0, 4 bits from word 1.
+     var carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_6);
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 2).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word 1: fields at bit offsets 4, 10, 16, 22.
+     for (int shift = 4; shift <= 22; shift += 6) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_6)
+                 .intoArray(out, outpos);
+         outpos += VLEN_512;
+     }
+
+     // Straddling field: 4 bits from word 1, 2 bits from word 2.
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_6);
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     word.and(3).lanewise(VectorOperators.LSHL, 4).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word 2: fields at bit offsets 2, 8, ..., 26.
+     for (int shift = 2; shift <= 26; shift += 6) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_6)
+                 .intoArray(out, outpos);
+         outpos += VLEN_512;
+     }
+ }
+
+ /**
+  * Unpacks seven vectors of packed words into 32 vectors of 7-bit values.
+  *
+  * Packed vectors are read from {@code in} at {@code inpos}, {@code inpos + 8},
+  * ..., {@code inpos + 48}. Fields are extracted per lane, lowest bits first.
+  * A field that straddles two packed words is rebuilt from the high bits of
+  * the current word and the low bits of the next one. The output index
+  * advances by {@code VLEN_256} after each store.
+  *
+  * @param in
+  *            packed source array
+  * @param inpos
+  *            index of the first packed word in {@code in}
+  * @param out
+  *            destination array
+  * @param outpos
+  *            index of the first unpacked value to write in {@code out}
+  */
+ private static void fastunpack7(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     // Word 0: fields at bit offsets 0, 7, 14, 21.
+     var word = IntVector.fromArray(SPECIES_256, in, inpos);
+     for (int shift = 0; shift <= 21; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 4 bits from word 0, 3 bits from word 1.
+     var carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(7).lanewise(VectorOperators.LSHL, 4).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 1: fields at bit offsets 3, 10, 17, 24.
+     for (int shift = 3; shift <= 24; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 1 bit from word 1, 6 bits from word 2.
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 1).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 2: fields at bit offsets 6, 13, 20.
+     for (int shift = 6; shift <= 20; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 5 bits from word 2, 2 bits from word 3.
+     carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(3).lanewise(VectorOperators.LSHL, 5).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 3: fields at bit offsets 2, 9, 16, 23.
+     for (int shift = 2; shift <= 23; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 2 bits from word 3, 5 bits from word 4.
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(0x1f).lanewise(VectorOperators.LSHL, 2).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 4: fields at bit offsets 5, 12, 19.
+     for (int shift = 5; shift <= 19; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 6 bits from word 4, 1 bit from word 5.
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     word.and(1).lanewise(VectorOperators.LSHL, 6).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 5: fields at bit offsets 1, 8, 15, 22.
+     for (int shift = 1; shift <= 22; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+
+     // Straddling field: 3 bits from word 5, 4 bits from word 6.
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_7);
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 3).or(carry)
+             .intoArray(out, outpos);
+     outpos += VLEN_256;
+
+     // Word 6: fields at bit offsets 4, 11, 18, 25.
+     for (int shift = 4; shift <= 25; shift += 7) {
+         word.lanewise(VectorOperators.LSHR, shift).and(MASK_7)
+                 .intoArray(out, outpos);
+         outpos += VLEN_256;
+     }
+ }
+
+ /**
+  * Unpacks 8-bit fields from four consecutive SPECIES_512 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 16}, {@code inpos + 32}
+  * and {@code inpos + 48}) into sixteen vectors stored to {@code out}, starting
+  * at {@code outpos} and advancing by VLEN_512 after every store.
+  *
+  * Each lane is unpacked independently: the four bytes of a 32-bit lane become
+  * the same lane of four successive output vectors, lowest byte first. Because
+  * 8 divides 32, no field ever straddles two input vectors, so every output is
+  * a plain shift-and-mask of the current input vector.
+  *
+  * NOTE(review): assumes MASK_8 selects the low 8 bits and that VLEN_512 is the
+  * lane count of SPECIES_512 (the input stride of 16 suggests 16 lanes); both
+  * are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack8(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     iV.and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word boundary: nothing carries over from the previous vector, so the
+     // first field of the next vector is stored directly.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     iV.and(0xff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     iV.and(0xff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     iV.and(0xff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ /**
+  * Unpacks 9-bit fields from nine consecutive SPECIES_256 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 64}) into 32 vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_256 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 9 bits found at bit offset 9*k of that lane's stream. When a
+  * field straddles two input vectors, the high bits left in the old vector are
+  * OR-ed with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_9 selects the low 9 bits and that VLEN_256 is the
+  * lane count of SPECIES_256 (the input stride of 8 suggests 8 lanes); both are
+  * declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack9(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+     word.and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_9);
+
+     // input vector at inpos + 8
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_9);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_9);
+
+     // input vector at inpos + 24
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(7).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_9);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(0x7f).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_9);
+
+     // input vector at inpos + 40
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     word.and(3).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_9);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_9);
+
+     // input vector at inpos + 56
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     word.and(1).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_9);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     word.and(0x1f).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, dst);
+     dst += VLEN_256;
+ }
+
+ /**
+  * Unpacks 10-bit fields from five consecutive SPECIES_512 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 16}, ...,
+  * {@code inpos + 64}) into sixteen vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_512 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 10 bits found at bit offset 10*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_10 selects the low 10 bits and that VLEN_512 is
+  * the lane count of SPECIES_512 (the input stride of 16 suggests 16 lanes);
+  * both are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack10(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+     word.and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_10);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_10);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_10);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_10);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     word.and(3).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, dst);
+     dst += VLEN_512;
+ }
+
+ /**
+  * Unpacks 11-bit fields from eleven consecutive SPECIES_256 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 80}) into 32 vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_256 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 11 bits found at bit offset 11*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_11 selects the low 11 bits and that VLEN_256 is
+  * the lane count of SPECIES_256 (the input stride of 8 suggests 8 lanes); both
+  * are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack11(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+     word.and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_11);
+
+     // input vector at inpos + 8
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(1).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_11);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(3).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_11);
+
+     // input vector at inpos + 24
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(7).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_11);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_11);
+
+     // input vector at inpos + 40
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     word.and(0x1f).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_11);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_11);
+
+     // input vector at inpos + 56
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     word.and(0x7f).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_11);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_11);
+
+     // input vector at inpos + 72
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     word.and(0x1ff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_11);
+
+     // input vector at inpos + 80
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     word.and(0x3ff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, dst);
+     dst += VLEN_256;
+ }
+
+ /**
+  * Unpacks 12-bit fields from six consecutive SPECIES_512 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 16}, ...,
+  * {@code inpos + 80}) into sixteen vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_512 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 12 bits found at bit offset 12*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  * After three input vectors (96 bits per lane) the stream is word-aligned
+  * again, so the fourth vector starts a field with nothing carried over.
+  *
+  * NOTE(review): assumes MASK_12 selects the low 12 bits and that VLEN_512 is
+  * the lane count of SPECIES_512 (the input stride of 16 suggests 16 lanes);
+  * both are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack12(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     iV.and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word-aligned boundary: no bits carry over from the previous vector, so
+     // the first field of this vector is stored directly.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     iV.and(0xfff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12);
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+     oV.intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ /**
+  * Unpacks 13-bit fields from thirteen consecutive SPECIES_256 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 96}) into 32 vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_256 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 13 bits found at bit offset 13*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_13 selects the low 13 bits and that VLEN_256 is
+  * the lane count of SPECIES_256 (the input stride of 8 suggests 8 lanes); both
+  * are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack13(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+     word.and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_13);
+
+     // input vector at inpos + 8
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(0x7f).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_13);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(1).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_13);
+
+     // input vector at inpos + 24
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_13);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(3).lanewise(VectorOperators.LSHL, 11).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_13);
+
+     // input vector at inpos + 40
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     word.and(0x1ff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_13);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     word.and(7).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_13);
+
+     // input vector at inpos + 56
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     word.and(0x3ff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_13);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_13);
+
+     // input vector at inpos + 72
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     word.and(0x7ff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_13);
+
+     // input vector at inpos + 80
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     word.and(0x1f).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_13);
+
+     // input vector at inpos + 88
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     word.and(0xfff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_13);
+
+     // input vector at inpos + 96
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, dst);
+     dst += VLEN_256;
+ }
+
+ /**
+  * Unpacks 14-bit fields from seven consecutive SPECIES_512 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 16}, ...,
+  * {@code inpos + 96}) into sixteen vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_512 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 14 bits found at bit offset 14*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_14 selects the low 14 bits and that VLEN_512 is
+  * the lane count of SPECIES_512 (the input stride of 16 suggests 16 lanes);
+  * both are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack14(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_512, in, inpos);
+     word.and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_14);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     word.and(0x3ff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_14);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_14);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     word.and(3).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_14);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     word.and(0xfff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_14);
+
+     // input vector at inpos + 80
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_14);
+
+     // input vector at inpos + 96
+     word = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+     word.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, dst);
+     dst += VLEN_512;
+ }
+
+ /**
+  * Unpacks 15-bit fields from fifteen consecutive SPECIES_256 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 8}, ...,
+  * {@code inpos + 112}) into 32 vectors stored to {@code out}, starting at
+  * {@code outpos} and advancing by VLEN_256 after every store.
+  *
+  * Each lane is an independent bit stream: output vector k receives, lane by
+  * lane, the 15 bits found at bit offset 15*k of that lane's stream. A field
+  * that straddles two input vectors is rebuilt by OR-ing the high bits left in
+  * the old vector with the low bits of the next vector shifted into place.
+  *
+  * NOTE(review): assumes MASK_15 selects the low 15 bits and that VLEN_256 is
+  * the lane count of SPECIES_256 (the input stride of 8 suggests 8 lanes); both
+  * are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack15(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     int dst = outpos;
+
+     // input vector at inpos
+     IntVector word = IntVector.fromArray(SPECIES_256, in, inpos);
+     word.and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     IntVector carry = word.lanewise(VectorOperators.LSHR, 30).and(MASK_15);
+
+     // input vector at inpos + 8
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+     word.and(0x1fff).lanewise(VectorOperators.LSHL, 2).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 28).and(MASK_15);
+
+     // input vector at inpos + 16
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+     word.and(0x7ff).lanewise(VectorOperators.LSHL, 4).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 26).and(MASK_15);
+
+     // input vector at inpos + 24
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+     word.and(0x1ff).lanewise(VectorOperators.LSHL, 6).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 24).and(MASK_15);
+
+     // input vector at inpos + 32
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+     word.and(0x7f).lanewise(VectorOperators.LSHL, 8).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 22).and(MASK_15);
+
+     // input vector at inpos + 40
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+     word.and(0x1f).lanewise(VectorOperators.LSHL, 10).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 20).and(MASK_15);
+
+     // input vector at inpos + 48
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+     word.and(7).lanewise(VectorOperators.LSHL, 12).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 18).and(MASK_15);
+
+     // input vector at inpos + 56
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+     word.and(1).lanewise(VectorOperators.LSHL, 14).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 31).and(MASK_15);
+
+     // input vector at inpos + 64
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+     word.and(0x3fff).lanewise(VectorOperators.LSHL, 1).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 29).and(MASK_15);
+
+     // input vector at inpos + 72
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+     word.and(0xfff).lanewise(VectorOperators.LSHL, 3).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 27).and(MASK_15);
+
+     // input vector at inpos + 80
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+     word.and(0x3ff).lanewise(VectorOperators.LSHL, 5).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 25).and(MASK_15);
+
+     // input vector at inpos + 88
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+     word.and(0xff).lanewise(VectorOperators.LSHL, 7).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 23).and(MASK_15);
+
+     // input vector at inpos + 96
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+     word.and(0x3f).lanewise(VectorOperators.LSHL, 9).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 21).and(MASK_15);
+
+     // input vector at inpos + 104
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+     word.and(0xf).lanewise(VectorOperators.LSHL, 11).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     carry = word.lanewise(VectorOperators.LSHR, 19).and(MASK_15);
+
+     // input vector at inpos + 112
+     word = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+     word.and(3).lanewise(VectorOperators.LSHL, 13).or(carry).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+     word.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, dst);
+     dst += VLEN_256;
+ }
+
+ /**
+  * Unpacks 16-bit fields from eight consecutive SPECIES_512 vectors of
+  * {@code in} (loaded at {@code inpos}, {@code inpos + 16}, ...,
+  * {@code inpos + 112}) into sixteen vectors stored to {@code out}, starting
+  * at {@code outpos} and advancing by VLEN_512 after every store.
+  *
+  * Each lane is unpacked independently: the low and high halves of a 32-bit
+  * lane become the same lane of two successive output vectors, low half
+  * first. Because 16 divides 32, no field ever straddles two input vectors,
+  * so every output is a plain shift-and-mask of the current input vector.
+  *
+  * NOTE(review): assumes MASK_16 selects the low 16 bits and that VLEN_512 is
+  * the lane count of SPECIES_512 (the input stride of 16 suggests 16 lanes);
+  * both are declared outside this block - confirm.
+  *
+  * @param in     packed input array
+  * @param inpos  index in {@code in} of the first packed int to read
+  * @param out    destination array for the unpacked values
+  * @param outpos index in {@code out} of the first value to write
+  */
+ private static void fastunpack16(final int[] in, int inpos, final int[] out,
+         int outpos) {
+     var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+     iV.and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     // Word boundary: nothing carries over from the previous vector, so the
+     // first field of each following vector is stored directly.
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+     iV.and(0xffff).intoArray(out, outpos);
+     outpos += VLEN_512;
+
+     iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos);
+     outpos += VLEN_512;
+ }
+
+ private static void fastunpack17(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 17-bit fields: 17 SPECIES_256 loads from in (inpos, inpos + 8, ..., inpos + 128) and 32 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_17).intoArray(out, outpos); // field in bits 0..16 of each lane (assumes MASK_17 keeps the low 17 bits - TODO confirm)
+ outpos += VLEN_256; // step past the vector just stored (VLEN_256 presumably 8, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 17..31 of the current word become its low 15 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_17);
+ // Straddling field, part 2: the low 2 bits of the next word are shifted up to bit 15 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+ // The same three steps repeat below: whole field from the current word, then a field split across the current and next word.
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+ // No whole field lies inside this word: its low 16 bits finished the field above and bits 16..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_17);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos); // last field: bits 15..31 of the final word
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack18(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 18-bit fields: 9 SPECIES_512 loads from in (inpos, inpos + 16, ..., inpos + 128) and 16 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_18).intoArray(out, outpos); // field in bits 0..17 of each lane (assumes MASK_18 keeps the low 18 bits - TODO confirm)
+ outpos += VLEN_512; // step past the vector just stored (VLEN_512 presumably 16, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 18..31 of the current word become its low 14 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18);
+ // Straddling field, part 2: the low 4 bits of the next word are shifted up to bit 14 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // No whole field lies inside this word: its low 16 bits finished the field above and bits 16..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); // last field: bits 14..31 of the final word
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack19(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 19-bit fields: 19 SPECIES_256 loads from in (inpos, inpos + 8, ..., inpos + 144) and 32 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_19).intoArray(out, outpos); // field in bits 0..18 of each lane (assumes MASK_19 keeps the low 19 bits - TODO confirm)
+ outpos += VLEN_256; // step past the vector just stored (VLEN_256 presumably 8, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 19..31 of the current word become its low 13 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19);
+ // Straddling field, part 2: the low 6 bits of the next word are shifted up to bit 13 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+ // No whole field lies inside this word: its low 18 bits finished the field above and bits 18..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos); // last field: bits 13..31 of the final word
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack20(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 20-bit fields: 10 SPECIES_512 loads from in (inpos, inpos + 16, ..., inpos + 144) and 16 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_20).intoArray(out, outpos); // field in bits 0..19 of each lane (assumes MASK_20 keeps the low 20 bits - TODO confirm)
+ outpos += VLEN_512; // step past the vector just stored (VLEN_512 presumably 16, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 20..31 of the current word become its low 12 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+ // Straddling field, part 2: the low 8 bits of the next word are shifted up to bit 12 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // No whole field lies inside this word: its low 16 bits finished the field above and bits 16..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Five words hold exactly eight 20-bit fields, so nothing carries over: the next field is just the low 20 bits of the next word.
+ oV = oV.zero(SPECIES_512); // IntVector.zero is a static factory reached through the instance; it resets the accumulator to all zeros
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xfffff).or(oV); // oV is zero here, so the OR leaves the masked word unchanged
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // From here the shift/mask sequence of the first five words repeats for the second five.
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); // last field: bits 12..31 of the final word
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack21(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 21-bit fields: 21 SPECIES_256 loads from in (inpos, inpos + 8, ..., inpos + 160) and 32 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_21).intoArray(out, outpos); // field in bits 0..20 of each lane (assumes MASK_21 keeps the low 21 bits - TODO confirm)
+ outpos += VLEN_256; // step past the vector just stored (VLEN_256 presumably 8, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 21..31 of the current word become its low 11 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21);
+ // Straddling field, part 2: the low 10 bits of the next word are shifted up to bit 11 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+ // No whole field lies inside this word: its low 20 bits finished the field above and bits 20..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos); // last field: bits 11..31 of the final word
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack22(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 22-bit fields: 11 SPECIES_512 loads from in (inpos, inpos + 16, ..., inpos + 160) and 16 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_22).intoArray(out, outpos); // field in bits 0..21 of each lane (assumes MASK_22 keeps the low 22 bits - TODO confirm)
+ outpos += VLEN_512; // step past the vector just stored (VLEN_512 presumably 16, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 22..31 of the current word become its low 10 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22);
+ // Straddling field, part 2: the low 12 bits of the next word are shifted up to bit 10 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // No whole field lies inside this word: its low 12 bits finished the field above and bits 12..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); // last field: bits 10..31 of the final word
+ outpos += VLEN_512;
+ }
+
+ private static void fastunpack23(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 23-bit fields: 23 SPECIES_256 loads from in (inpos, inpos + 8, ..., inpos + 176) and 32 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_23).intoArray(out, outpos); // field in bits 0..22 of each lane (assumes MASK_23 keeps the low 23 bits - TODO confirm)
+ outpos += VLEN_256; // step past the vector just stored (VLEN_256 presumably 8, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 23..31 of the current word become its low 9 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23);
+ // Straddling field, part 2: the low 14 bits of the next word are shifted up to bit 9 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+ // No whole field lies inside this word: its low 14 bits finished the field above and bits 14..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos); // last field: bits 9..31 of the final word
+ outpos += VLEN_256;
+ }
+
+ private static void fastunpack24(final int[] in, int inpos, final int[] out,
+ int outpos) { // Unpacks 24-bit fields: 12 SPECIES_512 loads from in (inpos, inpos + 16, ..., inpos + 176) and 16 vector stores into out starting at outpos.
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_24).intoArray(out, outpos); // field in bits 0..23 of each lane (assumes MASK_24 keeps the low 24 bits - TODO confirm)
+ outpos += VLEN_512; // step past the vector just stored (VLEN_512 presumably 16, matching the load stride - TODO confirm)
+ // Straddling field, part 1: bits 24..31 of the current word become its low 8 bits.
+ var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+ // Straddling field, part 2: the low 16 bits of the next word are shifted up to bit 8 and OR-ed in.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // No whole field lies inside this word: its low 16 bits finished the field above and bits 16..31 start the next one.
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+ outpos += VLEN_512;
+ // Three words hold exactly four 24-bit fields, so nothing carries over: the next field is just the low 24 bits of the next word.
+ oV = oV.zero(SPECIES_512); // IntVector.zero is a static factory reached through the instance; it resets the accumulator to all zeros
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xffffff).or(oV); // oV is zero here, so the OR leaves the masked word unchanged
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+ // The same three-word pattern repeats three more times below (words at inpos + 48, + 96 and + 144).
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0xffffff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = oV.zero(SPECIES_512);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xffffff).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); // last field: bits 8..31 of the final word
+ outpos += VLEN_512;
+ }
+
+ /**
+ * Unpacks 25-bit values with 256-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 25 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 192}, stride 8) and stores 32 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_256} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs.
+ *
+ * <p>NOTE(review): the stride of 8 presumably equals the int lane count of
+ * {@code SPECIES_256} and the value of {@code VLEN_256}, and {@code MASK_25}
+ * is presumably the low-25-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack25(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Straddling value: the top 7 bits of the current word become the low bits,
+ // then the low 18 bits of the next word are shifted left by 7 and OR-ed in.
+ // The same carry pattern, with different bit counts, repeats below.
+ var oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // This value is taken from the current word alone (starting at bit 4); no
+ // second word is involved, so it is stored without going through oV.
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+ * Unpacks 26-bit values with 512-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 13 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 192}, stride 16) and stores 16 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_512} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs.
+ *
+ * <p>NOTE(review): the stride of 16 presumably equals the int lane count of
+ * {@code SPECIES_512} and the value of {@code VLEN_512}, and {@code MASK_26}
+ * is presumably the low-26-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack26(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_26).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Straddling value: the top 6 bits of the current word become the low bits,
+ // then the low 20 bits of the next word are shifted left by 6 and OR-ed in.
+ // The same carry pattern, with different bit counts, repeats below.
+ var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // This value is taken from the current word alone (starting at bit 2); no
+ // second word is involved, so it is stored without going through oV.
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ /**
+ * Unpacks 27-bit values with 256-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 27 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 208}, stride 8) and stores 32 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_256} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs.
+ *
+ * <p>NOTE(review): the stride of 8 presumably equals the int lane count of
+ * {@code SPECIES_256} and the value of {@code VLEN_256}, and {@code MASK_27}
+ * is presumably the low-27-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack27(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Straddling value: the top 5 bits of the current word become the low bits,
+ // then the low 22 bits of the next word are shifted left by 5 and OR-ed in.
+ // The same carry pattern, with different bit counts, repeats below.
+ var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // This value is taken from the current word alone (starting at bit 2); no
+ // second word is involved, so it is stored without going through oV.
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+ * Unpacks 28-bit values with 512-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 14 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 208}, stride 16) and stores 16 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_512} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs. The pattern repeats every seven input words (eight values).
+ *
+ * <p>NOTE(review): the stride of 16 presumably equals the int lane count of
+ * {@code SPECIES_512} and the value of {@code VLEN_512}, and {@code MASK_28}
+ * is presumably the low-28-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack28(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_28).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Straddling value: the top 4 bits of the current word become the low bits,
+ // then the low 24 bits of the next word are shifted left by 4 and OR-ed in.
+ // The same carry pattern, with different bit counts, repeats below.
+ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // The remaining 28 bits of this word form a complete value.
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Word-aligned value: nothing is carried over from the previous word, so the
+ // low 28 bits of the freshly loaded word are stored directly.
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ iV.and(0xfffffff).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ /**
+ * Unpacks 29-bit values with 256-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 29 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 224}, stride 8) and stores 32 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_256} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs.
+ *
+ * <p>NOTE(review): the stride of 8 presumably equals the int lane count of
+ * {@code SPECIES_256} and the value of {@code VLEN_256}, and {@code MASK_29}
+ * is presumably the low-29-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack29(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_29).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // Straddling value: the top 3 bits of the current word become the low bits,
+ // then the low 26 bits of the next word are shifted left by 3 and OR-ed in.
+ // The same carry pattern, with different bit counts, repeats below.
+ var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // This value is taken from the current word alone (starting at bit 2); no
+ // second word is involved, so it is stored without going through oV.
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+
+ /**
+ * Unpacks 30-bit values with 512-bit vectors, one independent bit stream per
+ * lane.
+ *
+ * <p>Loads 15 input vectors from {@code in} (offsets {@code inpos} to
+ * {@code inpos + 224}, stride 16) and stores 16 output vectors to {@code out},
+ * moving the write position forward by {@code VLEN_512} after each store. All
+ * operations are lane-wise, so lane i of every output is built only from lane i
+ * of the inputs.
+ *
+ * <p>NOTE(review): the stride of 16 presumably equals the int lane count of
+ * {@code SPECIES_512} and the value of {@code VLEN_512}, and {@code MASK_30}
+ * is presumably the low-30-bit mask; all three are defined outside this
+ * method - confirm there.
+ *
+ * @param in
+ * packed input words
+ * @param inpos
+ * index in {@code in} of the first word to read
+ * @param out
+ * destination for the unpacked values
+ * @param outpos
+ * index in {@code out} of the first value to write
+ */
+ private static void fastunpack30(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ var iV = IntVector.fromArray(SPECIES_512, in, inpos);
+ iV.and(MASK_30).intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // Straddling value: the top 2 bits of the current word become the low bits,
+ // then the low 28 bits of the next word are shifted left by 2 and OR-ed in.
+ // Each following step carries 2 more bits over from the current word.
+ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 16);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 32);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 48);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 64);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 80);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 96);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 112);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 128);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 144);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 160);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 176);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 192);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 208);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30);
+
+ iV = IntVector.fromArray(SPECIES_512, in, inpos + 224);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_512;
+
+ // The last value is taken from the final word alone (starting at bit 2); no
+ // further word is loaded.
+ iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos);
+ outpos += VLEN_512;
+ }
+
+ /**
+ * Unpacks 31-bit values with fully unrolled 256-bit vector operations.
+ *
+ * <p>Reads 31 vectors from {@code in} at offsets {@code inpos},
+ * {@code inpos + 8}, ..., {@code inpos + 240} and performs 32 vector stores
+ * into {@code out}, advancing {@code outpos} by {@code VLEN_256} after each
+ * store. Every lane is treated as an independent bit stream: the k-th loaded
+ * word contributes its low {@code 31 - k} bits (shifted left by {@code k})
+ * to the value whose low bits were carried over from the previous word, and
+ * its remaining high bits become the carry for the next value.
+ *
+ * <p>NOTE(review): {@code MASK_31} is defined outside this method;
+ * presumably it keeps the low 31 bits (0x7fffffff) - confirm.
+ *
+ * @param in packed input words
+ * @param inpos index of the first packed word to read
+ * @param out destination for the unpacked values
+ * @param outpos index of the first unpacked value to write
+ */
+ private static void fastunpack31(final int[] in, int inpos, final int[] out,
+ int outpos) {
+ // First word: its low bits are a complete value.
+ var iV = IntVector.fromArray(SPECIES_256, in, inpos);
+ iV.and(MASK_31).intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // The leftover top bit of the first word starts the next value.
+ var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31);
+
+ // Steps 1..30 repeat the same pattern: load the next word, merge its low
+ // bits above the carried bits, store, then carry the remaining high bits.
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8);
+ oV = iV.and(0x3fffffff).lanewise(VectorOperators.LSHL, 1).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 16);
+ oV = iV.and(0x1fffffff).lanewise(VectorOperators.LSHL, 2).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 24);
+ oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 3).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32);
+ oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 4).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 40);
+ oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 5).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 48);
+ oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 6).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 56);
+ oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 7).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64);
+ oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 8).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 72);
+ oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 9).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 80);
+ oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 10).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 88);
+ oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 11).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 96);
+ oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 12).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 104);
+ oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 13).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 112);
+ oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 14).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 120);
+ oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 15).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128);
+ oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 16).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 136);
+ oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 17).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 144);
+ oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 18).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152);
+ oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 19).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 160);
+ oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 20).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 168);
+ oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 21).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176);
+ oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 22).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 184);
+ oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 23).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 192);
+ oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 24).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 200);
+ oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 25).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 208);
+ oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 26).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 216);
+ oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 27).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 224);
+ oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 232);
+ oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31);
+
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 240);
+ oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV);
+
+ oV.intoArray(out, outpos);
+ outpos += VLEN_256;
+
+ // The upper 31 bits of the last word form the final value.
+ iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos);
+ outpos += VLEN_256;
+ }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
new file mode 100644
index 0000000..62a8cc7
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerTerse.java
@@ -0,0 +1,963 @@
+// Copyright (C) 2022 Intel Corporation
+
+// SPDX-License-Identifier: Apache-2.0
+
+package me.lemire.integercompression.vector;
+
+import java.util.Arrays;
+import jdk.incubator.vector.*;
+
+/**
+ * This is a readable but less efficient version of the VectorBitPacker class.
+ *
+ */
+public class VectorBitPackerTerse {
+ /** 512-bit int species; used by the even-bit-width kernels in this class. */
+ static final VectorSpecies<Integer> SPECIES_512 = IntVector.SPECIES_512;
+ /** 256-bit int species; used by the odd-bit-width kernels in this class. */
+ static final VectorSpecies<Integer> SPECIES_256 = IntVector.SPECIES_256;
+ /** Number of 32-bit int lanes in a 512-bit vector. */
+ static final int VLEN_512 = 16;
+ /** Number of 32-bit int lanes in a 256-bit vector. */
+ static final int VLEN_256 = 8;
+ /**
+ * NOTE(review): presumably the number of integers handled by one
+ * fastpack/fastunpack call (those methods use the literal 256) - confirm.
+ */
+ static final int BLOCK_SIZE = 256;
+
+ /**
+  * Packs values of an odd bit width {@code b} using 256-bit vectors, keeping
+  * only the low {@code b} bits of every input value.
+  *
+  * <p>Each lane is packed independently. The first output vector receives
+  * {@code 31 / b + 1} input vectors; output vector {@code w + 1} then starts
+  * with the bits of the last loaded input vector that did not fit (that vector
+  * shifted right by {@code ho[w]}) followed by {@code lc[w]} further input
+  * vectors. {@code b - 1} such follow-up vectors are written.
+  *
+  * @param in values to pack
+  * @param inpos index of the first value to read
+  * @param out destination for the packed words
+  * @param outpos index of the first packed word to write
+  * @param b bit width of each value
+  * @param ho per output vector: bits of the straddling value already emitted
+  * @param lc per output vector: number of additional input vectors to merge
+  */
+ private static void fastpackOddBit(final int[] in, int inpos, final int[] out,
+     int outpos, int b, final int[] ho,
+     final int[] lc) {
+   final int lowBits = (1 << b) - 1;
+   final int firstWordExtra = 31 / b;
+   int readAt = inpos;
+   int writeAt = outpos;
+
+   // First output vector: values start at bit 0, one every b bits.
+   IntVector loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+   IntVector word = loaded.and(lowBits);
+   for (int k = 1; k <= firstWordExtra; k++) {
+     readAt += VLEN_256;
+     loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+     word = loaded.and(lowBits).lanewise(VectorOperators.LSHL, b * k).or(word);
+   }
+   word.intoArray(out, writeAt);
+   writeAt += VLEN_256;
+
+   final int followUps = b - 1;
+   for (int w = 0; w < followUps; w++) {
+     // Carry the high bits of the value that straddled the previous boundary.
+     word = loaded.and(lowBits).lanewise(VectorOperators.LSHR, ho[w]);
+     for (int k = 0; k < lc[w]; k++) {
+       readAt += VLEN_256;
+       loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+       final int shift = b * k + (b - ho[w]);
+       word = loaded.and(lowBits).lanewise(VectorOperators.LSHL, shift).or(word);
+     }
+     word.intoArray(out, writeAt);
+     writeAt += VLEN_256;
+   }
+ }
+
+ /**
+  * Same packing scheme as the masked odd-width kernel, but input values are
+  * used as-is: no {@code (1 << b) - 1} mask is applied before shifting, so
+  * any bits above the low {@code b} of an input end up in the packed words.
+  *
+  * @param in values to pack
+  * @param inpos index of the first value to read
+  * @param out destination for the packed words
+  * @param outpos index of the first packed word to write
+  * @param b bit width of each value
+  * @param ho per output vector: right shift applied to the last loaded input
+  *     vector to carry its remaining bits
+  * @param lc per output vector: number of additional input vectors to merge
+  */
+ private static void fastpackOddBitNoMask(final int[] in, int inpos,
+     final int[] out, int outpos, int b,
+     final int[] ho, final int[] lc) {
+   final int firstWordExtra = 31 / b;
+   int readAt = inpos;
+   int writeAt = outpos;
+
+   IntVector loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+   IntVector word = loaded;
+   for (int k = 1; k <= firstWordExtra; k++) {
+     readAt += VLEN_256;
+     loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+     word = loaded.lanewise(VectorOperators.LSHL, b * k).or(word);
+   }
+   word.intoArray(out, writeAt);
+   writeAt += VLEN_256;
+
+   final int followUps = b - 1;
+   for (int w = 0; w < followUps; w++) {
+     // Start from the bits of the previous value that did not fit.
+     word = loaded.lanewise(VectorOperators.LSHR, ho[w]);
+     for (int k = 0; k < lc[w]; k++) {
+       readAt += VLEN_256;
+       loaded = IntVector.fromArray(SPECIES_256, in, readAt);
+       final int shift = b * k + (b - ho[w]);
+       word = loaded.lanewise(VectorOperators.LSHL, shift).or(word);
+     }
+     word.intoArray(out, writeAt);
+     writeAt += VLEN_256;
+   }
+ }
+
+ /**
+  * Unpacks values of an odd bit width {@code b} using 256-bit vectors.
+  *
+  * <p>The first packed vector yields {@code 32 / b} complete values. For each
+  * of the following {@code b - 1} packed vectors, the value straddling the
+  * boundary is rebuilt from the carried bits plus the low bits selected by
+  * {@code masks[w]}, and then {@code lc[w]} complete values are extracted
+  * starting at bit {@code lo[w]}.
+  *
+  * @param in packed input words
+  * @param inpos index of the first packed word to read
+  * @param out destination for the unpacked values
+  * @param outpos index of the first unpacked value to write
+  * @param b bit width of each value
+  * @param lo per packed vector: bit offset of its first complete value
+  * @param masks per packed vector: mask selecting the bits that finish the
+  *     straddling value
+  * @param lc per packed vector: number of complete values it contains
+  */
+ private static void fastUnpackOddBit(final int[] in, int inpos,
+     final int[] out, int outpos, int b,
+     final int[] lo, int[] masks, int[] lc) {
+   final int lowBits = (1 << b) - 1;
+   final int wholeInFirst = 32 / b;
+   int writeAt = outpos;
+
+   IntVector packed = IntVector.fromArray(SPECIES_256, in, inpos);
+   int k = 0;
+   while (k < wholeInFirst) {
+     packed.lanewise(VectorOperators.LSHR, b * k)
+         .and(lowBits)
+         .intoArray(out, writeAt);
+     writeAt += VLEN_256;
+     k++;
+   }
+   // Low part of the value that continues in the next packed vector.
+   IntVector partial = packed.lanewise(VectorOperators.LSHR, b * k).and(lowBits);
+
+   final int followUps = b - 1;
+   for (int w = 0; w < followUps; w++) {
+     packed = IntVector.fromArray(SPECIES_256, in, inpos + (w + 1) * VLEN_256);
+     partial = packed.and(masks[w])
+         .lanewise(VectorOperators.LSHL, b - lo[w])
+         .or(partial);
+     partial.intoArray(out, writeAt);
+     writeAt += VLEN_256;
+
+     int j = 0;
+     while (j < lc[w]) {
+       packed.lanewise(VectorOperators.LSHR, b * j + lo[w])
+           .and(lowBits)
+           .intoArray(out, writeAt);
+       writeAt += VLEN_256;
+       j++;
+     }
+     partial = packed.lanewise(VectorOperators.LSHR, b * j + lo[w]).and(lowBits);
+   }
+ }
+
+ /**
+  * Packs values of an even bit width {@code b} using 512-bit vectors, keeping
+  * only the low {@code b} bits of every input value.
+  *
+  * <p>The first output vector receives {@code 32 / b} input vectors when
+  * {@code b} divides 32 and {@code 32 / b + 1} otherwise. Each of the
+  * following {@code b / 2 - 1} output vectors starts either from zero (when
+  * {@code ho[w] == b}, i.e. the previous value ended exactly on the word
+  * boundary) or from the last loaded vector shifted right by {@code ho[w]},
+  * and then merges {@code lc[w]} further input vectors.
+  *
+  * @param in values to pack
+  * @param inpos index of the first value to read
+  * @param out destination for the packed words
+  * @param outpos index of the first packed word to write
+  * @param b bit width of each value
+  * @param ho per output vector: bits of the straddling value already emitted
+  * @param lc per output vector: number of additional input vectors to merge
+  */
+ private static void fastpackEvenBit(final int[] in, int inpos,
+     final int[] out, int outpos, int b,
+     final int[] ho, final int[] lc) {
+   final int lowBits = (1 << b) - 1;
+   final int firstWordExtra = (32 % b != 0) ? 32 / b : (32 / b) - 1;
+   int readAt = inpos;
+   int writeAt = outpos;
+
+   IntVector loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+   IntVector word = loaded.and(lowBits);
+   for (int k = 1; k <= firstWordExtra; k++) {
+     readAt += VLEN_512;
+     loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+     word = loaded.and(lowBits).lanewise(VectorOperators.LSHL, b * k).or(word);
+   }
+   word.intoArray(out, writeAt);
+   writeAt += VLEN_512;
+
+   final int followUps = (b >>> 1) - 1;
+   for (int w = 0; w < followUps; w++) {
+     if (ho[w] == b) {
+       // Previous value ended on the boundary: nothing to carry.
+       word = IntVector.zero(SPECIES_512);
+     } else {
+       word = loaded.and(lowBits).lanewise(VectorOperators.LSHR, ho[w]);
+     }
+     for (int k = 0; k < lc[w]; k++) {
+       readAt += VLEN_512;
+       loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+       final int shift = b * k + (b - ho[w]);
+       word = loaded.and(lowBits).lanewise(VectorOperators.LSHL, shift).or(word);
+     }
+     word.intoArray(out, writeAt);
+     writeAt += VLEN_512;
+   }
+ }
+
+ /**
+  * Same packing scheme as the masked even-width kernel, but input values are
+  * used as-is: no {@code (1 << b) - 1} mask is applied before shifting.
+  *
+  * @param in values to pack
+  * @param inpos index of the first value to read
+  * @param out destination for the packed words
+  * @param outpos index of the first packed word to write
+  * @param b bit width of each value
+  * @param ho per output vector: right shift applied to the last loaded input
+  *     vector to carry its remaining bits; a value equal to {@code b} means
+  *     the output vector starts from zero
+  * @param lc per output vector: number of additional input vectors to merge
+  */
+ private static void fastpackEvenBitNoMask(final int[] in, int inpos,
+     final int[] out, int outpos, int b,
+     final int[] ho, final int[] lc) {
+   final int firstWordExtra = (32 % b != 0) ? 32 / b : (32 / b) - 1;
+   int readAt = inpos;
+   int writeAt = outpos;
+
+   IntVector loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+   IntVector word = loaded;
+   for (int k = 1; k <= firstWordExtra; k++) {
+     readAt += VLEN_512;
+     loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+     word = loaded.lanewise(VectorOperators.LSHL, b * k).or(word);
+   }
+   word.intoArray(out, writeAt);
+   writeAt += VLEN_512;
+
+   final int followUps = (b >>> 1) - 1;
+   for (int w = 0; w < followUps; w++) {
+     if (ho[w] == b) {
+       // Previous value ended on the boundary: nothing to carry.
+       word = IntVector.zero(SPECIES_512);
+     } else {
+       word = loaded.lanewise(VectorOperators.LSHR, ho[w]);
+     }
+     for (int k = 0; k < lc[w]; k++) {
+       readAt += VLEN_512;
+       loaded = IntVector.fromArray(SPECIES_512, in, readAt);
+       final int shift = b * k + (b - ho[w]);
+       word = loaded.lanewise(VectorOperators.LSHL, shift).or(word);
+     }
+     word.intoArray(out, writeAt);
+     writeAt += VLEN_512;
+   }
+ }
+
+ /**
+  * Unpacks values of an even bit width {@code b} using 512-bit vectors.
+  *
+  * <p>The first packed vector yields {@code 32 / b} complete values. Each of
+  * the following {@code b / 2 - 1} packed vectors first completes the value
+  * carried over from its predecessor (low bits selected by {@code masks[w]},
+  * shifted left by {@code b - lo[w]}) and then yields {@code lc[w]} complete
+  * values starting at bit {@code lo[w]}. Whenever the values end exactly on a
+  * word boundary the carry is reset to zero instead.
+  *
+  * <p>NOTE(review): the method name looks like a typo for "fastUnpackEvenBit";
+  * it is kept because callers reference this spelling.
+  *
+  * @param in packed input words
+  * @param inpos index of the first packed word to read
+  * @param out destination for the unpacked values
+  * @param outpos index of the first unpacked value to write
+  * @param b bit width of each value
+  * @param lo per packed vector: bit offset of its first complete value
+  * @param masks per packed vector: mask selecting the bits that finish the
+  *     carried value
+  * @param lc per packed vector: number of complete values it contains
+  */
+ private static void fastUnpackEventBit(final int[] in, int inpos,
+     final int[] out, int outpos, int b,
+     final int[] lo, int[] masks,
+     int[] lc) {
+   final int lowBits = (1 << b) - 1;
+   final int wholeInFirst = 32 / b;
+   int writeAt = outpos;
+
+   IntVector packed = IntVector.fromArray(SPECIES_512, in, inpos);
+   int k = 0;
+   while (k < wholeInFirst) {
+     packed.lanewise(VectorOperators.LSHR, b * k)
+         .and(lowBits)
+         .intoArray(out, writeAt);
+     writeAt += VLEN_512;
+     k++;
+   }
+   IntVector partial = packed.lanewise(VectorOperators.LSHR, b * k).and(lowBits);
+   final boolean powerOfTwoWidth = (b & (b - 1)) == 0;
+   if (powerOfTwoWidth) {
+     // Values tile the 32-bit word exactly, so nothing is carried.
+     partial = IntVector.zero(SPECIES_512);
+   }
+
+   final int followUps = (b >>> 1) - 1;
+   for (int w = 0; w < followUps; w++) {
+     packed = IntVector.fromArray(SPECIES_512, in, inpos + (w + 1) * VLEN_512);
+     partial = packed.and(masks[w])
+         .lanewise(VectorOperators.LSHL, b - lo[w])
+         .or(partial);
+     partial.intoArray(out, writeAt);
+     writeAt += VLEN_512;
+
+     int j = 0;
+     while (j < lc[w]) {
+       packed.lanewise(VectorOperators.LSHR, b * j + lo[w])
+           .and(lowBits)
+           .intoArray(out, writeAt);
+       writeAt += VLEN_512;
+       j++;
+     }
+     if ((32 - lo[w]) % b == 0) {
+       // The last value ended on the word boundary: no carry.
+       partial = IntVector.zero(SPECIES_512);
+     } else {
+       partial = packed.lanewise(VectorOperators.LSHR, b * j + lo[w]).and(lowBits);
+     }
+   }
+ }
+
+ /**
+  * Scalar bit packer: writes the low {@code b} bits of {@code inlen} values
+  * back to back into {@code out}, letting values straddle word boundaries.
+  *
+  * <p>Bits are OR-ed into the destination, so the words being written are
+  * expected to be zero on entry (only a word that starts with spilled bits is
+  * assigned outright).
+  *
+  * @param in values to pack
+  * @param inpos index of the first value to read
+  * @param inlen number of values to pack
+  * @param out destination for the packed words
+  * @param outpos index of the first packed word to write
+  * @param b bit width of each value, 0 to 32
+  * @return {@code outpos + inlen} when {@code b == 32}; {@code outpos}
+  *     unchanged when {@code inlen == 0} or {@code b == 0}; otherwise the
+  *     index of the last word of {@code out} that was written
+  */
+ public static int slowpack(final int[] in, int inpos, int inlen,
+     final int[] out, int outpos, int b) {
+   if (inlen == 0)
+     return outpos;
+   if (b == 0) {
+     // Zero-width values occupy no space; without this guard the
+     // "% b" below would throw ArithmeticException.
+     return outpos;
+   }
+   if (b == 32) {
+     System.arraycopy(in, inpos, out, outpos, inlen);
+     return outpos + inlen;
+   }
+   int mask = (1 << b) - 1;
+   int c = 0; // bits written to the current word by complete values
+   int l = 0; // bits of the current value that still fit in the current word
+   int r = 0; // bits at the bottom of the current word taken by a spilled value
+   int val = 0;
+   for (int i = 0; i < inlen; i++) {
+     val = in[inpos + i] & mask;
+     out[outpos] |= val << (c + r);
+     c += b;
+     l = (32 - r) % b;
+     if (c + r >= 32) {
+       // Current word is full: advance unless this was the last value and it
+       // ended exactly on the boundary.
+       if (i < inlen - 1 || l != 0)
+         outpos++;
+       r = l == 0 ? 0 : b - l;
+       if (l != 0)
+         out[outpos] = val >> (b - r); // high bits that did not fit
+       c = 0;
+     }
+   }
+   return outpos;
+ }
+
+ /**
+  * Scalar bit unpacker: reads {@code outlen} values of {@code b} bits each
+  * from a back-to-back bit stream in {@code in}, where values may straddle
+  * word boundaries.
+  *
+  * @param in packed input words
+  * @param inpos index of the first packed word to read
+  * @param out destination for the unpacked values
+  * @param outpos index of the first unpacked value to write
+  * @param outlen number of values to unpack
+  * @param b bit width of each value, 0 to 32
+  * @return {@code inpos} plus the number of input words consumed
+  *     ({@code inpos} itself when {@code outlen == 0} or {@code b == 0})
+  */
+ public static int slowunpack(final int[] in, int inpos, final int[] out,
+     int outpos, int outlen, int b) {
+   if (outlen == 0) {
+     return inpos;
+   }
+   if (b == 0) {
+     // Zero-width values are all zero and consume no input; without this
+     // guard the "% b" below would throw ArithmeticException.
+     Arrays.fill(out, outpos, outpos + outlen, 0);
+     return inpos;
+   }
+   if (b == 32) {
+     System.arraycopy(in, inpos, out, outpos, outlen);
+     return inpos + outlen;
+   }
+   int mask = (1 << b) - 1;
+   int limit = outpos + outlen;
+   int r = 0; // bits of a straddling value stored at the bottom of this word
+   int val = 0;
+   int i = 0;
+   for (; outpos < limit; i++) {
+     if (r > 0)
+       // Rebuild the straddling value: top bits of the previous word joined
+       // with the low r bits of the current one.
+       out[outpos++] =
+           (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask);
+     val = in[inpos + i];
+     int j = 0;
+     int l = 32 - r;
+     // Bits of this word covered by complete values.
+     int ll = l % b == 0 ? l : l - b;
+     while (j < ll && outpos < limit) {
+       out[outpos++] = (val >> (j + r)) & mask;
+       j += b;
+     }
+     r = l % b == 0 ? 0 : b - (l % b);
+   }
+   return inpos + i;
+ }
+
+ /**
+  * Computes an output size, in 32-bit words, for {@code n} values of
+  * {@code b} bits.
+  *
+  * <p>The lane count is {@code VLEN_512} for even {@code b} and
+  * {@code VLEN_256} otherwise. Inputs no larger than one vector are reported
+  * as {@code n}; larger inputs are rounded up to whole groups of
+  * {@code (32 / b) * lanes} values, each group counted as {@code lanes} words.
+  *
+  * @param n number of values
+  * @param b bit width of each value
+  * @return {@code n} if it does not exceed the lane count, otherwise the
+  *     number of groups times the lane count
+  */
+ public static int numCompressedInts(int n, int b) {
+   final int lanes;
+   if (b % 2 == 0) {
+     lanes = VLEN_512;
+   } else {
+     lanes = VLEN_256;
+   }
+   if (n <= lanes) {
+     return n;
+   }
+   final int valuesPerGroup = (32 / b) * lanes;
+   final int groups = (n + valuesPerGroup - 1) / valuesPerGroup;
+   return groups * lanes;
+ }
+
+ /**
+ * Packs a block of integers at bit width {@code b}, keeping only the low
+ * {@code b} bits of each value.
+ *
+ * <p>Dispatches on {@code b}: 0 writes nothing, 32 copies 256 integers
+ * verbatim, odd widths go to the 256-bit kernel and even widths to the
+ * 512-bit kernel. A {@code b} outside 0..32 matches no case and leaves
+ * {@code out} untouched.
+ *
+ * <p>Each kernel call receives two tables with one entry per output vector
+ * after the first: the first table is the right shift applied to the last
+ * loaded input vector to carry over the bits that did not fit in the
+ * previous output vector, the second is the number of further input vectors
+ * merged into that output vector.
+ *
+ * @param in values to pack
+ * @param inpos index of the first value to read
+ * @param out destination for the packed words
+ * @param outpos index of the first packed word to write
+ * @param b bit width of each value
+ */
+ public static void fastpack(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ // Zero-width values need no storage.
+ break;
+ case 1:
+ fastpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {});
+ break;
+ case 2:
+ fastpackEvenBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {});
+ break;
+ case 3:
+ fastpackOddBit(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+ new int[] {0xb, 0xa});
+ break;
+ case 4:
+ fastpackEvenBit(in, inpos, out, outpos, 4, new int[] {0x4},
+ new int[] {0x8});
+ break;
+ case 5:
+ fastpackOddBit(in, inpos, out, outpos, 5, new int[] {0x2, 0x4, 0x1, 0x3},
+ new int[] {0x6, 0x7, 0x6, 0x6});
+ break;
+ case 6:
+ fastpackEvenBit(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+ new int[] {0x5, 0x5});
+ break;
+ case 7:
+ fastpackOddBit(in, inpos, out, outpos, 7,
+ new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+ new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+ break;
+ case 8:
+ fastpackEvenBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+ new int[] {0x4, 0x4, 0x4});
+ break;
+ case 9:
+ fastpackOddBit(in, inpos, out, outpos, 9,
+ new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+ new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+ break;
+ case 10:
+ fastpackEvenBit(in, inpos, out, outpos, 10,
+ new int[] {0x2, 0x4, 0x6, 0x8},
+ new int[] {0x3, 0x3, 0x3, 0x3});
+ break;
+ case 11:
+ fastpackOddBit(
+ in, inpos, out, outpos, 11,
+ new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+ new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+ break;
+ case 12:
+ fastpackEvenBit(in, inpos, out, outpos, 12,
+ new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+ new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+ break;
+ case 13:
+ fastpackOddBit(in, inpos, out, outpos, 13,
+ new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9, 0x2,
+ 0x8, 0x1, 0x7},
+ new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2,
+ 0x3, 0x2, 0x2});
+ break;
+ case 14:
+ fastpackEvenBit(in, inpos, out, outpos, 14,
+ new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+ new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+ break;
+ case 15:
+ fastpackOddBit(in, inpos, out, outpos, 15,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1, 0x3,
+ 0x5, 0x7, 0x9, 0xb, 0xd},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 16:
+ fastpackEvenBit(in, inpos, out, outpos, 16,
+ new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 17:
+ fastpackOddBit(in, inpos, out, outpos, 17,
+ new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0x10,
+ 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 18:
+ fastpackEvenBit(in, inpos, out, outpos, 18,
+ new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+ new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 19:
+ fastpackOddBit(in, inpos, out, outpos, 19,
+ new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3,
+ 0x10, 0xa, 0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+ new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+ break;
+ case 20:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 20,
+ new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 21:
+ fastpackOddBit(
+ in, inpos, out, outpos, 21,
+ new int[] {0xb, 0x1, 0xc, 0x2, 0xd, 0x3, 0xe, 0x4, 0xf, 0x5,
+ 0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 22:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 22,
+ new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+ new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 23:
+ fastpackOddBit(in, inpos, out, outpos, 23,
+ new int[] {0x9, 0x12, 0x4, 0xd, 0x16, 0x8, 0x11, 0x3,
+ 0xc, 0x15, 0x7, 0x10, 0x2, 0xb, 0x14, 0x6,
+ 0xf, 0x1, 0xa, 0x13, 0x5, 0xe},
+ new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 24:
+ fastpackEvenBit(
+ in, inpos, out, outpos, 24,
+ new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+ 0x10},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 25:
+ fastpackOddBit(in, inpos, out, outpos, 25,
+ new int[] {0x7, 0xe, 0x15, 0x3, 0xa, 0x11, 0x18, 0x6,
+ 0xd, 0x14, 0x2, 0x9, 0x10, 0x17, 0x5, 0xc,
+ 0x13, 0x1, 0x8, 0xf, 0x16, 0x4, 0xb, 0x12},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+ break;
+ case 26:
+ fastpackEvenBit(in, inpos, out, outpos, 26,
+ new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10, 0x16,
+ 0x2, 0x8, 0xe, 0x14},
+ new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1});
+ break;
+ case 27:
+ fastpackOddBit(in, inpos, out, outpos, 27,
+ new int[] {0x5, 0xa, 0xf, 0x14, 0x19, 0x3, 0x8,
+ 0xd, 0x12, 0x17, 0x1, 0x6, 0xb, 0x10,
+ 0x15, 0x1a, 0x4, 0x9, 0xe, 0x13, 0x18,
+ 0x2, 0x7, 0xc, 0x11, 0x16},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 28:
+ fastpackEvenBit(in, inpos, out, outpos, 28,
+ new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x4,
+ 0x8, 0xc, 0x10, 0x14, 0x18},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 29:
+ fastpackOddBit(
+ in, inpos, out, outpos, 29,
+ new int[] {0x3, 0x6, 0x9, 0xc, 0xf, 0x12, 0x15, 0x18, 0x1b, 0x1,
+ 0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+ 0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 30:
+ fastpackEvenBit(in, inpos, out, outpos, 30,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12,
+ 0x14, 0x16, 0x18, 0x1a, 0x1c},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 31:
+ fastpackOddBit(in, inpos, out, outpos, 31,
+ new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 32:
+ // Full-width values are stored verbatim.
+ System.arraycopy(in, inpos, out, outpos, 256);
+ break;
+ }
+ }
+
+ /**
+ * Packs a block of integers at bit width {@code b} without masking the
+ * inputs first; any bits above the low {@code b} of a value are shifted into
+ * the packed words as they are.
+ *
+ * <p>Dispatches on {@code b}: 0 writes nothing, 32 copies 256 integers
+ * verbatim, odd widths go to the 256-bit no-mask kernel and even widths to
+ * the 512-bit no-mask kernel. A {@code b} outside 0..32 matches no case and
+ * leaves {@code out} untouched.
+ *
+ * <p>Each kernel call receives two tables with one entry per output vector
+ * after the first: the first table is the right shift applied to the last
+ * loaded input vector to carry over the bits that did not fit in the
+ * previous output vector, the second is the number of further input vectors
+ * merged into that output vector.
+ *
+ * @param in values to pack
+ * @param inpos index of the first value to read
+ * @param out destination for the packed words
+ * @param outpos index of the first packed word to write
+ * @param b bit width of each value
+ */
+ public static void fastpackNoMask(final int[] in, int inpos, final int[] out,
+ int outpos, int b) {
+ switch (b) {
+ case 0:
+ // Zero-width values need no storage.
+ break;
+ case 1:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 1, new int[] {},
+ new int[] {});
+ break;
+ case 2:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 2, new int[] {},
+ new int[] {});
+ break;
+ case 3:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 3, new int[] {0x2, 0x1},
+ new int[] {0xb, 0xa});
+ break;
+ case 4:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 4, new int[] {0x4},
+ new int[] {0x8});
+ break;
+ case 5:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 5,
+ new int[] {0x2, 0x4, 0x1, 0x3},
+ new int[] {0x6, 0x7, 0x6, 0x6});
+ break;
+ case 6:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 6, new int[] {0x2, 0x4},
+ new int[] {0x5, 0x5});
+ break;
+ case 7:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 7,
+ new int[] {0x4, 0x1, 0x5, 0x2, 0x6, 0x3},
+ new int[] {0x5, 0x4, 0x5, 0x4, 0x5, 0x4});
+ break;
+ case 8:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 8,
+ new int[] {0x8, 0x8, 0x8},
+ new int[] {0x4, 0x4, 0x4});
+ break;
+ case 9:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 9,
+ new int[] {0x5, 0x1, 0x6, 0x2, 0x7, 0x3, 0x8, 0x4},
+ new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x3, 0x4, 0x3});
+ break;
+ case 10:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 10,
+ new int[] {0x2, 0x4, 0x6, 0x8},
+ new int[] {0x3, 0x3, 0x3, 0x3});
+ break;
+ case 11:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 11,
+ new int[] {0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1},
+ new int[] {0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2});
+ break;
+ case 12:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 12,
+ new int[] {0x8, 0x4, 0xc, 0x8, 0x4},
+ new int[] {0x3, 0x2, 0x3, 0x3, 0x2});
+ break;
+ case 13:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 13,
+ new int[] {0x6, 0xc, 0x5, 0xb, 0x4, 0xa, 0x3, 0x9,
+ 0x2, 0x8, 0x1, 0x7},
+ new int[] {0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3,
+ 0x2, 0x3, 0x2, 0x2});
+ break;
+ case 14:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 14,
+ new int[] {0x4, 0x8, 0xc, 0x2, 0x6, 0xa},
+ new int[] {0x2, 0x2, 0x3, 0x2, 0x2, 0x2});
+ break;
+ case 15:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 15,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x1,
+ 0x3, 0x5, 0x7, 0x9, 0xb, 0xd},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 16:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 16,
+ new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+ break;
+ case 17:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 17,
+ new int[] {0xf, 0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1,
+ 0x10, 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+ new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 18:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 18,
+ new int[] {0xe, 0xa, 0x6, 0x2, 0x10, 0xc, 0x8, 0x4},
+ new int[] {0x2, 0x2, 0x2, 0x1, 0x2, 0x2, 0x2, 0x1});
+ break;
+ case 19:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 19,
+ new int[] {0xd, 0x7, 0x1, 0xe, 0x8, 0x2, 0xf, 0x9, 0x3, 0x10, 0xa,
+ 0x4, 0x11, 0xb, 0x5, 0x12, 0xc, 0x6},
+ new int[] {0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x1,
+ 0x2, 0x2, 0x1, 0x2, 0x2, 0x1});
+ break;
+ case 20:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 20,
+ new int[] {0xc, 0x4, 0x10, 0x8, 0x14, 0xc, 0x4, 0x10, 0x8},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 21:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 21,
+ new int[] {0xb, 0x1, 0xc, 0x2, 0xd, 0x3, 0xe, 0x4, 0xf, 0x5,
+ 0x10, 0x6, 0x11, 0x7, 0x12, 0x8, 0x13, 0x9, 0x14, 0xa},
+ new int[] {0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1});
+ break;
+ case 22:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 22,
+ new int[] {0xa, 0x14, 0x8, 0x12, 0x6, 0x10, 0x4, 0xe, 0x2, 0xc},
+ new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 23:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 23,
+ new int[] {0x9, 0x12, 0x4, 0xd, 0x16, 0x8, 0x11, 0x3,
+ 0xc, 0x15, 0x7, 0x10, 0x2, 0xb, 0x14, 0x6,
+ 0xf, 0x1, 0xa, 0x13, 0x5, 0xe},
+ new int[] {0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1,
+ 0x2, 0x1, 0x1, 0x2, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 24:
+ fastpackEvenBitNoMask(
+ in, inpos, out, outpos, 24,
+ new int[] {0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8, 0x10, 0x18, 0x8,
+ 0x10},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1, 0x2, 0x1, 0x1});
+ break;
+ case 25:
+ fastpackOddBitNoMask(in, inpos, out, outpos, 25,
+ new int[] {0x7, 0xe, 0x15, 0x3, 0xa, 0x11,
+ 0x18, 0x6, 0xd, 0x14, 0x2, 0x9,
+ 0x10, 0x17, 0x5, 0xc, 0x13, 0x1,
+ 0x8, 0xf, 0x16, 0x4, 0xb, 0x12},
+ new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1});
+ break;
+ case 26:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 26,
+ new int[] {0x6, 0xc, 0x12, 0x18, 0x4, 0xa, 0x10,
+ 0x16, 0x2, 0x8, 0xe, 0x14},
+ new int[] {0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2,
+ 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 27:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 27,
+ new int[] {0x5, 0xa, 0xf, 0x14, 0x19, 0x3, 0x8, 0xd, 0x12,
+ 0x17, 0x1, 0x6, 0xb, 0x10, 0x15, 0x1a, 0x4, 0x9,
+ 0xe, 0x13, 0x18, 0x2, 0x7, 0xc, 0x11, 0x16},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1,
+ 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+ 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 28:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 28,
+ new int[] {0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c,
+ 0x4, 0x8, 0xc, 0x10, 0x14, 0x18},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 29:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 29,
+ new int[] {0x3, 0x6, 0x9, 0xc, 0xf, 0x12, 0x15, 0x18, 0x1b, 0x1,
+ 0x4, 0x7, 0xa, 0xd, 0x10, 0x13, 0x16, 0x19, 0x1c, 0x2,
+ 0x5, 0x8, 0xb, 0xe, 0x11, 0x14, 0x17, 0x1a},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 30:
+ fastpackEvenBitNoMask(in, inpos, out, outpos, 30,
+ new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10,
+ 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 31:
+ fastpackOddBitNoMask(
+ in, inpos, out, outpos, 31,
+ new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e},
+ new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+ 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+ break;
+ case 32:
+ // Full-width values are stored verbatim.
+ System.arraycopy(in, inpos, out, outpos, 256);
+ break;
+ }
+ }
+
+  public static void fastunpack(final int[] in, int inpos, final int[] out, // unpack dispatcher keyed on bit width b
+                                int outpos, int b) { // in/inpos/out/outpos are handed unchanged to the helpers below
+    switch (b) { // no default branch: a b outside 0..32 falls through and leaves out untouched
+    case 0: // zero-bit width: nothing is read from in, the 256 output slots are cleared
+      Arrays.fill(out, outpos, outpos + 256, 0);
+      break;
+    case 1: // odd widths delegate to fastUnpackOddBit; width 1 passes three empty tables
+      fastUnpackOddBit(in, inpos, out, outpos, 1, new int[] {}, new int[] {},
+                       new int[] {});
+      break;
+    case 2: // even widths delegate to fastUnpackEventBit (sic); width 2 also passes empty tables
+      fastUnpackEventBit(in, inpos, out, outpos, 2, new int[] {}, new int[] {},
+                         new int[] {});
+      break;
+    case 3: // first case with non-empty tables; the three arrays always share one length
+      fastUnpackOddBit(in, inpos, out, outpos, 3, new int[] {0x1, 0x2}, // 1st table: small counts n (presumably bit shifts; confirm in helper)
+                       new int[] {0x1, 0x3}, new int[] {0xa, 0xa}); // 2nd table: (1 << n) - 1 for each n; 3rd table: meaning not visible here
+      break;
+    case 4: // even b carries b / 2 - 1 table entries, odd b carries b - 1
+      fastUnpackEventBit(in, inpos, out, outpos, 4, new int[] {0x4},
+                         new int[] {0xf}, new int[] {0x7});
+      break;
+    case 5:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 5, new int[] {0x3, 0x1, 0x4, 0x2},
+          new int[] {0x7, 0x1, 0xf, 0x3}, new int[] {0x5, 0x6, 0x5, 0x6});
+      break;
+    case 6:
+      fastUnpackEventBit(in, inpos, out, outpos, 6, new int[] {0x4, 0x2},
+                         new int[] {0xf, 0x3}, new int[] {0x4, 0x5});
+      break;
+    case 7:
+      fastUnpackOddBit(in, inpos, out, outpos, 7,
+                       new int[] {0x3, 0x6, 0x2, 0x5, 0x1, 0x4},
+                       new int[] {0x7, 0x3f, 0x3, 0x1f, 0x1, 0xf},
+                       new int[] {0x4, 0x3, 0x4, 0x3, 0x4, 0x4});
+      break;
+    case 8:
+      fastUnpackEventBit(in, inpos, out, outpos, 8, new int[] {0x8, 0x8, 0x8},
+                         new int[] {0xff, 0xff, 0xff},
+                         new int[] {0x3, 0x3, 0x3});
+      break;
+    case 9:
+      fastUnpackOddBit(in, inpos, out, outpos, 9,
+                       new int[] {0x4, 0x8, 0x3, 0x7, 0x2, 0x6, 0x1, 0x5},
+                       new int[] {0xf, 0xff, 0x7, 0x7f, 0x3, 0x3f, 0x1, 0x1f},
+                       new int[] {0x3, 0x2, 0x3, 0x2, 0x3, 0x2, 0x3, 0x3});
+      break;
+    case 10:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 10, new int[] {0x8, 0x6, 0x4, 0x2},
+          new int[] {0xff, 0x3f, 0xf, 0x3}, new int[] {0x2, 0x2, 0x2, 0x3});
+      break;
+    case 11:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 11,
+          new int[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa},
+          new int[] {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff},
+          new int[] {0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2});
+      break;
+    case 12:
+      fastUnpackEventBit(in, inpos, out, outpos, 12,
+                         new int[] {0x4, 0x8, 0xc, 0x4, 0x8},
+                         new int[] {0xf, 0xff, 0xfff, 0xf, 0xff},
+                         new int[] {0x2, 0x2, 0x1, 0x2, 0x2});
+      break;
+    case 13:
+      fastUnpackOddBit(in, inpos, out, outpos, 13,
+                       new int[] {0x7, 0x1, 0x8, 0x2, 0x9, 0x3, 0xa, 0x4, 0xb,
+                                  0x5, 0xc, 0x6},
+                       new int[] {0x7f, 0x1, 0xff, 0x3, 0x1ff, 0x7, 0x3ff, 0xf,
+                                  0x7ff, 0x1f, 0xfff, 0x3f},
+                       new int[] {0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1,
+                                  0x2, 0x1, 0x2});
+      break;
+    case 14:
+      fastUnpackEventBit(in, inpos, out, outpos, 14,
+                         new int[] {0xa, 0x6, 0x2, 0xc, 0x8, 0x4},
+                         new int[] {0x3ff, 0x3f, 0x3, 0xfff, 0xff, 0xf},
+                         new int[] {0x1, 0x1, 0x2, 0x1, 0x1, 0x2});
+      break;
+    case 15:
+      fastUnpackOddBit(in, inpos, out, outpos, 15,
+                       new int[] {0xd, 0xb, 0x9, 0x7, 0x5, 0x3, 0x1, 0xe, 0xc,
+                                  0xa, 0x8, 0x6, 0x4, 0x2},
+                       new int[] {0x1fff, 0x7ff, 0x1ff, 0x7f, 0x1f, 0x7, 0x1,
+                                  0x3fff, 0xfff, 0x3ff, 0xff, 0x3f, 0xf, 0x3},
+                       new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1,
+                                  0x1, 0x1, 0x1, 0x1, 0x2});
+      break;
+    case 16:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 16,
+          new int[] {0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+          new int[] {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+          new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 17:
+      fastUnpackOddBit(in, inpos, out, outpos, 17,
+                       new int[] {0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x1,
+                                  0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf},
+                       new int[] {0x3, 0xf, 0x3f, 0xff, 0x3ff, 0xfff, 0x3fff,
+                                  0xffff, 0x1, 0x7, 0x1f, 0x7f, 0x1ff, 0x7ff,
+                                  0x1fff, 0x7fff},
+                       new int[] {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1,
+                                  0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 18:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 18,
+          new int[] {0x4, 0x8, 0xc, 0x10, 0x2, 0x6, 0xa, 0xe},
+          new int[] {0xf, 0xff, 0xfff, 0xffff, 0x3, 0x3f, 0x3ff, 0x3fff},
+          new int[] {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1});
+      break;
+    case 19:
+      fastUnpackOddBit(in, inpos, out, outpos, 19,
+                       new int[] {0x6, 0xc, 0x12, 0x5, 0xb, 0x11, 0x4, 0xa,
+                                  0x10, 0x3, 0x9, 0xf, 0x2, 0x8, 0xe, 0x1, 0x7,
+                                  0xd},
+                       new int[] {0x3f, 0xfff, 0x3ffff, 0x1f, 0x7ff, 0x1ffff,
+                                  0xf, 0x3ff, 0xffff, 0x7, 0x1ff, 0x7fff, 0x3,
+                                  0xff, 0x3fff, 0x1, 0x7f, 0x1fff},
+                       new int[] {0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x0,
+                                  0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1});
+      break;
+    case 20:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 20,
+          new int[] {0x8, 0x10, 0x4, 0xc, 0x14, 0x8, 0x10, 0x4, 0xc},
+          new int[] {0xff, 0xffff, 0xf, 0xfff, 0xfffff, 0xff, 0xffff, 0xf,
+                     0xfff},
+          new int[] {0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+      break;
+    case 21:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 21,
+          new int[] {0xa, 0x14, 0x9, 0x13, 0x8, 0x12, 0x7, 0x11, 0x6, 0x10,
+                     0x5, 0xf,  0x4, 0xe,  0x3, 0xd,  0x2, 0xc,  0x1, 0xb},
+          new int[] {0x3ff,   0xfffff, 0x1ff, 0x7ffff, 0xff, 0x3ffff, 0x7f,
+                     0x1ffff, 0x3f,    0xffff, 0x1f,   0x7fff, 0xf,   0x3fff,
+                     0x7,     0x1fff,  0x3,    0xfff,  0x1,    0x7ff},
+          new int[] {0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0,
+                     0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1});
+      break;
+    case 22:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 22,
+          new int[] {0xc, 0x2, 0xe, 0x4, 0x10, 0x6, 0x12, 0x8, 0x14, 0xa},
+          new int[] {0xfff, 0x3, 0x3fff, 0xf, 0xffff, 0x3f, 0x3ffff, 0xff,
+                     0xfffff, 0x3ff},
+          new int[] {0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1});
+      break;
+    case 23:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 23,
+          new int[] {0xe, 0x5, 0x13, 0xa,  0x1,  0xf, 0x6, 0x14,
+                     0xb, 0x2, 0x10, 0x7,  0x15, 0xc, 0x3, 0x11,
+                     0x8, 0x16, 0xd, 0x4,  0x12, 0x9},
+          new int[] {0x3fff,   0x1f,    0x7ffff, 0x3ff,   0x1,   0x7fff,
+                     0x3f,     0xfffff, 0x7ff,   0x3,     0xffff, 0x7f,
+                     0x1fffff, 0xfff,   0x7,     0x1ffff, 0xff,   0x3fffff,
+                     0x1fff,   0xf,     0x3ffff, 0x1ff},
+          new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0,
+                     0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1});
+      break;
+    case 24:
+      fastUnpackEventBit(
+          in, inpos, out, outpos, 24,
+          new int[] {0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10, 0x8, 0x18, 0x10,
+                     0x8},
+          new int[] {0xffff, 0xff, 0xffffff, 0xffff, 0xff, 0xffffff, 0xffff,
+                     0xff, 0xffffff, 0xffff, 0xff},
+          new int[] {0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+      break;
+    case 25:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 25,
+          new int[] {0x12, 0xb,  0x4,  0x16, 0xf, 0x8, 0x1,  0x13,
+                     0xc,  0x5,  0x17, 0x10, 0x9, 0x2, 0x14, 0xd,
+                     0x6,  0x18, 0x11, 0xa,  0x3, 0x15, 0xe, 0x7},
+          new int[] {0x3ffff, 0x7ff,   0xf,     0x3fffff, 0x7fff,   0xff,
+                     0x1,     0x7ffff, 0xfff,   0x1f,     0x7fffff, 0xffff,
+                     0x1ff,   0x3,     0xfffff, 0x1fff,   0x3f,     0xffffff,
+                     0x1ffff, 0x3ff,   0x7,     0x1fffff, 0x3fff,   0x7f},
+          new int[] {0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1});
+      break;
+    case 26:
+      fastUnpackEventBit(in, inpos, out, outpos, 26,
+                         new int[] {0x14, 0xe, 0x8, 0x2, 0x16, 0x10, 0xa, 0x4,
+                                    0x18, 0x12, 0xc, 0x6},
+                         new int[] {0xfffff, 0x3fff, 0xff, 0x3, 0x3fffff,
+                                    0xffff, 0x3ff, 0xf, 0xffffff, 0x3ffff,
+                                    0xfff, 0x3f},
+                         new int[] {0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x0,
+                                    0x0, 0x0, 0x1});
+      break;
+    case 27:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 27,
+          new int[] {0x16, 0x11, 0xc,  0x7, 0x2, 0x18, 0x13, 0xe,  0x9,
+                     0x4,  0x1a, 0x15, 0x10, 0xb, 0x6, 0x1,  0x17, 0x12,
+                     0xd,  0x8,  0x3,  0x19, 0x14, 0xf, 0xa, 0x5},
+          new int[] {0x3fffff, 0x1ffff, 0xfff, 0x7f, 0x3, 0xffffff,
+                     0x7ffff, 0x3fff, 0x1ff, 0xf, 0x3ffffff, 0x1fffff,
+                     0xffff, 0x7ff, 0x3f, 0x1, 0x7fffff, 0x3ffff,
+                     0x1fff, 0xff, 0x7, 0x1ffffff, 0xfffff, 0x7fff,
+                     0x3ff, 0x1f},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
+                     0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 28:
+      fastUnpackEventBit(in, inpos, out, outpos, 28,
+                         new int[] {0x18, 0x14, 0x10, 0xc, 0x8, 0x4, 0x1c, 0x18,
+                                    0x14, 0x10, 0xc, 0x8, 0x4},
+                         new int[] {0xffffff, 0xfffff, 0xffff, 0xfff, 0xff, 0xf,
+                                    0xfffffff, 0xffffff, 0xfffff, 0xffff, 0xfff,
+                                    0xff, 0xf},
+                         new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0,
+                                    0x0, 0x0, 0x0, 0x1});
+      break;
+    case 29:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 29,
+          new int[] {0x1a, 0x17, 0x14, 0x11, 0xe, 0xb, 0x8, 0x5, 0x2, 0x1c,
+                     0x19, 0x16, 0x13, 0x10, 0xd, 0xa, 0x7, 0x4, 0x1, 0x1b,
+                     0x18, 0x15, 0x12, 0xf,  0xc, 0x9, 0x6, 0x3},
+          new int[] {0x3ffffff, 0x7fffff, 0xfffff, 0x1ffff, 0x3fff,
+                     0x7ff,     0xff,     0x1f,    0x3,     0xfffffff,
+                     0x1ffffff, 0x3fffff, 0x7ffff, 0xffff,  0x1fff,
+                     0x3ff,     0x7f,     0xf,     0x1,     0x7ffffff,
+                     0xffffff,  0x1fffff, 0x3ffff, 0x7fff,  0xfff,
+                     0x1ff,     0x3f,     0x7},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 30:
+      fastUnpackEventBit(in, inpos, out, outpos, 30,
+                         new int[] {0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
+                                    0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2},
+                         new int[] {0xfffffff, 0x3ffffff, 0xffffff, 0x3fffff,
+                                    0xfffff, 0x3ffff, 0xffff, 0x3fff, 0xfff,
+                                    0x3ff, 0xff, 0x3f, 0xf, 0x3},
+                         new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                                    0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+    case 31:
+      fastUnpackOddBit(
+          in, inpos, out, outpos, 31,
+          new int[] {0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15,
+                     0x14, 0x13, 0x12, 0x11, 0x10, 0xf,  0xe,  0xd,  0xc,  0xb,
+                     0xa,  0x9,  0x8,  0x7,  0x6,  0x5,  0x4,  0x3,  0x2,  0x1},
+          new int[] {0x3fffffff, 0x1fffffff, 0xfffffff, 0x7ffffff, 0x3ffffff,
+                     0x1ffffff,  0xffffff,   0x7fffff,  0x3fffff,  0x1fffff,
+                     0xfffff,    0x7ffff,    0x3ffff,   0x1ffff,   0xffff,
+                     0x7fff,     0x3fff,     0x1fff,    0xfff,     0x7ff,
+                     0x3ff,      0x1ff,      0xff,      0x7f,      0x3f,
+                     0x1f,       0xf,        0x7,       0x3,       0x1},
+          new int[] {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1});
+      break;
+
+    case 32: // full 32-bit width: 256 ints are copied from in to out unchanged
+      System.arraycopy(in, inpos, out, outpos, 256);
+      break;
+    }
+  }
+}
diff --git a/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
new file mode 100644
index 0000000..7374fa5
--- /dev/null
+++ b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java
@@ -0,0 +1,366 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ * (c) Intel Corp. (for Vector implementation)
+ */
+package me.lemire.integercompression.vector;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import me.lemire.integercompression.IntegerCODEC;
+import me.lemire.integercompression.SkippableIntegerCODEC;
+import me.lemire.integercompression.IntWrapper;
+
+/**
+ * This is a patching scheme designed for speed.
+ * It encodes integers in blocks of integers within pages of
+ * up to 65536 integers. Note that it is important, to get good
+ * compression and good performance, to use sizeable arrays (greater than 1024
+ * integers). For arrays containing a number of integers that is not divisible
+ * by BLOCK_SIZE, you should use it in conjunction with another CODEC:
+ *
+ * IntegerCODEC ic = new Composition(new VectorFastPFOR(), new VariableByte()).
+ *