/*
 * Decompiled with CFR 0.152.
 */
package org.graalvm.compiler.lir.aarch64;

import java.util.Arrays;
import jdk.vm.ci.aarch64.AArch64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.aarch64.AArch64ASIMDAssembler;
import org.graalvm.compiler.asm.aarch64.AArch64Address;
import org.graalvm.compiler.asm.aarch64.AArch64Assembler;
import org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler;
import org.graalvm.compiler.core.common.Stride;
import org.graalvm.compiler.core.common.StrideUtil;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstruction;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.aarch64.AArch64ComplexVectorOp;
import org.graalvm.compiler.lir.aarch64.AArch64ControlFlow;
import org.graalvm.compiler.lir.aarch64.AArch64LIRInstruction;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

@Opcode(value="ARRAY_EQUALS")
public final class AArch64ArrayEqualsOp
extends AArch64ComplexVectorOp {
    /** LIR instruction class descriptor for this op; handed to the superclass constructor. */
    public static final LIRInstructionClass<AArch64ArrayEqualsOp> TYPE = LIRInstructionClass.create(AArch64ArrayEqualsOp.class);
    /** Stride of array A as passed to the constructor; emitCode uses it only when no dynamic strides value is present. */
    private final Stride argStrideA;
    /** Stride of array B as passed to the constructor; emitCode uses it only when no dynamic strides value is present. */
    private final Stride argStrideB;
    /** Stride of the mask as passed to the constructor (may be null); emitCode uses it only when no dynamic strides value is present. */
    private final Stride argStrideM;
    /** Result operand (DWORD); emitCode writes it last via cset on the EQ condition. */
    @LIRInstruction.Def(value={LIRInstruction.OperandFlag.REG})
    protected Value resultValue;
    /** Base pointer of array A (QWORD). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value arrayAValue;
    /** Offset added to the base pointer of array A (QWORD). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value offsetAValue;
    /** Base pointer of array B (same platform kind as array A). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value arrayBValue;
    /** Offset added to the base pointer of array B (QWORD). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value offsetBValue;
    /** Length operand (DWORD); copied into a temp register before being modified. */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value lengthValue;
    /** Optional mask pointer; Value.ILLEGAL when the constructor received null (see withMask()). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    protected Value arrayMaskValue;
    /** Optional run-time stride selector (DWORD); Value.ILLEGAL when the constructor received null (see withDynamicStrides()). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value dynamicStridesValue;
    /**
     * General-purpose temps: [0] working copy of the length, [1] scratch, then one slot for the mask
     * pointer (if a mask is present) followed by one slot for the dynamic strides value (if present).
     */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    protected Value[] temp;
    /** Consecutive vector temps (12 with a mask, 8 without); accessed through v(int). */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    protected Value[] vectorTemp;

    public AArch64ArrayEqualsOp(LIRGeneratorTool tool, Stride strideA, Stride strideB, Stride strideM, Value result, Value arrayA, Value offsetA, Value arrayB, Value offsetB, Value length, Value mask, Value dynamicStrides) {
        super((LIRInstructionClass<? extends AArch64LIRInstruction>)TYPE);
        this.argStrideA = strideA;
        this.argStrideB = strideB;
        this.argStrideM = strideM;
        if (strideM != null && strideM != strideB) {
            GraalError.guarantee(strideA == Stride.S2 && strideB == Stride.S2 && strideM == Stride.S1 || strideA == Stride.S1 && strideB == Stride.S2 && strideM == Stride.S1, "The only supported cases where strideMask is not equal to strideB are : S2 - S2 - S1 and S1 - S2 - S1");
        }
        GraalError.guarantee(result.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(arrayA.getPlatformKind() == AArch64Kind.QWORD && arrayA.getPlatformKind() == arrayB.getPlatformKind(), "pointer value expected");
        GraalError.guarantee(offsetA.getPlatformKind() == AArch64Kind.QWORD, "long value expected");
        GraalError.guarantee(offsetB.getPlatformKind() == AArch64Kind.QWORD, "long value expected");
        GraalError.guarantee(length.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(dynamicStrides == null || dynamicStrides.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        this.resultValue = result;
        this.arrayAValue = arrayA;
        this.offsetAValue = offsetA;
        this.arrayBValue = arrayB;
        this.offsetBValue = offsetB;
        this.lengthValue = length;
        this.arrayMaskValue = mask == null ? Value.ILLEGAL : mask;
        this.dynamicStridesValue = dynamicStrides == null ? Value.ILLEGAL : dynamicStrides;
        this.temp = AArch64ArrayEqualsOp.allocateTempRegisters(tool, 2 + (this.withMask() ? 1 : 0) + (this.withDynamicStrides() ? 1 : 0));
        this.vectorTemp = AArch64ArrayEqualsOp.allocateConsecutiveVectorRegisters(tool, this.withMask() ? 12 : 8);
    }

    /**
     * Emits the comparison code. The array pointers are materialised in scratch registers, then
     * either the statically-strided comparison is emitted, or (with a dynamic strides value) a
     * 9-entry jump table dispatching to one comparison variant per stride combination. All paths
     * end at a common label where the result is derived from the EQ condition.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
        try (AArch64MacroAssembler.ScratchRegister scratchA = masm.getScratchRegister();
             AArch64MacroAssembler.ScratchRegister scratchB = masm.getScratchRegister()) {
            final Register result = ValueUtil.asRegister(resultValue);
            final Register ptrA = scratchA.getRegister();
            final Register ptrB = scratchB.getRegister();
            final Register len = ValueUtil.asRegister(temp[0]);
            final Register scratch = ValueUtil.asRegister(temp[1]);
            final boolean masked = withMask();
            final Register ptrMask = masked ? ValueUtil.asRegister(temp[2]) : null;
            final Label done = new Label();

            // Effective start addresses: array base plus offset.
            masm.add(64, ptrA, ValueUtil.asRegister(arrayAValue), ValueUtil.asRegister(offsetAValue));
            masm.add(64, ptrB, ValueUtil.asRegister(arrayBValue), ValueUtil.asRegister(offsetBValue));
            if (masked) {
                masm.mov(64, ptrMask, ValueUtil.asRegister(arrayMaskValue));
            }
            // Work on a copy of the length; the comparison code modifies it.
            masm.mov(32, len, ValueUtil.asRegister(lengthValue));

            if (!withDynamicStrides()) {
                emitArrayEquals(masm, argStrideA, argStrideB, argStrideM, ptrA, ptrB, ptrMask, len, scratch, result, done);
            } else {
                final Label[] variants = new Label[9];
                Arrays.setAll(variants, ignored -> new Label());

                final Register strideSelector = ValueUtil.asRegister(temp[masked ? 3 : 2]);
                masm.mov(32, strideSelector, ValueUtil.asRegister(dynamicStridesValue));
                AArch64ControlFlow.RangeTableSwitchOp.emitJumpTable(crb, masm, scratch, strideSelector, 0, 8, Arrays.stream(variants));

                // Equal-stride variants share the S1/S1 code: the S4/S4 entry doubles the length and
                // falls through to S2/S2, which doubles it again and falls through to S1/S1.
                masm.align(16);
                masm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S4, Stride.S4)]);
                masm.lsl(64, len, len, 1L);
                masm.align(16);
                masm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S2, Stride.S2)]);
                masm.lsl(64, len, len, 1L);
                masm.align(16);
                masm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S1, Stride.S1)]);
                emitArrayEquals(masm, Stride.S1, Stride.S1, Stride.S1, ptrA, ptrB, ptrMask, len, scratch, result, done);
                masm.jmp(done);

                final Stride[] candidates = {Stride.S1, Stride.S2, Stride.S4};
                for (Stride first : candidates) {
                    for (Stride second : candidates) {
                        if (first.log2 == second.log2) {
                            // Equal strides were handled above.
                            continue;
                        }
                        if (!masked) {
                            if (first.log2 < second.log2) {
                                // Without a mask this combination is served by the swapped variant below.
                                continue;
                            }
                            // Entry for the mirrored combination: swap the array pointers, then fall
                            // through into the shared implementation.
                            masm.align(16);
                            masm.bind(variants[StrideUtil.getDirectStubCallIndex(second, first)]);
                            masm.mov(64, scratch, ptrA);
                            masm.mov(64, ptrA, ptrB);
                            masm.mov(64, ptrB, scratch);
                        }
                        masm.align(16);
                        masm.bind(variants[StrideUtil.getDirectStubCallIndex(first, second)]);
                        emitArrayEquals(masm, first, second, second, ptrA, ptrB, ptrMask, len, scratch, result, done);
                        masm.jmp(done);
                    }
                }
            }

            masm.align(16);
            masm.bind(done);
            masm.cset(32, result, AArch64Assembler.ConditionFlag.EQ);
        }
    }

    /** Returns whether a mask operand is present, i.e. the mask slot does not hold the ILLEGAL placeholder. */
    private boolean withMask() {
        final boolean maskAbsent = ValueUtil.isIllegal(arrayMaskValue);
        return !maskAbsent;
    }

    /** Returns whether a run-time stride selector is present, i.e. its slot does not hold the ILLEGAL placeholder. */
    private boolean withDynamicStrides() {
        final boolean selectorAbsent = ValueUtil.isIllegal(dynamicStridesValue);
        return !selectorAbsent;
    }

    /**
     * Emits one complete comparison routine for a fixed stride combination: a first 64-byte
     * compare, a 64-byte loop over the wider array, a final overlapping 64-byte compare ending
     * at the last chunk, and a chain of tail handlers for inputs shorter than 64 bytes.
     *
     * @param asm assembler to emit into
     * @param strideA stride of array A
     * @param strideB stride of array B
     * @param strideM stride of the mask; only dereferenced when a mask is present
     * @param arrayA pointer register for array A (modified)
     * @param arrayB pointer register for array B (modified)
     * @param arrayM pointer register for the mask (modified when a mask is present)
     * @param len length register; reused to hold the address of the last 64-byte chunk
     * @param tmp scratch register
     * @param ret result register, forwarded to the short-tail handlers
     * @param end label to branch to once the outcome is known
     */
    private void emitArrayEquals(AArch64MacroAssembler asm, Stride strideA, Stride strideB, Stride strideM, Register arrayA, Register arrayB, Register arrayM, Register len, Register tmp, Register ret, Label end) {
        final Label below64 = new Label();
        final Label below32 = new Label();
        final Label below16 = new Label();
        final Label below8 = new Label();
        final Label below4 = new Label();
        final Label below2 = new Label();
        final Label loopHead = new Label();
        final Label finalChunk = new Label();

        // "Max" is the array with the wider stride; on equal strides array B plays that role.
        final boolean bIsNarrower = strideB.value < strideA.value;
        final Register arrayMax = bIsNarrower ? arrayA : arrayB;
        final Register arrayMin = bIsNarrower ? arrayB : arrayA;
        final Stride strideMax = Stride.max(strideB, strideA);
        final Stride strideMin = Stride.min(strideB, strideA);
        final int minShift = strideMax.log2 - strideMin.log2;
        final boolean masked = withMask();

        // Subtract the element count covering 64 bytes of the wider array; a negative result means
        // the input is shorter than that.
        asm.subs(64, len, len, 64 >> strideMax.log2);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, below64);

        // From here on the length register holds the address of the last 64-byte chunk.
        final Register lastChunkAddress = len;
        asm.add(64, lastChunkAddress, arrayMax, len, AArch64Assembler.ShiftType.LSL, strideMax.log2);

        // First (possibly unaligned) 64-byte chunk.
        simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.NE, end);
        asm.cmp(64, lastChunkAddress, arrayMax);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.LS, finalChunk);

        // Round the wider array's pointer down to a multiple of 64 and move the other pointers back
        // by the equivalent number of bytes.
        asm.and(64, tmp, arrayMax, 63L);
        asm.sub(64, arrayMin, arrayMin, tmp, AArch64Assembler.ShiftType.LSR, minShift);
        if (masked) {
            asm.sub(64, arrayM, arrayM, tmp, AArch64Assembler.ShiftType.LSR, strideMax.log2 - strideM.log2);
        }
        asm.bic(64, arrayMax, arrayMax, 63L);

        asm.align(16);
        asm.bind(loopHead);
        simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.NE, end);
        asm.cmp(64, arrayMax, lastChunkAddress);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.LO, loopHead);

        // Final chunk: rewind all pointers so that the compare ends exactly at the end of the input.
        asm.bind(finalChunk);
        asm.sub(64, tmp, arrayMax, lastChunkAddress);
        asm.mov(64, arrayMax, lastChunkAddress);
        asm.sub(64, arrayMin, arrayMin, tmp, AArch64Assembler.ShiftType.LSR, minShift);
        if (masked) {
            asm.sub(64, arrayM, arrayM, tmp, AArch64Assembler.ShiftType.LSR, strideMax.log2 - strideM.log2);
        }
        simdCompare64(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM);
        asm.jmp(end);

        // Handlers for inputs shorter than 64 bytes, each chaining to the next smaller one.
        tail32(asm, strideMax, strideMin, strideA, strideM, arrayMax, arrayMin, arrayM, len, below64, below32, end);
        tail16(asm, strideA, strideB, strideM, strideMax, strideMin, arrayA, arrayB, arrayM, len, below32, below16, end);
        final Label[] shortTailChain = {below16, below8, below4, below2, null};
        final int[] shortTailSizes = {8, 4, 2, 1};
        for (int i = 0; i < shortTailSizes.length; i++) {
            tailLessThan16(asm, strideA, strideB, strideM, strideMax, arrayA, arrayB, arrayM, len, tmp, ret, shortTailChain[i], shortTailChain[i + 1], end, shortTailSizes[i]);
        }
    }

    /** Returns the register backing the vector temp at the given position. */
    private Register v(int index) {
        final Value slot = vectorTemp[index];
        return ValueUtil.asRegister(slot);
    }

    /**
     * Emits a comparison of 64 bytes of the wider-stride array against the corresponding elements
     * of the narrower-stride array. With a mask, the mask is first OR-ed into array A's data.
     * All loads are post-indexed, so the pointer registers are advanced past the consumed data.
     * The differences are accumulated into v(0), which is finally passed to vectorCheckZero
     * (defined outside this view; callers branch on NE right after this method, so it presumably
     * sets the condition flags).
     *
     * @param asm assembler to emit into
     * @param strideMax the wider of the two array strides
     * @param strideMin the narrower of the two array strides
     * @param strideA stride of array A; decides which loaded registers receive the mask
     * @param strideMask stride of the mask; only consulted when a mask is present
     * @param arrayMax pointer into the wider-stride array (advanced by 64 bytes)
     * @param arrayMin pointer into the narrower-stride array (advanced by 64, 32 or 16 bytes)
     * @param arrayMask pointer into the mask (advanced when a mask is present)
     */
    private void simdCompare64(AArch64MacroAssembler asm, Stride strideMax, Stride strideMin, Stride strideA, Stride strideMask, Register arrayMax, Register arrayMin, Register arrayMask) {
        AArch64ASIMDAssembler.ElementSize minESize = AArch64ASIMDAssembler.ElementSize.fromStride(strideMin);
        // Dispatch on how many times wider the wide array's elements are (as a power of two).
        switch (strideMax.log2 - strideMin.log2) {
            case 0: {
                // Equal strides: v0,v1,v4,v5 <- 64 bytes of arrayMin; v2,v3,v6,v7 <- 64 bytes of arrayMax.
                asm.fldp(128, this.v(0), this.v(1), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMin, 32));
                asm.fldp(128, this.v(2), this.v(3), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMax, 32));
                if (this.withMask()) {
                    // First 32 mask bytes -> v8,v9.
                    asm.fldp(128, this.v(8), this.v(9), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMask, 32));
                }
                asm.fldp(128, this.v(4), this.v(5), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMin, 32));
                asm.fldp(128, this.v(6), this.v(7), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMax, 32));
                if (this.withMask()) {
                    // Indices of the registers holding array A's first 32 bytes.
                    int arrayA2;
                    int arrayA1 = strideA == strideMin ? 0 : 2;
                    int n = arrayA2 = strideA == strideMin ? 1 : 3;
                    if (strideMask == strideMax) {
                        // Mask has the same stride: load its second 32 bytes -> v10,v11.
                        asm.fldp(128, this.v(10), this.v(11), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMask, 32));
                    } else {
                        assert (strideMax == Stride.S2 && strideMask == Stride.S1);
                        // Byte mask over 2-byte elements: widen the 32 loaded mask bytes into v8..v11.
                        // v9 and v8 are written last because they are still needed as sources.
                        asm.neon.uxtlVV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(10), this.v(9));
                        asm.neon.uxtl2VV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(11), this.v(9));
                        asm.neon.uxtl2VV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(9), this.v(8));
                        asm.neon.uxtlVV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(8), this.v(8));
                    }
                    // arrayA |= mask (first half).
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(8));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA2), this.v(arrayA2), this.v(9));
                }
                // XOR the first halves of both arrays; non-zero bits mark differences.
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(3));
                if (this.withMask()) {
                    // arrayA |= mask (second half).
                    int arrayA3 = strideA == strideMin ? 4 : 6;
                    int arrayA4 = strideA == strideMin ? 5 : 7;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA3), this.v(arrayA3), this.v(10));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA4), this.v(arrayA4), this.v(11));
                }
                // XOR the second halves, then OR-reduce all four difference vectors into v0.
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(4), this.v(4), this.v(6));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(5), this.v(5), this.v(7));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(4), this.v(4), this.v(5));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                break;
            }
            case 1: {
                // Wide elements are twice as large: two LD2 structure loads split 64 bytes of arrayMax
                // into (v0,v1) and (v2,v3) at narrow-element granularity.
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(0), this.v(1), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMax, 32));
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(2), this.v(3), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMax, 32));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        // Narrow mask: 32 bytes -> v6,v7.
                        asm.fldp(128, this.v(6), this.v(7), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMask, 32));
                    } else {
                        // Wide mask: split 64 bytes into (v6,v7) and (v8,v9), mirroring the arrayMax loads.
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(6), this.v(7), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMask, 32));
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(8), this.v(9), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD2_MULTIPLE_2R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMask, 32));
                    }
                }
                // 32 bytes of arrayMin -> v4,v5.
                asm.fldp(128, this.v(4), this.v(5), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayMin, 32));
                if (this.withMask()) {
                    // Registers holding array A's data that is compared element-wise, and the matching mask registers.
                    int arrayA1 = strideA == strideMax ? 0 : 4;
                    int arrayA2 = strideA == strideMax ? 2 : 5;
                    int arrayM1 = 6;
                    int arrayM2 = strideMask == strideMin ? 7 : 8;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(arrayM1));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA2), this.v(arrayA2), this.v(arrayM2));
                    if (strideMask == strideMax) {
                        // The wide mask also has second structure halves (v7,v9) to combine with v1,v3:
                        // OR when array A is the wide array, XOR when array A is the narrow one.
                        if (strideA == strideMax) {
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        } else {
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        }
                    }
                }
                // Compare v0/v2 with the narrow data; whatever remains in v1/v3 is OR-ed in as a difference.
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(5));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(3));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                break;
            }
            case 2: {
                // Wide elements are four times as large: one LD4 structure load splits 64 bytes of
                // arrayMax into v0..v3 at narrow-element granularity.
                asm.neon.ld4MultipleVVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(0), this.v(1), this.v(2), this.v(3), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD4_MULTIPLE_4R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMax, 64));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        // Narrow mask: 16 bytes -> v5.
                        asm.fldr(128, this.v(5), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, arrayMask, 16));
                    } else {
                        // Wide mask: split 64 bytes into v5..v8, mirroring the arrayMax load.
                        asm.neon.ld4MultipleVVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(5), this.v(6), this.v(7), this.v(8), AArch64Address.createStructureImmediatePostIndexAddress(AArch64ASIMDAssembler.ASIMDInstruction.LD4_MULTIPLE_4R, AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, arrayMask, 64));
                    }
                }
                // 16 bytes of arrayMin -> v4.
                asm.fldr(128, this.v(4), AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, arrayMin, 16));
                if (this.withMask()) {
                    int arrayA1 = strideA == strideMax ? 0 : 4;
                    int arrayM1 = 5;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(arrayM1));
                    if (strideMask == strideMax) {
                        // Combine the remaining wide-mask registers (v6..v8) with v1..v3:
                        // OR when array A is the wide array, XOR when array A is the narrow one.
                        if (strideA == strideMax) {
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(6));
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(7));
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(8));
                        } else {
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(6));
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(7));
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(8));
                        }
                    }
                }
                // Compare v0 with the narrow data; whatever remains in v1..v3 is OR-ed in as a difference.
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(3));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                break;
            }
            default: {
                throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented");
            }
        }
        // v0 now holds the OR of all difference vectors; hand it to the shared zero check.
        AArch64ArrayEqualsOp.vectorCheckZero(asm, this.v(0), this.v(0));
    }

    private void tail32(AArch64MacroAssembler asm, Stride strideMax, Stride strideMin, Stride strideA, Stride strideMask, Register arrayMax, Register arrayMin, Register arrayMask, Register len, Label entry, Label nextTail, Label end) {
        asm.bind(entry);
        asm.adds(64, len, len, 32 >> strideMax.log2);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, nextTail);
        AArch64ASIMDAssembler.ElementSize minESize = AArch64ASIMDAssembler.ElementSize.fromStride(strideMin);
        switch (strideMax.log2 - strideMin.log2) {
            case 0: {
                asm.fldp(128, this.v(0), this.v(1), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMin));
                asm.fldp(128, this.v(2), this.v(3), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMax));
                if (this.withMask()) {
                    if (strideMask == strideMax) {
                        asm.fldp(128, this.v(8), this.v(9), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMask));
                    } else {
                        asm.fldr(128, this.v(8), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMask));
                    }
                }
                asm.add(64, arrayMin, arrayMin, len, AArch64Assembler.ShiftType.LSL, strideMin.log2);
                asm.add(64, arrayMax, arrayMax, len, AArch64Assembler.ShiftType.LSL, strideMax.log2);
                if (this.withMask()) {
                    asm.add(64, arrayMask, arrayMask, len, AArch64Assembler.ShiftType.LSL, strideMask.log2);
                }
                asm.fldp(128, this.v(4), this.v(5), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMin));
                asm.fldp(128, this.v(6), this.v(7), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMax));
                if (this.withMask()) {
                    int arrayA2;
                    int arrayA1 = strideA == strideMin ? 0 : 2;
                    int n = arrayA2 = strideA == strideMin ? 1 : 3;
                    if (strideMask == strideMax) {
                        asm.fldp(128, this.v(10), this.v(11), AArch64Address.createPairBaseRegisterOnlyAddress(128, arrayMask));
                    } else {
                        asm.fldr(128, this.v(10), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMask));
                        assert (strideMax == Stride.S2 && strideMask == Stride.S1);
                        asm.neon.uxtl2VV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(9), this.v(8));
                        asm.neon.uxtl2VV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(11), this.v(10));
                        asm.neon.uxtlVV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(8), this.v(8));
                        asm.neon.uxtlVV(AArch64ASIMDAssembler.ElementSize.Byte, this.v(10), this.v(10));
                    }
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(8));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA2), this.v(arrayA2), this.v(9));
                }
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(3));
                if (this.withMask()) {
                    int arrayA3 = strideA == strideMin ? 4 : 6;
                    int arrayA4 = strideA == strideMin ? 5 : 7;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA3), this.v(arrayA3), this.v(10));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA4), this.v(arrayA4), this.v(11));
                }
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(4), this.v(4), this.v(6));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(5), this.v(5), this.v(7));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(4), this.v(4), this.v(5));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                break;
            }
            case 1: {
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(0), this.v(1), AArch64Address.createStructureNoOffsetAddress(arrayMax));
                asm.fldr(128, this.v(4), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMin));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        asm.fldr(128, this.v(6), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMask));
                    } else {
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(6), this.v(7), AArch64Address.createStructureNoOffsetAddress(arrayMask));
                    }
                }
                asm.add(64, arrayMin, arrayMin, len, AArch64Assembler.ShiftType.LSL, strideMin.log2);
                asm.add(64, arrayMax, arrayMax, len, AArch64Assembler.ShiftType.LSL, strideMax.log2);
                if (this.withMask()) {
                    asm.add(64, arrayMask, arrayMask, len, AArch64Assembler.ShiftType.LSL, strideMask.log2);
                }
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(2), this.v(3), AArch64Address.createStructureNoOffsetAddress(arrayMax));
                asm.fldr(128, this.v(5), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMin));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        asm.fldr(128, this.v(7), AArch64Address.createBaseRegisterOnlyAddress(128, arrayMask));
                    } else {
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize, this.v(8), this.v(9), AArch64Address.createStructureNoOffsetAddress(arrayMask));
                    }
                    int arrayA1 = strideA == strideMax ? 0 : 4;
                    int arrayA2 = strideA == strideMax ? 2 : 5;
                    int arrayM1 = 6;
                    int arrayM2 = strideMask == strideMin ? 7 : 8;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(arrayM1));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA2), this.v(arrayA2), this.v(arrayM2));
                    if (strideMask == strideMax) {
                        if (strideA == strideMax) {
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        } else {
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        }
                    }
                }
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(5));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(3));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                break;
            }
            case 2: {
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize.expand(), this.v(0), this.v(1), AArch64Address.createStructureNoOffsetAddress(arrayMax));
                asm.fldr(64, this.v(4), AArch64Address.createBaseRegisterOnlyAddress(64, arrayMin));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        asm.fldr(64, this.v(6), AArch64Address.createBaseRegisterOnlyAddress(64, arrayMask));
                    } else {
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize.expand(), this.v(6), this.v(7), AArch64Address.createStructureNoOffsetAddress(arrayMask));
                    }
                }
                asm.add(64, arrayMin, arrayMin, len, AArch64Assembler.ShiftType.LSL, strideMin.log2);
                asm.add(64, arrayMax, arrayMax, len, AArch64Assembler.ShiftType.LSL, strideMax.log2);
                if (this.withMask()) {
                    asm.add(64, arrayMask, arrayMask, len, AArch64Assembler.ShiftType.LSL, strideMask.log2);
                }
                asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize.expand(), this.v(2), this.v(3), AArch64Address.createStructureNoOffsetAddress(arrayMax));
                asm.fldr(64, this.v(5), AArch64Address.createBaseRegisterOnlyAddress(64, arrayMin));
                asm.neon.uxtlVV(minESize, this.v(4), this.v(4));
                asm.neon.uxtlVV(minESize, this.v(5), this.v(5));
                if (this.withMask()) {
                    if (strideMask == strideMin) {
                        asm.fldr(64, this.v(7), AArch64Address.createBaseRegisterOnlyAddress(64, arrayMask));
                        asm.neon.uxtlVV(minESize, this.v(6), this.v(6));
                        asm.neon.uxtlVV(minESize, this.v(7), this.v(7));
                    } else {
                        asm.neon.ld2MultipleVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, minESize.expand(), this.v(8), this.v(9), AArch64Address.createStructureNoOffsetAddress(arrayMask));
                    }
                    int arrayA1 = strideA == strideMax ? 0 : 4;
                    int arrayA2 = strideA == strideMax ? 2 : 5;
                    int arrayM1 = 6;
                    int arrayM2 = strideMask == strideMin ? 7 : 8;
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA1), this.v(arrayA1), this.v(arrayM1));
                    asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(arrayA2), this.v(arrayA2), this.v(arrayM2));
                    if (strideMask == strideMax) {
                        if (strideA == strideMax) {
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        } else {
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(1), this.v(1), this.v(7));
                            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(3), this.v(3), this.v(9));
                        }
                    }
                }
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(4));
                asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(5));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(1));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(2), this.v(2), this.v(3));
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, this.v(0), this.v(0), this.v(2));
                break;
            }
            default: {
                throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented");
            }
        }
        AArch64ArrayEqualsOp.vectorCheckZero(asm, this.v(0), this.v(0));
        asm.jmp(end);
    }

    /**
     * Emits the tail path that compares two vector registers per array, requesting 16 bytes per
     * load from {@link #tailLoad}.
     *
     * <p>Emission order: the loads performed by {@code tailLoad}, zero-extension ({@code uxtl}) of
     * the operands whose stride is narrower than {@code strideMax}, an optional OR of the mask into
     * array A, an XOR of A with B, an OR that folds both halves into the first vector, the
     * zero-check of that vector, and finally a jump to {@code end}.
     *
     * @param asm assembler that receives the instructions
     * @param strideA stride of array A
     * @param strideB stride of array B
     * @param strideM stride of the mask array; only inspected when {@code withMask()} is true
     * @param strideMax the wider of the compared strides
     * @param strideMin the narrower of the compared strides
     * @param arrayA register holding the current address into array A
     * @param arrayB register holding the current address into array B
     * @param arrayM register holding the current address into the mask array
     * @param len register holding the length counter consumed by {@code tailLoad}
     * @param entry label bound by {@code tailLoad} at the start of this tail
     * @param nextTail label {@code tailLoad} branches to when this tail does not apply
     * @param end label jumped to once the comparison has been emitted
     * @throws GraalError if {@code strideMax.log2 - strideMin.log2} is not 0, 1 or 2
     */
    private void tail16(AArch64MacroAssembler asm, Stride strideA, Stride strideB, Stride strideM, Stride strideMax, Stride strideMin, Register arrayA, Register arrayB, Register arrayM, Register len, Label entry, Label nextTail, Label end) {
        final boolean masked = this.withMask();
        final Register a1 = this.v(0);
        final Register a2 = this.v(1);
        final Register b1 = this.v(2);
        final Register b2 = this.v(3);
        // The mask registers are only requested when a mask is present.
        final Register m1 = masked ? this.v(4) : null;
        final Register m2 = masked ? this.v(5) : null;

        this.tailLoad(asm, strideA, strideB, strideM, strideMax, arrayA, arrayB, arrayM, len, a1, a2, b1, b2, m1, m2, entry, nextTail, 16);

        final AArch64ASIMDAssembler.ElementSize narrow = AArch64ASIMDAssembler.ElementSize.fromStride(strideMin);
        // Whichever of A/B uses the narrower stride is the one that needs widening.
        final boolean aIsNarrow = strideA == strideMin;
        final Register min1 = aIsNarrow ? a1 : b1;
        final Register min2 = aIsNarrow ? a2 : b2;

        final int widenSteps = strideMax.log2 - strideMin.log2;
        if (widenSteps == 0) {
            // A and B share a stride; only a mask that is narrower than them gets widened (once).
            if (masked && strideM.value < strideMin.value) {
                final AArch64ASIMDAssembler.ElementSize maskSize = AArch64ASIMDAssembler.ElementSize.fromStride(strideM);
                asm.neon.uxtlVV(maskSize, m1, m1);
                asm.neon.uxtlVV(maskSize, m2, m2);
            }
        } else if (widenSteps == 1 || widenSteps == 2) {
            final boolean twoSteps = widenSteps == 2;
            asm.neon.uxtlVV(narrow, min1, min1);
            asm.neon.uxtlVV(narrow, min2, min2);
            if (twoSteps) {
                final AArch64ASIMDAssembler.ElementSize wider = narrow.expand();
                asm.neon.uxtlVV(wider, min1, min1);
                asm.neon.uxtlVV(wider, min2, min2);
            }
            // A mask with the narrow stride is widened the same way as the narrow array.
            if (masked && strideM == strideMin) {
                asm.neon.uxtlVV(narrow, m1, m1);
                asm.neon.uxtlVV(narrow, m2, m2);
                if (twoSteps) {
                    final AArch64ASIMDAssembler.ElementSize wider = narrow.expand();
                    asm.neon.uxtlVV(wider, m1, m1);
                    asm.neon.uxtlVV(wider, m2, m2);
                }
            }
        } else {
            throw GraalError.unimplemented("comparison of " + strideMin + " to " + strideMax + " not implemented");
        }

        if (masked) {
            // OR the mask into array A before comparing against B.
            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, a1, a1, m1);
            asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, a2, a2, m2);
        }
        // XOR yields zero where A and B agree; fold both halves into a1 for a single zero-check.
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, a1, a1, b1);
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, a2, a2, b2);
        asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, a1, a1, a2);
        AArch64ArrayEqualsOp.vectorCheckZero(asm, a1, a1);
        asm.jmp(end);
    }

    /**
     * Binds {@code entry} and emits the loads shared by the vector tail paths: one load per array
     * from the current address, then a second load per array after each address register has been
     * advanced by {@code len} elements of that array's stride.
     *
     * <p>Before loading, {@code nBytes >> strideMax.log2} is added to {@code len} with a
     * flag-setting add, and control branches to {@code nextTail} when the {@code MI} condition
     * holds. The address registers {@code arrayA}, {@code arrayB} and (with a mask)
     * {@code arrayM} are modified by this sequence.
     *
     * @param asm assembler that receives the instructions
     * @param strideA stride of array A
     * @param strideB stride of array B
     * @param strideM stride of the mask array
     * @param strideMax stride used to scale the per-array load widths
     * @param arrayA register holding the address into array A
     * @param arrayB register holding the address into array B
     * @param arrayM register holding the address into the mask array; used only with a mask
     * @param len register holding the length counter
     * @param vecArrayA1 destination of the first load from A
     * @param vecArrayA2 destination of the second load from A
     * @param vecArrayB1 destination of the first load from B
     * @param vecArrayB2 destination of the second load from B
     * @param vecArrayM1 destination of the first mask load; used only with a mask
     * @param vecArrayM2 destination of the second mask load; used only with a mask
     * @param entry label bound at the start of the emitted sequence
     * @param nextTail label branched to when the {@code MI} condition holds after the add
     * @param nBytes number of bytes loaded for an array whose stride equals {@code strideMax}
     */
    private void tailLoad(AArch64MacroAssembler asm, Stride strideA, Stride strideB, Stride strideM, Stride strideMax, Register arrayA, Register arrayB, Register arrayM, Register len, Register vecArrayA1, Register vecArrayA2, Register vecArrayB1, Register vecArrayB2, Register vecArrayM1, Register vecArrayM2, Label entry, Label nextTail, int nBytes) {
        final boolean masked = this.withMask();
        final int widthA = loadBits(strideA, strideMax, nBytes);
        final int widthB = loadBits(strideB, strideMax, nBytes);
        final int widthM = loadBits(strideM, strideMax, nBytes);
        final int elementCount = nBytes >> strideMax.log2;

        asm.bind(entry);
        asm.adds(64, len, len, elementCount);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, nextTail);

        // First load of each array, from the current addresses.
        asm.fldr(widthA, vecArrayA1, AArch64Address.createBaseRegisterOnlyAddress(widthA, arrayA));
        asm.fldr(widthB, vecArrayB1, AArch64Address.createBaseRegisterOnlyAddress(widthB, arrayB));
        if (masked) {
            asm.fldr(widthM, vecArrayM1, AArch64Address.createBaseRegisterOnlyAddress(widthM, arrayM));
        }

        // Advance every address by len elements, scaled by the respective stride.
        asm.add(64, arrayA, arrayA, len, AArch64Assembler.ShiftType.LSL, strideA.log2);
        asm.add(64, arrayB, arrayB, len, AArch64Assembler.ShiftType.LSL, strideB.log2);
        if (masked) {
            asm.add(64, arrayM, arrayM, len, AArch64Assembler.ShiftType.LSL, strideM.log2);
        }

        // Second load of each array, from the advanced addresses.
        asm.fldr(widthA, vecArrayA2, AArch64Address.createBaseRegisterOnlyAddress(widthA, arrayA));
        asm.fldr(widthB, vecArrayB2, AArch64Address.createBaseRegisterOnlyAddress(widthB, arrayB));
        if (masked) {
            asm.fldr(widthM, vecArrayM2, AArch64Address.createBaseRegisterOnlyAddress(widthM, arrayM));
        }
    }

    /**
     * Emits a tail path for {@code nBytes} (at most 8) bytes and then jumps to {@code end}.
     *
     * <p>Two shapes are emitted, selected by comparing {@code strideMax.value} with {@code nBytes}:
     * <ul>
     * <li>{@code strideMax.value < nBytes}: a vector path. {@link #tailLoad} performs two loads per
     * array; the second load's element 0 is inserted as element 1 of the first, each operand is
     * widened to {@code strideMax} via {@link #tailExtend}, the mask (if any) is OR-ed into A, A is
     * XOR-ed with B and the result is passed to {@code vectorCheckZero}.</li>
     * <li>{@code strideMax.value == nBytes}: a scalar path. {@code entry} is bound, {@code len} is
     * compared against {@code -2} with a branch to {@code end} on equality, then one element of A
     * (OR-ed with the mask element, if any) is compared with one element of B.</li>
     * </ul>
     *
     * <p>NOTE(review): when {@code strideMax.value > nBytes} neither branch runs, so {@code entry}
     * is not bound here and only the jump to {@code end} is emitted. Presumably callers never
     * request that combination; confirm against the call sites.
     *
     * @param asm assembler that receives the instructions
     * @param strideA stride of array A
     * @param strideB stride of array B
     * @param strideM stride of the mask array
     * @param strideMax stride that all operands are widened to before comparing
     * @param arrayA register holding the address into array A
     * @param arrayB register holding the address into array B
     * @param arrayM register holding the address into the mask array; used only with a mask
     * @param len register holding the length counter
     * @param tmp scratch general-purpose register, overwritten by the scalar path
     * @param ret general-purpose register, overwritten by the scalar path
     * @param entry label bound at the start of this tail
     * @param nextTail label passed to {@code tailLoad} as its skip target (vector path only)
     * @param end label jumped to after the comparison
     * @param nBytes size of this tail in bytes; asserted to be at most 8
     */
    private void tailLessThan16(AArch64MacroAssembler asm, Stride strideA, Stride strideB, Stride strideM, Stride strideMax, Register arrayA, Register arrayB, Register arrayM, Register len, Register tmp, Register ret, Label entry, Label nextTail, Label end, int nBytes) {
        final boolean masked = this.withMask();
        Register vecArrayA1 = this.v(0);
        Register vecArrayA2 = this.v(1);
        Register vecArrayB1 = this.v(2);
        Register vecArrayB2 = this.v(3);
        // The mask registers are only requested when a mask is present.
        Register vecArrayM1 = masked ? this.v(4) : null;
        Register vecArrayM2 = masked ? this.v(5) : null;
        assert (nBytes <= 8);
        int bitsA = AArch64ArrayEqualsOp.loadBits(strideA, strideMax, nBytes);
        int bitsB = AArch64ArrayEqualsOp.loadBits(strideB, strideMax, nBytes);
        int bitsM = AArch64ArrayEqualsOp.loadBits(strideM, strideMax, nBytes);
        if (strideMax.value < nBytes) {
            this.tailLoad(asm, strideA, strideB, strideM, strideMax, arrayA, arrayB, arrayM, len, vecArrayA1, vecArrayA2, vecArrayB1, vecArrayB2, vecArrayM1, vecArrayM2, entry, nextTail, nBytes);
            // Merge both loads into one register: element 0 of the second load becomes element 1.
            asm.neon.insXX(AArch64ASIMDAssembler.ElementSize.fromSize(bitsA), vecArrayA1, 1, vecArrayA2, 0);
            asm.neon.insXX(AArch64ASIMDAssembler.ElementSize.fromSize(bitsB), vecArrayB1, 1, vecArrayB2, 0);
            if (masked) {
                asm.neon.insXX(AArch64ASIMDAssembler.ElementSize.fromSize(bitsM), vecArrayM1, 1, vecArrayM2, 0);
            }
            // Bring every operand up to the element size of strideMax.
            AArch64ArrayEqualsOp.tailExtend(asm, strideA, strideMax, vecArrayA1);
            AArch64ArrayEqualsOp.tailExtend(asm, strideB, strideMax, vecArrayB1);
            if (masked) {
                AArch64ArrayEqualsOp.tailExtend(asm, strideM, strideMax, vecArrayM1);
                asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA1, vecArrayA1, vecArrayM1);
            }
            // XOR yields zero where A and B agree.
            asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA1, vecArrayA1, vecArrayB1);
            AArch64ArrayEqualsOp.vectorCheckZero(asm, vecArrayA1, vecArrayA1);
        } else if (strideMax.value == nBytes) {
            asm.bind(entry);
            // NOTE(review): -2 presumably encodes "no elements left" after the earlier
            // adjustments of len; confirm against the code that sets up len.
            asm.compare(64, len, -2);
            asm.branchConditionally(AArch64Assembler.ConditionFlag.EQ, end);
            asm.ldr(strideA.getBitCount(), tmp, AArch64Address.createBaseRegisterOnlyAddress(strideA.getBitCount(), arrayA));
            if (masked) {
                // ret serves as scratch for the mask element before it receives B's element.
                asm.ldr(strideM.getBitCount(), ret, AArch64Address.createBaseRegisterOnlyAddress(strideM.getBitCount(), arrayM));
                asm.orr(64, tmp, tmp, ret);
            }
            asm.ldr(strideB.getBitCount(), ret, AArch64Address.createBaseRegisterOnlyAddress(strideB.getBitCount(), arrayB));
            asm.cmp(64, tmp, ret);
        }
        asm.jmp(end);
    }

    /**
     * Zero-extends {@code vecArray} in place with {@code uxtl} until its element size matches
     * {@code strideMax}. Nothing is emitted when both strides are equal; one {@code uxtl} is
     * emitted for a log2 difference of 1 and two for a difference of 2.
     *
     * @param asm assembler that receives the instructions
     * @param stride stride of the elements currently held in {@code vecArray}
     * @param strideMax target stride
     * @param vecArray vector register that is both source and destination of the extension
     * @throws GraalError if {@code strideMax.log2 - stride.log2} is not 0, 1 or 2
     */
    private static void tailExtend(AArch64MacroAssembler asm, Stride stride, Stride strideMax, Register vecArray) {
        final int widenSteps = strideMax.log2 - stride.log2;
        if (widenSteps < 0 || widenSteps > 2) {
            throw GraalError.shouldNotReachHere();
        }
        if (widenSteps == 0) {
            return;
        }
        final AArch64ASIMDAssembler.ElementSize elementSize = AArch64ASIMDAssembler.ElementSize.fromStride(stride);
        asm.neon.uxtlVV(elementSize, vecArray, vecArray);
        if (widenSteps == 2) {
            // Second step widens from the already doubled element size.
            asm.neon.uxtlVV(elementSize.expand(), vecArray, vecArray);
        }
    }

    /**
     * Scales a load width of {@code nBytes} bytes down by the ratio between {@code strideMax} and
     * {@code strideA}.
     *
     * @param strideA stride of the array the load width is computed for
     * @param strideMax stride the width is scaled against
     * @param nBytes load size in bytes for an array whose stride equals {@code strideMax}
     * @return {@code nBytes * 8} shifted right by {@code strideMax.log2 - strideA.log2}
     */
    private static int loadBits(Stride strideA, Stride strideMax, int nBytes) {
        final int fullWidthBits = nBytes * 8;
        final int log2Ratio = strideMax.log2 - strideA.log2;
        return fullWidthBits >> log2Ratio;
    }
}

