crypto: p10-aes-gcm - Supporting functions for AES
authorDanny Tsen <dtsen@linux.ibm.com>
Tue, 21 Feb 2023 03:40:18 +0000 (22:40 -0500)
committerHerbert Xu <herbert@gondor.apana.org.au>
Tue, 14 Mar 2023 09:06:43 +0000 (17:06 +0800)
This code is taken from CRYPTOGAMs[1].  The following functions are used,
aes_p8_set_encrypt_key is used to generate AES round keys and aes_p8_encrypt is used
to encrypt single block.

Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/powerpc/crypto/aesp8-ppc.pl [new file with mode: 0644]

diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl
new file mode 100644 (file)
index 0000000..1f22aec
--- /dev/null
@@ -0,0 +1,585 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#       * Redistributions of source code must retain copyright notices,
+#         this list of conditions and the following disclaimer.
+#
+#       * Redistributions in binary form must reproduce the above
+#         copyright notice, this list of conditions and the following
+#         disclaimer in the documentation and/or other materials
+#         provided with the distribution.
+#
+#       * Neither the name of the CRYPTOGAMS nor the names of its
+#         copyright holder and contributors may be used to endorse or
+#         promote products derived from this software without specific
+#         prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#              CBC en-/decrypt CTR     XTS
+# POWER8[le]   3.96/0.72       0.74    1.1
+# POWER8[be]   3.75/0.65       0.66    1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+       $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
+       $STU    ="stdu";
+       $POP    ="ld";
+       $PUSH   ="std";
+       $UCMP   ="cmpld";
+       $SHL    ="sldi";
+} elsif ($flavour =~ /32/) {
+       $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
+       $STU    ="stwu";
+       $POP    ="lwz";
+       $PUSH   ="stw";
+       $UCMP   ="cmplw";
+       $SHL    ="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{    # Key setup procedures                                          #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine       "any"
+
+.text
+
+.align 7
+rcon:
+.long  0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
+.long  0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
+.long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
+.long  0,0,0,0                                         ?asis
+Lconsts:
+       mflr    r0
+       bcl     20,31,\$+4
+       mflr    $ptr     #vvvvv "distance between . and rcon
+       addi    $ptr,$ptr,-0x48
+       mtlr    r0
+       blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+Lset_encrypt_key:
+       mflr            r11
+       $PUSH           r11,$LRSAVE($sp)
+
+       li              $ptr,-1
+       ${UCMP}i        $inp,0
+       beq-            Lenc_key_abort          # if ($inp==0) return -1;
+       ${UCMP}i        $out,0
+       beq-            Lenc_key_abort          # if ($out==0) return -1;
+       li              $ptr,-2
+       cmpwi           $bits,128
+       blt-            Lenc_key_abort
+       cmpwi           $bits,256
+       bgt-            Lenc_key_abort
+       andi.           r0,$bits,0x3f
+       bne-            Lenc_key_abort
+
+       lis             r0,0xfff0
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       bl              Lconsts
+       mtlr            r11
+
+       neg             r9,$inp
+       lvx             $in0,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       lvsr            $key,0,r9               # borrow $key
+       li              r8,0x20
+       cmpwi           $bits,192
+       lvx             $in1,0,$inp
+       le?vspltisb     $mask,0x0f              # borrow $mask
+       lvx             $rcon,0,$ptr
+       le?vxor         $key,$key,$mask         # adjust for byte swap
+       lvx             $mask,r8,$ptr
+       addi            $ptr,$ptr,0x10
+       vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
+       li              $cnt,8
+       vxor            $zero,$zero,$zero
+       mtctr           $cnt
+
+       ?lvsr           $outperm,0,$out
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$zero,$outmask,$outperm
+
+       blt             Loop128
+       addi            $inp,$inp,8
+       beq             L192
+       addi            $inp,$inp,8
+       b               L256
+
+.align 4
+Loop128:
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+       bdnz            Loop128
+
+       lvx             $rcon,0,$ptr            # last two round keys
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+
+       addi            $inp,$out,15            # 15 is not typo
+       addi            $out,$out,0x50
+
+       li              $rounds,10
+       b               Ldone
+
+.align 4
+L192:
+       lvx             $tmp,0,$inp
+       li              $cnt,4
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       vspltisb        $key,8                  # borrow $key
+       mtctr           $cnt
+       vsububm         $mask,$mask,$key        # adjust the mask
+
+Loop192:
+       vperm           $key,$in1,$in1,$mask    # roate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+       vcipherlast     $key,$key,$rcon
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+
+        vsldoi         $stage,$zero,$in1,8
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vsldoi         $stage,$stage,$in0,8
+
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+        vsldoi         $stage,$in0,$in1,8
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdnz            Loop192
+
+       li              $rounds,12
+       addi            $out,$out,0x20
+       b               Ldone
+
+.align 4
+L256:
+       lvx             $tmp,0,$inp
+       li              $cnt,7
+       li              $rounds,14
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       mtctr           $cnt
+
+Loop256:
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in1,$in1,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdz             Ldone
+
+       vspltw          $key,$in0,3             # just splat
+       vsldoi          $tmp,$zero,$in1,12      # >>32
+       vsbox           $key,$key
+
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+
+       vxor            $in1,$in1,$key
+       b               Loop256
+
+.align 4
+Ldone:
+       lvx             $in1,0,$inp             # redundant in aligned case
+       vsel            $in1,$outhead,$in1,$outmask
+       stvx            $in1,0,$inp
+       li              $ptr,0
+       mtspr           256,$vrsave
+       stw             $rounds,0($out)
+
+Lenc_key_abort:
+       mr              r3,$ptr
+       blr
+       .long           0
+       .byte           0,12,0x14,1,0,0,3,0
+       .long           0
+.size  .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+       $STU            $sp,-$FRAME($sp)
+       mflr            r10
+       $PUSH           r10,$FRAME+$LRSAVE($sp)
+       bl              Lset_encrypt_key
+       mtlr            r10
+
+       cmpwi           r3,0
+       bne-            Ldec_key_abort
+
+       slwi            $cnt,$rounds,4
+       subi            $inp,$out,240           # first round key
+       srwi            $rounds,$rounds,1
+       add             $out,$inp,$cnt          # last round key
+       mtctr           $rounds
+
+Ldeckey:
+       lwz             r0, 0($inp)
+       lwz             r6, 4($inp)
+       lwz             r7, 8($inp)
+       lwz             r8, 12($inp)
+       addi            $inp,$inp,16
+       lwz             r9, 0($out)
+       lwz             r10,4($out)
+       lwz             r11,8($out)
+       lwz             r12,12($out)
+       stw             r0, 0($out)
+       stw             r6, 4($out)
+       stw             r7, 8($out)
+       stw             r8, 12($out)
+       subi            $out,$out,16
+       stw             r9, -16($inp)
+       stw             r10,-12($inp)
+       stw             r11,-8($inp)
+       stw             r12,-4($inp)
+       bdnz            Ldeckey
+
+       xor             r3,r3,r3                # return value
+Ldec_key_abort:
+       addi            $sp,$sp,$FRAME
+       blr
+       .long           0
+       .byte           0,12,4,1,0x80,0,3,0
+       .long           0
+.size  .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{    # Single block en- and decrypt procedures                       #
+sub gen_block () {
+my $dir = shift;
+my $n   = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+       lwz             $rounds,240($key)
+       lis             r0,0xfc00
+       mfspr           $vrsave,256
+       li              $idx,15                 # 15 is not typo
+       mtspr           256,r0
+
+       lvx             v0,0,$inp
+       neg             r11,$out
+       lvx             v1,$idx,$inp
+       lvsl            v2,0,$inp               # inpperm
+       le?vspltisb     v4,0x0f
+       ?lvsl           v3,0,r11                # outperm
+       le?vxor         v2,v2,v4
+       li              $idx,16
+       vperm           v0,v0,v1,v2             # align [and byte swap in LE]
+       lvx             v1,0,$key
+       ?lvsl           v5,0,$key               # keyperm
+       srwi            $rounds,$rounds,1
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       subi            $rounds,$rounds,1
+       ?vperm          v1,v1,v2,v5             # align round key
+
+       vxor            v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Loop_${dir}c:
+       ?vperm          v2,v2,v1,v5
+       v${n}cipher     v0,v0,v2
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          v1,v1,v2,v5
+       v${n}cipher     v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_${dir}c
+
+       ?vperm          v2,v2,v1,v5
+       v${n}cipher     v0,v0,v2
+       lvx             v2,$idx,$key
+       ?vperm          v1,v1,v2,v5
+       v${n}cipherlast v0,v0,v1
+
+       vspltisb        v2,-1
+       vxor            v1,v1,v1
+       li              $idx,15                 # 15 is not typo
+       ?vperm          v2,v1,v2,v3             # outmask
+       le?vxor         v3,v3,v4
+       lvx             v1,0,$out               # outhead
+       vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
+       vsel            v1,v1,v0,v2
+       lvx             v4,$idx,$out
+       stvx            v1,0,$out
+       vsel            v0,v0,v4,v2
+       stvx            v0,$idx,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,3,0
+       .long           0
+.size  .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+       # constants table endian-specific conversion
+       if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+           my $conv=$3;
+           my @bytes=();
+
+           # convert to endian-agnostic format
+           if ($1 eq "long") {
+             foreach (split(/,\s*/,$2)) {
+               my $l = /^0/?oct:int;
+               push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+             }
+           } else {
+               @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+           }
+
+           # little-endian conversion
+           if ($flavour =~ /le$/o) {
+               SWITCH: for($conv)  {
+                   /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
+                   /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
+               }
+           }
+
+           #emit
+           print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+           next;
+       }
+       $consts=0 if (m/Lconsts:/o);    # end of table
+
+       # instructions prefixed with '?' are endian-specific and need
+       # to be adjusted accordingly...
+       if ($flavour =~ /le$/o) {       # little-endian
+           s/le\?//o           or
+           s/be\?/#be#/o       or
+           s/\?lvsr/lvsl/o     or
+           s/\?lvsl/lvsr/o     or
+           s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+           s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+           s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+       } else {                        # big-endian
+           s/le\?/#le#/o       or
+           s/be\?//o           or
+           s/\?([a-z]+)/$1/o;
+       }
+
+        print $_,"\n";
+}
+
+close STDOUT;