From 569a3cb9172744b0ba3d329cb83d6e1a0e4f4d41 Mon Sep 17 00:00:00 2001
From: Tomas Mraz <tmraz@fedoraproject.org>
Date: May 06 2019 09:07:12 +0000
Subject: add S390x chacha20-poly1305 assembler support from master branch


---

diff --git a/openssl-1.1.1-s390x-update.patch b/openssl-1.1.1-s390x-update.patch
new file mode 100644
index 0000000..f82b0ec
--- /dev/null
+++ b/openssl-1.1.1-s390x-update.patch
@@ -0,0 +1,5591 @@
+diff -up openssl-1.1.1b/crypto/chacha/asm/chacha-s390x.pl.s390x-update openssl-1.1.1b/crypto/chacha/asm/chacha-s390x.pl
+--- openssl-1.1.1b/crypto/chacha/asm/chacha-s390x.pl.s390x-update	2019-02-26 15:15:30.000000000 +0100
++++ openssl-1.1.1b/crypto/chacha/asm/chacha-s390x.pl	2019-05-06 10:59:30.859784823 +0200
+@@ -1,5 +1,5 @@
+ #! /usr/bin/env perl
+-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
++# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
+ #
+ # Licensed under the OpenSSL license (the "License").  You may not use
+ # this file except in compliance with the License.  You can obtain a copy
+@@ -20,41 +20,53 @@
+ #
+ # 3 times faster than compiler-generated code.
+ 
+-$flavour = shift;
++#
++# August 2018
++#
++# Add vx code path: 4x"vertical".
++#
++# Copyright IBM Corp. 2018
++# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
++
++#
++# February 2019
++#
++# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
++# 4x"vertical" submission [on z13] and >3x faster than scalar code.
++# But to keep overheads in check, fall back to a transliteration of the
++# VSX code path from the chacha-ppc module, which is also 4x"vertical",
++# for inputs not longer than 256 bytes.
++
++use strict;
++use FindBin qw($Bin);
++use lib "$Bin/../..";
++use perlasm::s390x qw(:DEFAULT :VX :LD AUTOLOAD LABEL INCLUDE);
+ 
++my $flavour = shift;
++
++my ($z,$SIZE_T);
+ if ($flavour =~ /3[12]/) {
++	$z=0;	# S/390 ABI
+ 	$SIZE_T=4;
+-	$g="";
+ } else {
++	$z=1;	# zSeries ABI
+ 	$SIZE_T=8;
+-	$g="g";
+ }
+ 
++my $output;
+ while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+-open STDOUT,">$output";
+-
+-sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
+-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+-    $code .= "\t$opcode\t".join(',',@_)."\n";
+-}
+ 
+ my $sp="%r15";
+-
+ my $stdframe=16*$SIZE_T+4*8;
+-my $frame=$stdframe+4*20;
+-
+-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+ 
++sub ROUND {
+ my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
+ my @t=map("%r$_",(8,9));
+-
+-sub ROUND {
+ my ($a0,$b0,$c0,$d0)=@_;
+ my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+ my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+-my ($xc,$xc_)=map("\"$_\"",@t);
+-my @x=map("\"$_\"",@x);
++my ($xc,$xc_)=map("$_",@t);
+ 
+ 	# Consider order in which variables are addressed by their
+ 	# index:
+@@ -78,249 +90,967 @@ my @x=map("\"$_\"",@x);
+ 	# 'c' stores and loads in the middle, but none in the beginning
+ 	# or end.
+ 
+-	(
+-	"&alr	(@x[$a0],@x[$b0])",	# Q1
+-	 "&alr	(@x[$a1],@x[$b1])",	# Q2
+-	"&xr	(@x[$d0],@x[$a0])",
+-	 "&xr	(@x[$d1],@x[$a1])",
+-	"&rll	(@x[$d0],@x[$d0],16)",
+-	 "&rll	(@x[$d1],@x[$d1],16)",
+-
+-	"&alr	($xc,@x[$d0])",
+-	 "&alr	($xc_,@x[$d1])",
+-	"&xr	(@x[$b0],$xc)",
+-	 "&xr	(@x[$b1],$xc_)",
+-	"&rll	(@x[$b0],@x[$b0],12)",
+-	 "&rll	(@x[$b1],@x[$b1],12)",
+-
+-	"&alr	(@x[$a0],@x[$b0])",
+-	 "&alr	(@x[$a1],@x[$b1])",
+-	"&xr	(@x[$d0],@x[$a0])",
+-	 "&xr	(@x[$d1],@x[$a1])",
+-	"&rll	(@x[$d0],@x[$d0],8)",
+-	 "&rll	(@x[$d1],@x[$d1],8)",
+-
+-	"&alr	($xc,@x[$d0])",
+-	 "&alr	($xc_,@x[$d1])",
+-	"&xr	(@x[$b0],$xc)",
+-	 "&xr	(@x[$b1],$xc_)",
+-	"&rll	(@x[$b0],@x[$b0],7)",
+-	 "&rll	(@x[$b1],@x[$b1],7)",
+-
+-	"&stm	($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')",	# reload pair of 'c's
+-	"&lm	($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
+-
+-	"&alr	(@x[$a2],@x[$b2])",	# Q3
+-	 "&alr	(@x[$a3],@x[$b3])",	# Q4
+-	"&xr	(@x[$d2],@x[$a2])",
+-	 "&xr	(@x[$d3],@x[$a3])",
+-	"&rll	(@x[$d2],@x[$d2],16)",
+-	 "&rll	(@x[$d3],@x[$d3],16)",
+-
+-	"&alr	($xc,@x[$d2])",
+-	 "&alr	($xc_,@x[$d3])",
+-	"&xr	(@x[$b2],$xc)",
+-	 "&xr	(@x[$b3],$xc_)",
+-	"&rll	(@x[$b2],@x[$b2],12)",
+-	 "&rll	(@x[$b3],@x[$b3],12)",
+-
+-	"&alr	(@x[$a2],@x[$b2])",
+-	 "&alr	(@x[$a3],@x[$b3])",
+-	"&xr	(@x[$d2],@x[$a2])",
+-	 "&xr	(@x[$d3],@x[$a3])",
+-	"&rll	(@x[$d2],@x[$d2],8)",
+-	 "&rll	(@x[$d3],@x[$d3],8)",
+-
+-	"&alr	($xc,@x[$d2])",
+-	 "&alr	($xc_,@x[$d3])",
+-	"&xr	(@x[$b2],$xc)",
+-	 "&xr	(@x[$b3],$xc_)",
+-	"&rll	(@x[$b2],@x[$b2],7)",
+-	 "&rll	(@x[$b3],@x[$b3],7)"
+-	);
+-}
+-
+-$code.=<<___;
+-.text
+-
+-.globl	ChaCha20_ctr32
+-.type	ChaCha20_ctr32,\@function
+-.align	32
+-ChaCha20_ctr32:
+-	lt${g}r	$len,$len			# $len==0?
+-	bzr	%r14
+-	a${g}hi	$len,-64
+-	l${g}hi	%r1,-$frame
+-	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
+-	sl${g}r	$out,$inp			# difference
+-	la	$len,0($inp,$len)		# end of input minus 64
+-	larl	%r7,.Lsigma
+-	lgr	%r0,$sp
+-	la	$sp,0(%r1,$sp)
+-	st${g}	%r0,0($sp)
+-
+-	lmg	%r8,%r11,0($key)		# load key
+-	lmg	%r12,%r13,0($counter)		# load counter
+-	lmg	%r6,%r7,0(%r7)			# load sigma constant
+-
+-	la	%r14,0($inp)
+-	st${g}	$out,$frame+3*$SIZE_T($sp)
+-	st${g}	$len,$frame+4*$SIZE_T($sp)
+-	stmg	%r6,%r13,$stdframe($sp)		# copy key schedule to stack
+-	srlg	@x[12],%r12,32			# 32-bit counter value
+-	j	.Loop_outer
+-
+-.align	16
+-.Loop_outer:
+-	lm	@x[0],@x[7],$stdframe+4*0($sp)		# load x[0]-x[7]
+-	lm	@t[0],@t[1],$stdframe+4*10($sp)		# load x[10]-x[11]
+-	lm	@x[13],@x[15],$stdframe+4*13($sp)	# load x[13]-x[15]
+-	stm	@t[0],@t[1],$stdframe+4*8+4*10($sp)	# offload x[10]-x[11]
+-	lm	@t[0],@t[1],$stdframe+4*8($sp)		# load x[8]-x[9]
+-	st	@x[12],$stdframe+4*12($sp)		# save counter
+-	st${g}	%r14,$frame+2*$SIZE_T($sp)		# save input pointer
+-	lhi	%r14,10
+-	j	.Loop
+-
+-.align	4
+-.Loop:
+-___
+-	foreach (&ROUND(0, 4, 8,12)) { eval; }
+-	foreach (&ROUND(0, 5,10,15)) { eval; }
+-$code.=<<___;
+-	brct	%r14,.Loop
+-
+-	l${g}	%r14,$frame+2*$SIZE_T($sp)		# pull input pointer
+-	stm	@t[0],@t[1],$stdframe+4*8+4*8($sp)	# offload x[8]-x[9]
+-	lm${g}	@t[0],@t[1],$frame+3*$SIZE_T($sp)
+-
+-	al	@x[0],$stdframe+4*0($sp)	# accumulate key schedule
+-	al	@x[1],$stdframe+4*1($sp)
+-	al	@x[2],$stdframe+4*2($sp)
+-	al	@x[3],$stdframe+4*3($sp)
+-	al	@x[4],$stdframe+4*4($sp)
+-	al	@x[5],$stdframe+4*5($sp)
+-	al	@x[6],$stdframe+4*6($sp)
+-	al	@x[7],$stdframe+4*7($sp)
+-	lrvr	@x[0],@x[0]
+-	lrvr	@x[1],@x[1]
+-	lrvr	@x[2],@x[2]
+-	lrvr	@x[3],@x[3]
+-	lrvr	@x[4],@x[4]
+-	lrvr	@x[5],@x[5]
+-	lrvr	@x[6],@x[6]
+-	lrvr	@x[7],@x[7]
+-	al	@x[12],$stdframe+4*12($sp)
+-	al	@x[13],$stdframe+4*13($sp)
+-	al	@x[14],$stdframe+4*14($sp)
+-	al	@x[15],$stdframe+4*15($sp)
+-	lrvr	@x[12],@x[12]
+-	lrvr	@x[13],@x[13]
+-	lrvr	@x[14],@x[14]
+-	lrvr	@x[15],@x[15]
+-
+-	la	@t[0],0(@t[0],%r14)		# reconstruct output pointer
+-	cl${g}r	%r14,@t[1]
+-	jh	.Ltail
+-
+-	x	@x[0],4*0(%r14)			# xor with input
+-	x	@x[1],4*1(%r14)
+-	st	@x[0],4*0(@t[0])		# store output
+-	x	@x[2],4*2(%r14)
+-	st	@x[1],4*1(@t[0])
+-	x	@x[3],4*3(%r14)
+-	st	@x[2],4*2(@t[0])
+-	x	@x[4],4*4(%r14)
+-	st	@x[3],4*3(@t[0])
+-	 lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)	# load x[8]-x[11]
+-	x	@x[5],4*5(%r14)
+-	st	@x[4],4*4(@t[0])
+-	x	@x[6],4*6(%r14)
+-	 al	@x[0],$stdframe+4*8($sp)
+-	st	@x[5],4*5(@t[0])
+-	x	@x[7],4*7(%r14)
+-	 al	@x[1],$stdframe+4*9($sp)
+-	st	@x[6],4*6(@t[0])
+-	x	@x[12],4*12(%r14)
+-	 al	@x[2],$stdframe+4*10($sp)
+-	st	@x[7],4*7(@t[0])
+-	x	@x[13],4*13(%r14)
+-	 al	@x[3],$stdframe+4*11($sp)
+-	st	@x[12],4*12(@t[0])
+-	x	@x[14],4*14(%r14)
+-	st	@x[13],4*13(@t[0])
+-	x	@x[15],4*15(%r14)
+-	st	@x[14],4*14(@t[0])
+-	 lrvr	@x[0],@x[0]
+-	st	@x[15],4*15(@t[0])
+-	 lrvr	@x[1],@x[1]
+-	 lrvr	@x[2],@x[2]
+-	 lrvr	@x[3],@x[3]
+-	lhi	@x[12],1
+-	 x	@x[0],4*8(%r14)
+-	al	@x[12],$stdframe+4*12($sp)	# increment counter
+-	 x	@x[1],4*9(%r14)
+-	 st	@x[0],4*8(@t[0])
+-	 x	@x[2],4*10(%r14)
+-	 st	@x[1],4*9(@t[0])
+-	 x	@x[3],4*11(%r14)
+-	 st	@x[2],4*10(@t[0])
+-	 st	@x[3],4*11(@t[0])
+-
+-	cl${g}r	%r14,@t[1]			# done yet?
+-	la	%r14,64(%r14)
+-	jl	.Loop_outer
+-
+-.Ldone:
+-	xgr	%r0,%r0
+-	xgr	%r1,%r1
+-	xgr	%r2,%r2
+-	xgr	%r3,%r3
+-	stmg	%r0,%r3,$stdframe+4*4($sp)	# wipe key copy
+-	stmg	%r0,%r3,$stdframe+4*12($sp)
+-
+-	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
+-	br	%r14
+-
+-.align	16
+-.Ltail:
+-	la	@t[1],64($t[1])
+-	stm	@x[0],@x[7],$stdframe+4*0($sp)
+-	sl${g}r	@t[1],%r14
+-	lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)
+-	l${g}hi	@x[6],0
+-	stm	@x[12],@x[15],$stdframe+4*12($sp)
+-	al	@x[0],$stdframe+4*8($sp)
+-	al	@x[1],$stdframe+4*9($sp)
+-	al	@x[2],$stdframe+4*10($sp)
+-	al	@x[3],$stdframe+4*11($sp)
+-	lrvr	@x[0],@x[0]
+-	lrvr	@x[1],@x[1]
+-	lrvr	@x[2],@x[2]
+-	lrvr	@x[3],@x[3]
+-	stm	@x[0],@x[3],$stdframe+4*8($sp)
+-
+-.Loop_tail:
+-	llgc	@x[4],0(@x[6],%r14)
+-	llgc	@x[5],$stdframe(@x[6],$sp)
+-	xr	@x[5],@x[4]
+-	stc	@x[5],0(@x[6],@t[0])
+-	la	@x[6],1(@x[6])
+-	brct	@t[1],.Loop_tail
+-
+-	j	.Ldone
+-.size	ChaCha20_ctr32,.-ChaCha20_ctr32
+-
+-.align	32
+-.Lsigma:
+-.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
+-.asciz	"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+-.align	4
+-___
++	alr	(@x[$a0],@x[$b0]);	# Q1
++	 alr	(@x[$a1],@x[$b1]);	# Q2
++	xr	(@x[$d0],@x[$a0]);
++	 xr	(@x[$d1],@x[$a1]);
++	rll	(@x[$d0],@x[$d0],16);
++	 rll	(@x[$d1],@x[$d1],16);
++
++	alr	($xc,@x[$d0]);
++	 alr	($xc_,@x[$d1]);
++	xr	(@x[$b0],$xc);
++	 xr	(@x[$b1],$xc_);
++	rll	(@x[$b0],@x[$b0],12);
++	 rll	(@x[$b1],@x[$b1],12);
++
++	alr	(@x[$a0],@x[$b0]);
++	 alr	(@x[$a1],@x[$b1]);
++	xr	(@x[$d0],@x[$a0]);
++	 xr	(@x[$d1],@x[$a1]);
++	rll	(@x[$d0],@x[$d0],8);
++	 rll	(@x[$d1],@x[$d1],8);
++
++	alr	($xc,@x[$d0]);
++	 alr	($xc_,@x[$d1]);
++	xr	(@x[$b0],$xc);
++	 xr	(@x[$b1],$xc_);
++	rll	(@x[$b0],@x[$b0],7);
++	 rll	(@x[$b1],@x[$b1],7);
++
++	stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");	# reload pair of 'c's
++	lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
++
++	alr	(@x[$a2],@x[$b2]);	# Q3
++	 alr	(@x[$a3],@x[$b3]);	# Q4
++	xr	(@x[$d2],@x[$a2]);
++	 xr	(@x[$d3],@x[$a3]);
++	rll	(@x[$d2],@x[$d2],16);
++	 rll	(@x[$d3],@x[$d3],16);
++
++	alr	($xc,@x[$d2]);
++	 alr	($xc_,@x[$d3]);
++	xr	(@x[$b2],$xc);
++	 xr	(@x[$b3],$xc_);
++	rll	(@x[$b2],@x[$b2],12);
++	 rll	(@x[$b3],@x[$b3],12);
++
++	alr	(@x[$a2],@x[$b2]);
++	 alr	(@x[$a3],@x[$b3]);
++	xr	(@x[$d2],@x[$a2]);
++	 xr	(@x[$d3],@x[$a3]);
++	rll	(@x[$d2],@x[$d2],8);
++	 rll	(@x[$d3],@x[$d3],8);
++
++	alr	($xc,@x[$d2]);
++	 alr	($xc_,@x[$d3]);
++	xr	(@x[$b2],$xc);
++	 xr	(@x[$b3],$xc_);
++	rll	(@x[$b2],@x[$b2],7);
++	 rll	(@x[$b3],@x[$b3],7);
++}
++
++sub VX_lane_ROUND {	# emit one 4x"vertical" round; args: state-word indices (a,b,c,d) of Q1
++my ($a0,$b0,$c0,$d0)=@_;
++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));	# Q2: next index within each group of four, wrapping
++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));	# Q3
++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));	# Q4
++my @x=map("%v$_",(0..15));	# state word i is kept in vector register %v<i>
+ 
+-foreach (split("\n",$code)) {
+-	s/\`([^\`]*)\`/eval $1/ge;
++	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# Q1
++	vx	(@x[$d0],@x[$d0],@x[$a0]);
++	verllf	(@x[$d0],@x[$d0],16);
++	vaf	(@x[$a1],@x[$a1],@x[$b1]);	# Q2
++	vx	(@x[$d1],@x[$d1],@x[$a1]);
++	verllf	(@x[$d1],@x[$d1],16);
++	vaf	(@x[$a2],@x[$a2],@x[$b2]);	# Q3
++	vx	(@x[$d2],@x[$d2],@x[$a2]);
++	verllf	(@x[$d2],@x[$d2],16);
++	vaf	(@x[$a3],@x[$a3],@x[$b3]);	# Q4
++	vx	(@x[$d3],@x[$d3],@x[$a3]);
++	verllf	(@x[$d3],@x[$d3],16);
++
++	vaf	(@x[$c0],@x[$c0],@x[$d0]);	# second step for Q1..Q4: c += d
++	vx	(@x[$b0],@x[$b0],@x[$c0]);	# b ^= c
++	verllf	(@x[$b0],@x[$b0],12);	# b <<<= 12
++	vaf	(@x[$c1],@x[$c1],@x[$d1]);
++	vx	(@x[$b1],@x[$b1],@x[$c1]);
++	verllf	(@x[$b1],@x[$b1],12);
++	vaf	(@x[$c2],@x[$c2],@x[$d2]);
++	vx	(@x[$b2],@x[$b2],@x[$c2]);
++	verllf	(@x[$b2],@x[$b2],12);
++	vaf	(@x[$c3],@x[$c3],@x[$d3]);
++	vx	(@x[$b3],@x[$b3],@x[$c3]);
++	verllf	(@x[$b3],@x[$b3],12);
++
++	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# third step for Q1..Q4: a += b
++	vx	(@x[$d0],@x[$d0],@x[$a0]);	# d ^= a
++	verllf	(@x[$d0],@x[$d0],8);	# d <<<= 8
++	vaf	(@x[$a1],@x[$a1],@x[$b1]);
++	vx	(@x[$d1],@x[$d1],@x[$a1]);
++	verllf	(@x[$d1],@x[$d1],8);
++	vaf	(@x[$a2],@x[$a2],@x[$b2]);
++	vx	(@x[$d2],@x[$d2],@x[$a2]);
++	verllf	(@x[$d2],@x[$d2],8);
++	vaf	(@x[$a3],@x[$a3],@x[$b3]);
++	vx	(@x[$d3],@x[$d3],@x[$a3]);
++	verllf	(@x[$d3],@x[$d3],8);
++
++	vaf	(@x[$c0],@x[$c0],@x[$d0]);	# fourth step for Q1..Q4: c += d
++	vx	(@x[$b0],@x[$b0],@x[$c0]);	# b ^= c
++	verllf	(@x[$b0],@x[$b0],7);	# b <<<= 7
++	vaf	(@x[$c1],@x[$c1],@x[$d1]);
++	vx	(@x[$b1],@x[$b1],@x[$c1]);
++	verllf	(@x[$b1],@x[$b1],7);
++	vaf	(@x[$c2],@x[$c2],@x[$d2]);
++	vx	(@x[$b2],@x[$b2],@x[$c2]);
++	verllf	(@x[$b2],@x[$b2],7);
++	vaf	(@x[$c3],@x[$c3],@x[$d3]);
++	vx	(@x[$b3],@x[$b3],@x[$c3]);
++	verllf	(@x[$b3],@x[$b3],7);
++}
+ 
+-	print $_,"\n";
++sub VX_ROUND {	# emit one 6x"horizontal" round; args: 6 a-, 6 b-, 6 c-, 6 d-registers, then an odd-round flag
++my @a=@_[0..5];
++my @b=@_[6..11];
++my @c=@_[12..17];
++my @d=@_[18..23];
++my $odd=@_[24];	# NOTE(review): one-element slice; $_[24] is the conventional spelling
++
++	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);	# a += b, same step emitted for all six register sets
++	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);	# d ^= a
++	verllf		(@d[$_],@d[$_],16) for (0..5);	# d <<<= 16
++
++	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);	# c += d
++	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);	# b ^= c
++	verllf		(@b[$_],@b[$_],12) for (0..5);	# b <<<= 12
++
++	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);	# a += b
++	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);	# d ^= a
++	verllf		(@d[$_],@d[$_],8) for (0..5);	# d <<<= 8
++
++	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);	# c += d
++	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);	# b ^= c
++	verllf		(@b[$_],@b[$_],7) for (0..5);	# b <<<= 7
++
++	vsldb		(@c[$_],@c[$_],@c[$_],8) for (0..5);	# rotate c by 8 bytes (same register as both sources)
++	vsldb		(@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);	# rotate b by 4 bytes, or 12 when $odd
++	vsldb		(@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);	# rotate d by 12 bytes, or 4 when $odd; an even+odd pair sums to 16
+ }
+-close STDOUT;
++
++PERLASM_BEGIN($output);
++
++INCLUDE	("s390x_arch.h");
++TEXT	();
++
++################
++# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
++#                     const unsigned int key[8], const unsigned int counter[4])
++my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
++{	# lexical scope for the scalar code path's $frame, @x and @t
++my $frame=$stdframe+4*20;	# standard frame plus 80 bytes of locals
++my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));	# x[8]-x[11] get the placeholder "%rx": they are handled via @t and the stack
++my @t=map("%r$_",(8,9));	# %r8,%r9: temporaries
++
++GLOBL	("ChaCha20_ctr32");
++TYPE	("ChaCha20_ctr32","\@function");
++ALIGN	(32);
++LABEL	("ChaCha20_ctr32");
++	larl	("%r1","OPENSSL_s390xcap_P");	# address of OPENSSL_s390xcap_P
++
++	lghi	("%r0",64);	# threshold compared against len below
++&{$z?	\&ltgr:\&ltr}	($len,$len);		# len==0?
++	bzr	("%r14");
++	lg	("%r1","S390X_STFLE+16(%r1)");	# doubleword tested for the vx bit below
++&{$z?	\&clgr:\&clr}	($len,"%r0");
++	jle	(".Lshort");	# len<=64: skip the vector dispatch
++
++	tmhh	("%r1",0x4000);			# check for vx bit
++	jnz	(".LChaCha20_ctr32_vx");	# bit set: use the vector code path
++
++LABEL	(".Lshort");
++&{$z?	\&aghi:\&ahi}	($len,-64);
++&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
++&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
++&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
++	la	($len,"0($inp,$len)");	# end of input minus 64
++	larl	("%r7",".Lsigma");
++	lgr	("%r0",$sp);	# keep the caller's stack pointer for the store at 0($sp) below
++	la	($sp,"0(%r1,$sp)");
++&{$z?	\&stg:\&st}	("%r0","0($sp)");
++
++	lmg	("%r8","%r11","0($key)");	# load key
++	lmg	("%r12","%r13","0($counter)");	# load counter
++	lmg	("%r6","%r7","0(%r7)");	# load sigma constant
++
++	la	("%r14","0($inp)");
++&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
++&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
++	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
++	srlg	(@x[12],"%r12",32);	# 32-bit counter value
++	j	(".Loop_outer");
++
++ALIGN	(16);
++LABEL	(".Loop_outer");
++	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
++	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
++	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
++	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
++	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
++	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
++&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
++	lhi	("%r14",10);	# 10 iterations of .Loop, two ROUND calls each
++	j	(".Loop");
++
++ALIGN	(4);
++LABEL	(".Loop");
++	ROUND	(0, 4, 8,12);
++	ROUND	(0, 5,10,15);
++	brct	("%r14",".Loop");
++
++&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
++	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
++&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");	# reload the out-inp difference and "end of input minus 64" saved in the prologue
++
++	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
++	al	(@x[1],"$stdframe+4*1($sp)");
++	al	(@x[2],"$stdframe+4*2($sp)");
++	al	(@x[3],"$stdframe+4*3($sp)");
++	al	(@x[4],"$stdframe+4*4($sp)");
++	al	(@x[5],"$stdframe+4*5($sp)");
++	al	(@x[6],"$stdframe+4*6($sp)");
++	al	(@x[7],"$stdframe+4*7($sp)");
++	lrvr	(@x[0],@x[0]);
++	lrvr	(@x[1],@x[1]);
++	lrvr	(@x[2],@x[2]);
++	lrvr	(@x[3],@x[3]);
++	lrvr	(@x[4],@x[4]);
++	lrvr	(@x[5],@x[5]);
++	lrvr	(@x[6],@x[6]);
++	lrvr	(@x[7],@x[7]);
++	al	(@x[12],"$stdframe+4*12($sp)");
++	al	(@x[13],"$stdframe+4*13($sp)");
++	al	(@x[14],"$stdframe+4*14($sp)");
++	al	(@x[15],"$stdframe+4*15($sp)");
++	lrvr	(@x[12],@x[12]);
++	lrvr	(@x[13],@x[13]);
++	lrvr	(@x[14],@x[14]);
++	lrvr	(@x[15],@x[15]);
++
++	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
++&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
++	jh	(".Ltail");	# input pointer is past "end minus 64": fewer than 64 bytes remain
++
++	x	(@x[0],"4*0(%r14)");	# xor with input
++	x	(@x[1],"4*1(%r14)");
++	st	(@x[0],"4*0(@t[0])");	# store output
++	x	(@x[2],"4*2(%r14)");
++	st	(@x[1],"4*1(@t[0])");
++	x	(@x[3],"4*3(%r14)");
++	st	(@x[2],"4*2(@t[0])");
++	x	(@x[4],"4*4(%r14)");
++	st	(@x[3],"4*3(@t[0])");
++	 lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
++	x	(@x[5],"4*5(%r14)");
++	st	(@x[4],"4*4(@t[0])");
++	x	(@x[6],"4*6(%r14)");
++	 al	(@x[0],"$stdframe+4*8($sp)");
++	st	(@x[5],"4*5(@t[0])");
++	x	(@x[7],"4*7(%r14)");
++	 al	(@x[1],"$stdframe+4*9($sp)");
++	st	(@x[6],"4*6(@t[0])");
++	x	(@x[12],"4*12(%r14)");
++	 al	(@x[2],"$stdframe+4*10($sp)");
++	st	(@x[7],"4*7(@t[0])");
++	x	(@x[13],"4*13(%r14)");
++	 al	(@x[3],"$stdframe+4*11($sp)");
++	st	(@x[12],"4*12(@t[0])");
++	x	(@x[14],"4*14(%r14)");
++	st	(@x[13],"4*13(@t[0])");
++	x	(@x[15],"4*15(%r14)");
++	st	(@x[14],"4*14(@t[0])");
++	 lrvr	(@x[0],@x[0]);
++	st	(@x[15],"4*15(@t[0])");
++	 lrvr	(@x[1],@x[1]);
++	 lrvr	(@x[2],@x[2]);
++	 lrvr	(@x[3],@x[3]);
++	lhi	(@x[12],1);
++	 x	(@x[0],"4*8(%r14)");
++	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
++	 x	(@x[1],"4*9(%r14)");
++	 st	(@x[0],"4*8(@t[0])");
++	 x	(@x[2],"4*10(%r14)");
++	 st	(@x[1],"4*9(@t[0])");
++	 x	(@x[3],"4*11(%r14)");
++	 st	(@x[2],"4*10(@t[0])");
++	 st	(@x[3],"4*11(@t[0])");
++
++&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
++	la	("%r14","64(%r14)");
++	jl	(".Loop_outer");
++
++LABEL	(".Ldone");
++	xgr	("%r0","%r0");
++	xgr	("%r1","%r1");
++	xgr	("%r2","%r2");
++	xgr	("%r3","%r3");
++	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
++	stmg	("%r0","%r3","$stdframe+4*12($sp)");
++
++&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
++	br	("%r14");
++
++ALIGN	(16);
++LABEL	(".Ltail");
++	la	(@t[1],"64($t[1])");	# end of input
++	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
++&{$z?	\&slgr:\&slr}	(@t[1],"%r14");	# number of bytes left
++	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
++&{$z?	\&lghi:\&lhi}	(@x[6],0);	# byte index for .Loop_tail
++	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
++	al	(@x[0],"$stdframe+4*8($sp)");
++	al	(@x[1],"$stdframe+4*9($sp)");
++	al	(@x[2],"$stdframe+4*10($sp)");
++	al	(@x[3],"$stdframe+4*11($sp)");
++	lrvr	(@x[0],@x[0]);
++	lrvr	(@x[1],@x[1]);
++	lrvr	(@x[2],@x[2]);
++	lrvr	(@x[3],@x[3]);
++	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");
++
++LABEL	(".Loop_tail");
++	llgc	(@x[4],"0(@x[6],%r14)");	# input byte
++	llgc	(@x[5],"$stdframe(@x[6],$sp)");	# matching byte of the block stored on the stack above
++	xr	(@x[5],@x[4]);
++	stc	(@x[5],"0(@x[6],@t[0])");
++	la	(@x[6],"1(@x[6])");
++	brct	(@t[1],".Loop_tail");
++
++	j	(".Ldone");
++SIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
++}
++
++########################################################################
++# 4x"vertical" layout minimizes the number of instructions, but the
++# pipeline runs underutilized [because of vector instructions' high
++# latency]. On the other hand, the minimum amount of data needed to fully
++# utilize the pipeline is higher, so short inputs would effectively be
++# processed slower. Hence this code path targets lengths <=256 bytes.
++#
++{
++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
++    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
++my @K=map("%v$_",(16..19));
++my $CTR="%v26";
++my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
++my $beperm="%v31";
++my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));	# NOTE(review): not referenced anywhere else in this block
++my $FRAME=$stdframe+4*16;	# standard frame plus 64 bytes: FPR save area on zSeries, scratch block in .Ltail_4x
++
++ALIGN	(32);
++LABEL	("ChaCha20_ctr32_4x");
++LABEL	(".LChaCha20_ctr32_4x");	# branched to from ChaCha20_ctr32_vx when len<=256
++&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
++if (!$z) {
++	std	("%f4","16*$SIZE_T+2*8($sp)");
++	std	("%f6","16*$SIZE_T+3*8($sp)");
++}
++&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
++	lgr	("%r0",$sp);
++	la	($sp,"0(%r1,$sp)");
++&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
++if ($z) {
++	std	("%f8","$stdframe+8*0($sp)");
++	std	("%f9","$stdframe+8*1($sp)");
++	std	("%f10","$stdframe+8*2($sp)");
++	std	("%f11","$stdframe+8*3($sp)");
++	std	("%f12","$stdframe+8*4($sp)");
++	std	("%f13","$stdframe+8*5($sp)");
++	std	("%f14","$stdframe+8*6($sp)");
++	std	("%f15","$stdframe+8*7($sp)");
++}
++	larl	("%r7",".Lsigma");
++	lhi	("%r0",10);	# 10 iterations of .Loop_4x, two rounds each
++	lhi	("%r1",0);	# zero, inserted into @K[3] word 0 below
++
++	vl	(@K[0],"0(%r7)");		# load sigma
++	vl	(@K[1],"0($key)");		# load key
++	vl	(@K[2],"16($key)");
++	vl	(@K[3],"0($counter)");		# load counter
++
++	vl	($beperm,"0x40(%r7)");
++	vl	($xt1,"0x50(%r7)");
++	vrepf	($CTR,@K[3],0);
++	vlvgf	(@K[3],"%r1",0);		# clear @K[3].word[0]
++	vaf	($CTR,$CTR,$xt1);
++
++#LABEL	(".Loop_outer_4x");
++	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma
++
++	vrepf	($xb0,@K[1],0);			# smash the key
++	vrepf	($xb1,@K[1],1);
++	vrepf	($xb2,@K[1],2);
++	vrepf	($xb3,@K[1],3);
++
++	vrepf	($xc0,@K[2],0);
++	vrepf	($xc1,@K[2],1);
++	vrepf	($xc2,@K[2],2);
++	vrepf	($xc3,@K[2],3);
++
++	vlr	($xd0,$CTR);
++	vrepf	($xd1,@K[3],1);
++	vrepf	($xd2,@K[3],2);
++	vrepf	($xd3,@K[3],3);
++
++LABEL	(".Loop_4x");
++	VX_lane_ROUND(0, 4, 8,12);
++	VX_lane_ROUND(0, 5,10,15);
++	brct	("%r0",".Loop_4x");
++
++	vaf	($xd0,$xd0,$CTR);	# add $CTR here; @K[3] word 0 was cleared above, so the later +@K[3] adds 0 to this word
++
++	vmrhf	($xt0,$xa0,$xa1);		# transpose data
++	vmrhf	($xt1,$xa2,$xa3);
++	vmrlf	($xt2,$xa0,$xa1);
++	vmrlf	($xt3,$xa2,$xa3);
++	vpdi	($xa0,$xt0,$xt1,0b0000);
++	vpdi	($xa1,$xt0,$xt1,0b0101);
++	vpdi	($xa2,$xt2,$xt3,0b0000);
++	vpdi	($xa3,$xt2,$xt3,0b0101);
++
++	vmrhf	($xt0,$xb0,$xb1);
++	vmrhf	($xt1,$xb2,$xb3);
++	vmrlf	($xt2,$xb0,$xb1);
++	vmrlf	($xt3,$xb2,$xb3);
++	vpdi	($xb0,$xt0,$xt1,0b0000);
++	vpdi	($xb1,$xt0,$xt1,0b0101);
++	vpdi	($xb2,$xt2,$xt3,0b0000);
++	vpdi	($xb3,$xt2,$xt3,0b0101);
++
++	vmrhf	($xt0,$xc0,$xc1);
++	vmrhf	($xt1,$xc2,$xc3);
++	vmrlf	($xt2,$xc0,$xc1);
++	vmrlf	($xt3,$xc2,$xc3);
++	vpdi	($xc0,$xt0,$xt1,0b0000);
++	vpdi	($xc1,$xt0,$xt1,0b0101);
++	vpdi	($xc2,$xt2,$xt3,0b0000);
++	vpdi	($xc3,$xt2,$xt3,0b0101);
++
++	vmrhf	($xt0,$xd0,$xd1);
++	vmrhf	($xt1,$xd2,$xd3);
++	vmrlf	($xt2,$xd0,$xd1);
++	vmrlf	($xt3,$xd2,$xd3);
++	vpdi	($xd0,$xt0,$xt1,0b0000);
++	vpdi	($xd1,$xt0,$xt1,0b0101);
++	vpdi	($xd2,$xt2,$xt3,0b0000);
++	vpdi	($xd3,$xt2,$xt3,0b0101);
++
++	#vrepif	($xt0,4);
++	#vaf	($CTR,$CTR,$xt0);		# next counter value
++
++	vaf	($xa0,$xa0,@K[0]);	# 1st block: add @K[0..3]
++	vaf	($xb0,$xb0,@K[1]);
++	vaf	($xc0,$xc0,@K[2]);
++	vaf	($xd0,$xd0,@K[3]);
++
++	vperm	($xa0,$xa0,$xa0,$beperm);
++	vperm	($xb0,$xb0,$xb0,$beperm);
++	vperm	($xc0,$xc0,$xc0,$beperm);
++	vperm	($xd0,$xd0,$xd0,$beperm);
++
++	#&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	#jl	(".Ltail_4x");
++
++	vlm	($xt0,$xt3,"0($inp)");
++
++	vx	($xt0,$xt0,$xa0);
++	vx	($xt1,$xt1,$xb0);
++	vx	($xt2,$xt2,$xc0);
++	vx	($xt3,$xt3,$xd0);
++
++	vstm	($xt0,$xt3,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	#je	(".Ldone_4x");
++
++	vaf	($xa0,$xa1,@K[0]);	# 2nd block, staged in $xa0..$xd0
++	vaf	($xb0,$xb1,@K[1]);
++	vaf	($xc0,$xc1,@K[2]);
++	vaf	($xd0,$xd1,@K[3]);
++
++	vperm	($xa0,$xa0,$xa0,$beperm);
++	vperm	($xb0,$xb0,$xb0,$beperm);
++	vperm	($xc0,$xc0,$xc0,$beperm);
++	vperm	($xd0,$xd0,$xd0,$beperm);
++
++&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	jl	(".Ltail_4x");	# fewer than 64 bytes left
++
++	vlm	($xt0,$xt3,"0($inp)");
++
++	vx	($xt0,$xt0,$xa0);
++	vx	($xt1,$xt1,$xb0);
++	vx	($xt2,$xt2,$xc0);
++	vx	($xt3,$xt3,$xd0);
++
++	vstm	($xt0,$xt3,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_4x");
++
++	vaf	($xa0,$xa2,@K[0]);	# 3rd block
++	vaf	($xb0,$xb2,@K[1]);
++	vaf	($xc0,$xc2,@K[2]);
++	vaf	($xd0,$xd2,@K[3]);
++
++	vperm	($xa0,$xa0,$xa0,$beperm);
++	vperm	($xb0,$xb0,$xb0,$beperm);
++	vperm	($xc0,$xc0,$xc0,$beperm);
++	vperm	($xd0,$xd0,$xd0,$beperm);
++
++&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	jl	(".Ltail_4x");
++
++	vlm	($xt0,$xt3,"0($inp)");
++
++	vx	($xt0,$xt0,$xa0);
++	vx	($xt1,$xt1,$xb0);
++	vx	($xt2,$xt2,$xc0);
++	vx	($xt3,$xt3,$xd0);
++
++	vstm	($xt0,$xt3,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_4x");
++
++	vaf	($xa0,$xa3,@K[0]);	# 4th block
++	vaf	($xb0,$xb3,@K[1]);
++	vaf	($xc0,$xc3,@K[2]);
++	vaf	($xd0,$xd3,@K[3]);
++
++	vperm	($xa0,$xa0,$xa0,$beperm);
++	vperm	($xb0,$xb0,$xb0,$beperm);
++	vperm	($xc0,$xc0,$xc0,$beperm);
++	vperm	($xd0,$xd0,$xd0,$beperm);
++
++&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	jl	(".Ltail_4x");
++
++	vlm	($xt0,$xt3,"0($inp)");
++
++	vx	($xt0,$xt0,$xa0);
++	vx	($xt1,$xt1,$xb0);
++	vx	($xt2,$xt2,$xc0);
++	vx	($xt3,$xt3,$xd0);
++
++	vstm	($xt0,$xt3,"0($out)");
++
++	#la	$inp,0x40($inp));
++	#la	$out,0x40($out));
++	#lhi	%r0,10);
++	#&{$z?	\&aghi:\&ahi}	$len,-0x40);
++	#jne	.Loop_outer_4x);
++
++LABEL	(".Ldone_4x");
++if (!$z) {
++	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
++	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
++} else {
++	ld	("%f8","$stdframe+8*0($sp)");
++	ld	("%f9","$stdframe+8*1($sp)");
++	ld	("%f10","$stdframe+8*2($sp)");
++	ld	("%f11","$stdframe+8*3($sp)");
++	ld	("%f12","$stdframe+8*4($sp)");
++	ld	("%f13","$stdframe+8*5($sp)");
++	ld	("%f14","$stdframe+8*6($sp)");
++	ld	("%f15","$stdframe+8*7($sp)");
++}
++&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
++	la	($sp,"$FRAME($sp)");
++	br	("%r14");
++
++ALIGN	(16);
++LABEL	(".Ltail_4x");
++if (!$z) {
++	vlr	($xt0,$xb0);	# copy $xb0 (%v4) aside before %f4 is reloaded; presumably the FPRs overlay the vector registers
++	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
++	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
++
++	vst	($xa0,"$stdframe+0x00($sp)");
++	vst	($xt0,"$stdframe+0x10($sp)");
++	vst	($xc0,"$stdframe+0x20($sp)");
++	vst	($xd0,"$stdframe+0x30($sp)");
++} else {
++	vlr	($xt0,$xc0);	# copy $xc0 (%v8) aside before %f8 is reloaded
++	ld	("%f8","$stdframe+8*0($sp)");
++	ld	("%f9","$stdframe+8*1($sp)");
++	ld	("%f10","$stdframe+8*2($sp)");
++	ld	("%f11","$stdframe+8*3($sp)");
++	vlr	($xt1,$xd0);	# copy $xd0 (%v12) aside before %f12 is reloaded
++	ld	("%f12","$stdframe+8*4($sp)");
++	ld	("%f13","$stdframe+8*5($sp)");
++	ld	("%f14","$stdframe+8*6($sp)");
++	ld	("%f15","$stdframe+8*7($sp)");
++
++	vst	($xa0,"$stdframe+0x00($sp)");
++	vst	($xb0,"$stdframe+0x10($sp)");
++	vst	($xt0,"$stdframe+0x20($sp)");
++	vst	($xt1,"$stdframe+0x30($sp)");
++}
++	lghi	("%r1",0);	# byte index for .Loop_tail_4x
++
++LABEL	(".Loop_tail_4x");
++	llgc	("%r5","0(%r1,$inp)");	# input byte
++	llgc	("%r6","$stdframe(%r1,$sp)");	# matching byte of the block stored on the stack above
++	xr	("%r6","%r5");
++	stc	("%r6","0(%r1,$out)");
++	la	("%r1","1(%r1)");
++	brct	($len,".Loop_tail_4x");
++
++&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
++	la	($sp,"$FRAME($sp)");
++	br	("%r14");
++SIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
++}
++
++########################################################################
++# 6x"horizontal" layout is optimal fit for the platform in its current
++# shape, more specifically for given vector instructions' latency. Well,
++# computational part of 8x"vertical" would be faster, but it consumes
++# all registers and dealing with that will diminish the return...
++#
++{
++my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
++    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
++    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
++my @K=map("%v$_",(27,24..26));
++my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
++my $beperm="%v31";
++my $FRAME=$stdframe + 4*16;
++
++GLOBL	("ChaCha20_ctr32_vx");
++ALIGN	(32);
++LABEL	("ChaCha20_ctr32_vx");
++LABEL	(".LChaCha20_ctr32_vx");
++&{$z?	\&clgfi:\&clfi}	($len,256);
++	jle	(".LChaCha20_ctr32_4x");
++&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
++if (!$z) {
++	std	("%f4","16*$SIZE_T+2*8($sp)");
++	std	("%f6","16*$SIZE_T+3*8($sp)");
++}
++&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
++	lgr	("%r0",$sp);
++	la	($sp,"0(%r1,$sp)");
++&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
++if ($z) {
++	std	("%f8","$FRAME-8*8($sp)");
++	std	("%f9","$FRAME-8*7($sp)");
++	std	("%f10","$FRAME-8*6($sp)");
++	std	("%f11","$FRAME-8*5($sp)");
++	std	("%f12","$FRAME-8*4($sp)");
++	std	("%f13","$FRAME-8*3($sp)");
++	std	("%f14","$FRAME-8*2($sp)");
++	std	("%f15","$FRAME-8*1($sp)");
++}
++	larl	("%r7",".Lsigma");
++	lhi	("%r0",10);
++
++	vlm	(@K[1],@K[2],"0($key)");	# load key
++	vl	(@K[3],"0($counter)");		# load counter
++
++	vlm	(@K[0],"$beperm","0(%r7)");	# load sigma, increments, ...
++
++LABEL	(".Loop_outer_vx");
++	vlr	($a0,@K[0]);
++	vlr	($b0,@K[1]);
++	vlr	($a1,@K[0]);
++	vlr	($b1,@K[1]);
++	vlr	($a2,@K[0]);
++	vlr	($b2,@K[1]);
++	vlr	($a3,@K[0]);
++	vlr	($b3,@K[1]);
++	vlr	($a4,@K[0]);
++	vlr	($b4,@K[1]);
++	vlr	($a5,@K[0]);
++	vlr	($b5,@K[1]);
++
++	vlr	($d0,@K[3]);
++	vaf	($d1,@K[3],$t1);		# K[3]+1
++	vaf	($d2,@K[3],$t2);		# K[3]+2
++	vaf	($d3,@K[3],$t3);		# K[3]+3
++	vaf	($d4,$d2,$t2);			# K[3]+4
++	vaf	($d5,$d2,$t3);			# K[3]+5
++
++	vlr	($c0,@K[2]);
++	vlr	($c1,@K[2]);
++	vlr	($c2,@K[2]);
++	vlr	($c3,@K[2]);
++	vlr	($c4,@K[2]);
++	vlr	($c5,@K[2]);
++
++	vlr	($t1,$d1);
++	vlr	($t2,$d2);
++	vlr	($t3,$d3);
++
++ALIGN	(4);
++LABEL	(".Loop_vx");
++
++	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
++		 $b0,$b1,$b2,$b3,$b4,$b5,
++		 $c0,$c1,$c2,$c3,$c4,$c5,
++		 $d0,$d1,$d2,$d3,$d4,$d5,
++		 0);
++
++	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
++		 $b0,$b1,$b2,$b3,$b4,$b5,
++		 $c0,$c1,$c2,$c3,$c4,$c5,
++		 $d0,$d1,$d2,$d3,$d4,$d5,
++		 1);
++
++	brct	("%r0",".Loop_vx");
++
++	vaf	($a0,$a0,@K[0]);
++	vaf	($b0,$b0,@K[1]);
++	vaf	($c0,$c0,@K[2]);
++	vaf	($d0,$d0,@K[3]);
++	vaf	($a1,$a1,@K[0]);
++	vaf	($d1,$d1,$t1);			# +K[3]+1
++
++	vperm	($a0,$a0,$a0,$beperm);
++	vperm	($b0,$b0,$b0,$beperm);
++	vperm	($c0,$c0,$c0,$beperm);
++	vperm	($d0,$d0,$d0,$beperm);
++
++&{$z?	\&clgfi:\&clfi}	($len,0x40);
++	jl	(".Ltail_vx");
++
++	vaf	($d2,$d2,$t2);			# +K[3]+2
++	vaf	($d3,$d3,$t3);			# +K[3]+3
++	vlm	($t0,$t3,"0($inp)");
++
++	vx	($a0,$a0,$t0);
++	vx	($b0,$b0,$t1);
++	vx	($c0,$c0,$t2);
++	vx	($d0,$d0,$t3);
++
++	vlm	(@K[0],$t3,"0(%r7)");		# re-load sigma and increments
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_vx");
++
++	vaf	($b1,$b1,@K[1]);
++	vaf	($c1,$c1,@K[2]);
++
++	vperm	($a0,$a1,$a1,$beperm);
++	vperm	($b0,$b1,$b1,$beperm);
++	vperm	($c0,$c1,$c1,$beperm);
++	vperm	($d0,$d1,$d1,$beperm);
++
++&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	jl	(".Ltail_vx");
++
++	vlm	($a1,$d1,"0($inp)");
++
++	vx	($a0,$a0,$a1);
++	vx	($b0,$b0,$b1);
++	vx	($c0,$c0,$c1);
++	vx	($d0,$d0,$d1);
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_vx");
++
++	vaf	($a2,$a2,@K[0]);
++	vaf	($b2,$b2,@K[1]);
++	vaf	($c2,$c2,@K[2]);
++
++	vperm	($a0,$a2,$a2,$beperm);
++	vperm	($b0,$b2,$b2,$beperm);
++	vperm	($c0,$c2,$c2,$beperm);
++	vperm	($d0,$d2,$d2,$beperm);
++
++&{$z?	\&clgfi:\&clfi}	($len,0x40);
++	jl	(".Ltail_vx");
++
++	vlm	($a1,$d1,"0($inp)");
++
++	vx	($a0,$a0,$a1);
++	vx	($b0,$b0,$b1);
++	vx	($c0,$c0,$c1);
++	vx	($d0,$d0,$d1);
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_vx");
++
++	vaf	($a3,$a3,@K[0]);
++	vaf	($b3,$b3,@K[1]);
++	vaf	($c3,$c3,@K[2]);
++	vaf	($d2,@K[3],$t3);		# K[3]+3
++
++	vperm	($a0,$a3,$a3,$beperm);
++	vperm	($b0,$b3,$b3,$beperm);
++	vperm	($c0,$c3,$c3,$beperm);
++	vperm	($d0,$d3,$d3,$beperm);
++
++&{$z?	\&clgfi:\&clfi}	($len,0x40);
++	jl	(".Ltail_vx");
++
++	vaf	($d3,$d2,$t1);			# K[3]+4
++	vlm	($a1,$d1,"0($inp)");
++
++	vx	($a0,$a0,$a1);
++	vx	($b0,$b0,$b1);
++	vx	($c0,$c0,$c1);
++	vx	($d0,$d0,$d1);
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_vx");
++
++	vaf	($a4,$a4,@K[0]);
++	vaf	($b4,$b4,@K[1]);
++	vaf	($c4,$c4,@K[2]);
++	vaf	($d4,$d4,$d3);			# +K[3]+4
++	vaf	($d3,$d3,$t1);			# K[3]+5
++	vaf	(@K[3],$d2,$t3);		# K[3]+=6
++
++	vperm	($a0,$a4,$a4,$beperm);
++	vperm	($b0,$b4,$b4,$beperm);
++	vperm	($c0,$c4,$c4,$beperm);
++	vperm	($d0,$d4,$d4,$beperm);
++
++&{$z?	\&clgfi:\&clfi}	($len,0x40);
++	jl	(".Ltail_vx");
++
++	vlm	($a1,$d1,"0($inp)");
++
++	vx	($a0,$a0,$a1);
++	vx	($b0,$b0,$b1);
++	vx	($c0,$c0,$c1);
++	vx	($d0,$d0,$d1);
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	je	(".Ldone_vx");
++
++	vaf	($a5,$a5,@K[0]);
++	vaf	($b5,$b5,@K[1]);
++	vaf	($c5,$c5,@K[2]);
++	vaf	($d5,$d5,$d3);			# +K[3]+5
++
++	vperm	($a0,$a5,$a5,$beperm);
++	vperm	($b0,$b5,$b5,$beperm);
++	vperm	($c0,$c5,$c5,$beperm);
++	vperm	($d0,$d5,$d5,$beperm);
++
++&{$z?	\&clgfi:\&clfi} ($len,0x40);
++	jl	(".Ltail_vx");
++
++	vlm	($a1,$d1,"0($inp)");
++
++	vx	($a0,$a0,$a1);
++	vx	($b0,$b0,$b1);
++	vx	($c0,$c0,$c1);
++	vx	($d0,$d0,$d1);
++
++	vstm	($a0,$d0,"0($out)");
++
++	la	($inp,"0x40($inp)");
++	la	($out,"0x40($out)");
++	lhi	("%r0",10);
++&{$z?	\&aghi:\&ahi}	($len,-0x40);
++	jne	(".Loop_outer_vx");
++
++LABEL	(".Ldone_vx");
++if (!$z) {
++	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
++	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
++} else {
++	ld	("%f8","$FRAME-8*8($sp)");
++	ld	("%f9","$FRAME-8*7($sp)");
++	ld	("%f10","$FRAME-8*6($sp)");
++	ld	("%f11","$FRAME-8*5($sp)");
++	ld	("%f12","$FRAME-8*4($sp)");
++	ld	("%f13","$FRAME-8*3($sp)");
++	ld	("%f14","$FRAME-8*2($sp)");
++	ld	("%f15","$FRAME-8*1($sp)");
++}
++&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
++	la	($sp,"$FRAME($sp)");
++	br	("%r14");
++
++ALIGN	(16);
++LABEL	(".Ltail_vx");
++if (!$z) {
++	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
++	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
++} else {
++	ld	("%f8","$FRAME-8*8($sp)");
++	ld	("%f9","$FRAME-8*7($sp)");
++	ld	("%f10","$FRAME-8*6($sp)");
++	ld	("%f11","$FRAME-8*5($sp)");
++	ld	("%f12","$FRAME-8*4($sp)");
++	ld	("%f13","$FRAME-8*3($sp)");
++	ld	("%f14","$FRAME-8*2($sp)");
++	ld	("%f15","$FRAME-8*1($sp)");
++}
++	vstm	($a0,$d0,"$stdframe($sp)");
++	lghi	("%r1",0);
++
++LABEL	(".Loop_tail_vx");
++	llgc	("%r5","0(%r1,$inp)");
++	llgc	("%r6","$stdframe(%r1,$sp)");
++	xr	("%r6","%r5");
++	stc	("%r6","0(%r1,$out)");
++	la	("%r1","1(%r1)");
++	brct	($len,".Loop_tail_vx");
++
++&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
++	la	($sp,"$FRAME($sp)");
++	br	("%r14");
++SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
++}
++################
++
++ALIGN	(32);
++LABEL	(".Lsigma");
++LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma ("expand 32-byte k" as four words)
++LONG	(1,0,0,0);					# NOTE(review): presumably a +1 block-counter step; confirm at the load site
++LONG	(2,0,0,0);					# presumably +2, same caveat
++LONG	(3,0,0,0);					# presumably +3, same caveat
++LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap: byte indices reversed within each 32-bit word
++
++LONG	(0,1,2,3);					# NOTE(review): presumably per-lane counter offsets; confirm at the load site
++LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma: each sigma word replicated four times
++LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
++LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
++LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);
++
++ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
++ALIGN	(4);
++
++PERLASM_END();						# write out everything accumulated so far
+diff -up openssl-1.1.1b/crypto/perlasm/s390x.pm.s390x-update openssl-1.1.1b/crypto/perlasm/s390x.pm
+--- openssl-1.1.1b/crypto/perlasm/s390x.pm.s390x-update	2019-05-06 10:54:00.037367571 +0200
++++ openssl-1.1.1b/crypto/perlasm/s390x.pm	2019-05-06 10:59:30.859784823 +0200
+@@ -0,0 +1,3142 @@
++#!/usr/bin/env perl
++# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the OpenSSL license (the "License").  You may not use
++# this file except in compliance with the License.  You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++# Copyright IBM Corp. 2018-2019
++# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
++
++package perlasm::s390x;
++
++use strict;
++use warnings;
++use bigint;
++use Carp qw(confess);
++use Exporter qw(import);
++
++our @EXPORT=qw(PERLASM_BEGIN PERLASM_END);
++our @EXPORT_OK=qw(AUTOLOAD LABEL INCLUDE stfle);
++our %EXPORT_TAGS=(
++	# long-displacement facility (NOTE: clgfi itself is an extended-immediate instruction; tag name kept for existing importers)
++	LD => [qw(clgfi)],
++	# general-instruction-extension facility
++	GE => [qw(risbg)],
++	# extended-immediate facility
++	EI => [qw(lt)],
++	# miscellaneous-instruction-extensions facility 1
++	MI1 => [qw(risbgn)],
++	# message-security assist
++	MSA => [qw(kmac km kmc kimd klmd)],
++	# message-security-assist extension 4
++	MSA4 => [qw(kmf kmo pcc kmctr)],
++	# message-security-assist extension 5
++	MSA5 => [qw(ppno prno)],
++	# message-security-assist extension 8
++	MSA8 => [qw(kma)],
++	# vector facility
++	VX => [qw(vgef vgeg vgbm vzero vone vgm vgmb vgmh vgmf vgmg
++	    vl vlr vlrep vlrepb vlreph vlrepf vlrepg vleb vleh vlef vleg vleib
++	    vleih vleif vleig vlgv vlgvb vlgvh vlgvf vlgvg vllez vllezb vllezh
++	    vllezf vllezg vlm vlbb vlvg vlvgb vlvgh vlvgf vlvgg vlvgp
++	    vll vmrh vmrhb vmrhh vmrhf vmrhg vmrl vmrlb vmrlh vmrlf vmrlg vpk
++	    vpkh vpkf vpkg vpks vpksh vpksf vpksg vpkshs vpksfs vpksgs vpkls
++	    vpklsh vpklsf vpklsg vpklshs vpklsfs vpklsgs vperm vpdi vrep vrepb
++	    vreph vrepf vrepg vrepi vrepib vrepih vrepif vrepig vscef vsceg
++	    vsel vseg vsegb vsegh vsegf vst vsteb vsteh vstef vsteg vstm vstl
++	    vuph vuphb vuphh vuphf vuplh vuplhb vuplhh vuplhf vupl vuplb vuplhw
++	    vuplf vupll vupllb vupllh vupllf va vab vah vaf vag vaq vacc vaccb
++	    vacch vaccf vaccg vaccq vac vacq vaccc vacccq vn vnc vavg vavgb
++	    vavgh vavgf vavgg vavgl vavglb vavglh vavglf vavglg vcksm vec_ vecb
++	    vech vecf vecg vecl veclb veclh veclf veclg vceq vceqb vceqh vceqf
++	    vceqg vceqbs vceqhs vceqfs vceqgs vch vchb vchh vchf vchg vchbs
++	    vchhs vchfs vchgs vchl vchlb vchlh vchlf vchlg vchlbs vchlhs vchlfs
++	    vchlgs vclz vclzb vclzh vclzf vclzg vctz vctzb vctzh vctzf vctzg
++	    vx vgfm vgfmb vgfmh vgfmf vgfmg vgfma vgfmab vgfmah vgfmaf vgfmag
++	    vlc vlcb vlch vlcf vlcg vlp vlpb vlph vlpf vlpg vmx vmxb vmxh vmxf
++	    vmxg vmxl vmxlb vmxlh vmxlf vmxlg vmn vmnb vmnh vmnf vmng vmnl
++	    vmnlb vmnlh vmnlf vmnlg vmal vmalb vmalhw vmalf vmah vmahb vmahh
++	    vmahf vmalh vmalhb vmalhh vmalhf vmae vmaeb vmaeh vmaef vmale
++	    vmaleb vmaleh vmalef vmao vmaob vmaoh vmaof vmalo vmalob vmaloh
++	    vmalof vmh vmhb vmhh vmhf vmlh vmlhb vmlhh vmlhf vml vmlb vmlhw
++	    vmlf vme vmeb vmeh vmef vmle vmleb vmleh vmlef vmo vmob vmoh vmof
++	    vmlo vmlob vmloh vmlof vno vnot vo vpopct verllv verllvb verllvh
++	    verllvf verllvg verll verllb verllh verllf verllg verim verimb
++	    verimh verimf verimg veslv veslvb veslvh veslvf veslvg vesl veslb
++	    veslh veslf veslg vesrav vesravb vesravh vesravf vesravg vesra
++	    vesrab vesrah vesraf vesrag vesrlv vesrlvb vesrlvh vesrlvf vesrlvg
++	    vesrl vesrlb vesrlh vesrlf vesrlg vsl vslb vsldb vsra vsrab vsrl
++	    vsrlb vs vsb vsh vsf vsg vsq vscbi vscbib vscbih vscbif vscbig
++	    vscbiq vsbi vsbiq vsbcbi vsbcbiq vsumg vsumgh vsumgf vsumq vsumqf
++	    vsumqg vsum vsumb vsumh vtm vfae vfaeb vfaeh vfaef vfaebs vfaehs
++	    vfaefs vfaezb vfaezh vfaezf vfaezbs vfaezhs vfaezfs vfee vfeeb
++	    vfeeh vfeef vfeebs vfeehs vfeefs vfeezb vfeezh vfeezf vfeezbs
++	    vfeezhs vfeezfs vfene vfeneb vfeneh vfenef vfenebs vfenehs vfenefs
++	    vfenezb vfenezh vfenezf vfenezbs vfenezhs vfenezfs vistr vistrb
++	    vistrh vistrf vistrbs vistrhs vistrfs vstrc vstrcb vstrch vstrcf
++	    vstrcbs vstrchs vstrcfs vstrczb vstrczh vstrczf vstrczbs vstrczhs
++	    vstrczfs vfa vfadb wfadb wfc wfcdb wfk wfkdb vfce vfcedb wfcedb
++	    vfcedbs wfcedbs vfch vfchdb wfchdb vfchdbs wfchdbs vfche vfchedb
++	    wfchedb vfchedbs wfchedbs vcdg vcdgb wcdgb vcdlg vcdlgb wcdlgb vcgd
++	    vcgdb wcgdb vclgd vclgdb wclgdb vfd vfddb wfddb vfi vfidb wfidb
++	    vlde vldeb wldeb vled vledb wledb vfm vfmdb wfmdb vfma vfmadb
++	    wfmadb vfms vfmsdb wfmsdb vfpso vfpsodb wfpsodb vflcdb wflcdb
++	    vflndb wflndb vflpdb wflpdb vfsq vfsqdb wfsqdb vfs vfsdb wfsdb
++	    vftci vftcidb wftcidb)],
++	# vector-enhancements facility 1
++	VXE => [qw(vbperm vllezlf vmsl vmslg vnx vnn voc vpopctb vpopcth
++	    vpopctf vpopctg vfasb wfasb wfaxb wfcsb wfcxb wfksb wfkxb vfcesb
++	    vfcesbs wfcesb wfcesbs wfcexb wfcexbs vfchsb vfchsbs wfchsb wfchsbs
++	    wfchxb wfchxbs vfchesb vfchesbs wfchesb wfchesbs wfchexb wfchexbs
++	    vfdsb wfdsb wfdxb vfisb wfisb wfixb vfll vflls wflls wflld vflr
++	    vflrd wflrd wflrx vfmax vfmaxsb vfmaxdb wfmaxsb wfmaxdb wfmaxxb
++	    vfmin vfminsb vfmindb wfminsb wfmindb wfminxb vfmsb wfmsb wfmxb
++	    vfnma vfnms vfmasb wfmasb wfmaxb vfmssb wfmssb wfmsxb vfnmasb
++	    vfnmadb wfnmasb wfnmadb wfnmaxb vfnmssb vfnmsdb wfnmssb wfnmsdb
++	    wfnmsxb vfpsosb wfpsosb vflcsb wflcsb vflnsb wflnsb vflpsb wflpsb
++	    vfpsoxb wfpsoxb vflcxb wflcxb vflnxb wflnxb vflpxb wflpxb vfsqsb
++	    wfsqsb wfsqxb vfssb wfssb wfsxb vftcisb wftcisb wftcixb)],
++	# vector-packed-decimal facility
++	VXD => [qw(vlrlr vlrl vstrlr vstrl vap vcp vcvb vcvbg vcvd vcvdg vdp
++	    vlip vmp vmsp vpkz vpsop vrp vsdp vsrp vsp vtp vupkz)],
++);
++Exporter::export_ok_tags(qw(LD GE EI MI1 MSA MSA4 MSA5 MSA8 VX VXE VXD));
++
++our $AUTOLOAD;
++
++my $GR='(?:%r)?([0-9]|1[0-5])';
++my $VR='(?:%v)?([0-9]|1[0-9]|2[0-9]|3[0-1])';
++
++my ($file,$out);
++
++sub PERLASM_BEGIN
++{							# start a new output unit
++	($file,$out)=(shift,"");			# remember output path (undef if none given), empty the buffer
++}
++sub PERLASM_END
++{							# flush the buffer to $file, or to STDOUT if no file was set
++	if (defined($file)) {
++		open(my $fd,'>',$file)||die("can't open $file: $!");
++		print({$fd}$out)||die("can't write $file: $!");
++		close($fd)||die("can't close $file: $!");	# buffered write errors only surface at close
++	} else {
++		print($out);
++	}
++}
++
++sub AUTOLOAD {
++	# Catch-all emitter: a call to any sub not defined here becomes one line
++	# of assembler text with its arguments joined by commas.
++	my @args=@_;
++	confess(err("PARSE")) if (grep(!defined($_),@args));
++	my $token=$AUTOLOAD=~/^.*::([A-Z_]+)$/ ? ".$1" :	# uppercase: directive
++		  $AUTOLOAD=~/^.*::([a-z]+)$/ ? "\t$1" :	# lowercase: mnemonic
++		  confess(err("PARSE"));			# any other name is rejected
++	$token.="\t" if (@args);
++	$out.=$token.join(',',@args)."\n";
++}
++
++sub LABEL {						# label directive
++	# Appends "<name>:" to the output buffer; takes exactly one argument.
++	confess(err("ARGNUM")) unless (@_==1);
++	$out.=$_[0].":\n";
++}
++
++sub INCLUDE {						# include directive
++	# Appends a quoted #include line for the single file name given.
++	confess(err("ARGNUM")) unless (@_==1);
++	$out.="#include \"".$_[0]."\"\n";
++}
++
++#
++# Mnemonics
++#
++
++sub stfle {						# store facility list extended
++	confess(err("ARGNUM")) unless (@_==1);		# exactly one operand
++	return S(0xb2b0,@_);
++}
++
++# MISC
++
++sub clgfi {						# compare logical immediate
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RILa(0xc2e,@_);
++}
++
++sub lt {						# load and test
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RXYa(0xe312,@_);
++}
++
++sub risbg {						# rotate then insert selected bits
++	confess(err("ARGNUM")) unless (@_==4||@_==5);	# four operands, fifth optional
++	return RIEf(0xec55,@_);
++}
++
++sub risbgn {						# risbg variant, opcode 0xec59
++	confess(err("ARGNUM")) unless (@_==4||@_==5);	# four operands, fifth optional
++	return RIEf(0xec59,@_);
++}
++
++# MSA
++
++sub kmac {						# compute message authentication code
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb91e,@_);
++}
++
++sub km {						# cipher message
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb92e,@_);
++}
++
++sub kmc {						# cipher message with chaining
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb92f,@_);
++}
++
++sub kimd {						# compute intermediate message digest
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb93e,@_);
++}
++
++sub klmd {						# compute last message digest
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb93f,@_);
++}
++
++# MSA4
++
++sub kmf {						# cipher message with cipher feedback
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb92a,@_);
++}
++
++sub kmo {						# cipher message with output feedback
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb92b,@_);
++}
++
++sub pcc {						# perform cryptographic computation
++	confess(err("ARGNUM")) unless (@_==0);		# takes no operands
++	return RRE(0xb92c,@_);
++}
++
++sub kmctr {						# cipher message with counter
++	confess(err("ARGNUM")) unless (@_==3);		# exactly three operands
++	return RRFb(0xb92d,@_);
++}
++
++# MSA5
++
++sub prno {						# current name; forwards unchanged to ppno
++	ppno(@_);
++}
++
++sub ppno {						# deprecated, use prno
++	confess(err("ARGNUM")) unless (@_==2);		# exactly two operands
++	return RRE(0xb93c,@_);
++}
++
++# MSA8
++
++sub kma {						# cipher message with authentication
++	confess(err("ARGNUM")) unless (@_==3);		# exactly three operands
++	return RRFb(0xb929,@_);
++}
++
++# VX - Support Instructions
++
++sub vgef {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRV(0xe713,@_);
++}
++sub vgeg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRV(0xe712,@_);
++}
++
++sub vgbm {						# vector generate byte mask: register, 16-bit mask
++	confess(err("ARGNUM")) if ($#_!=1);
++	VRIa(0xe744,@_);
++}
++sub vzero {						# vgbm with mask 0
++	vgbm(@_,0);
++}
++sub vone {						# vgbm with mask 0xffff
++	vgbm(@_,0xffff);
++}
++
++sub vgm {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRIb(0xe746,@_);
++}
++sub vgmb {
++	vgm(@_,0);
++}
++sub vgmh {
++	vgm(@_,1);
++}
++sub vgmf {
++	vgm(@_,2);
++}
++sub vgmg {
++	vgm(@_,3);
++}
++
++sub vl {
++	confess(err("ARGNUM")) if ($#_<1||$#_>2);
++	VRX(0xe706,@_);
++}
++
++sub vlr {
++	confess(err("ARGNUM")) if ($#_!=1);
++	VRRa(0xe756,@_);
++}
++
++sub vlrep {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe705,@_);
++}
++sub vlrepb {
++	vlrep(@_,0);
++}
++sub vlreph {
++	vlrep(@_,1);
++}
++sub vlrepf {
++	vlrep(@_,2);
++}
++sub vlrepg {
++	vlrep(@_,3);
++}
++
++sub vleb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe700,@_);
++}
++sub vleh {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe701,@_);
++}
++sub vlef {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe703,@_);
++}
++sub vleg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe702,@_);
++}
++
++sub vleib {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIa(0xe740,@_);
++}
++sub vleih {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIa(0xe741,@_);
++}
++sub vleif {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIa(0xe743,@_);
++}
++sub vleig {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIa(0xe742,@_);
++}
++
++sub vlgv {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSc(0xe721,@_);
++}
++sub vlgvb {
++	vlgv(@_,0);
++}
++sub vlgvh {
++	vlgv(@_,1);
++}
++sub vlgvf {
++	vlgv(@_,2);
++}
++sub vlgvg {
++	vlgv(@_,3);
++}
++
++sub vllez {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe704,@_);
++}
++sub vllezb {
++	vllez(@_,0);
++}
++sub vllezh {
++	vllez(@_,1);
++}
++sub vllezf {
++	vllez(@_,2);
++}
++sub vllezg {
++	vllez(@_,3);
++}
++
++sub vlm {						# vector load multiple
++	confess(err("ARGNUM")) unless (@_==3||@_==4);	# three operands, fourth optional
++	return VRSa(0xe736,@_);
++}
++
++sub vlbb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe707,@_);
++}
++
++sub vlvg {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSb(0xe722,@_);
++}
++sub vlvgb {
++	vlvg(@_,0);
++}
++sub vlvgh {
++	vlvg(@_,1);
++}
++sub vlvgf {
++	vlvg(@_,2);
++}
++sub vlvgg {
++	vlvg(@_,3);
++}
++
++sub vlvgp {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRf(0xe762,@_);
++}
++
++sub vll {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRSb(0xe737,@_);
++}
++
++sub vmrh {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe761,@_);
++}
++sub vmrhb {
++	vmrh(@_,0);
++}
++sub vmrhh {
++	vmrh(@_,1);
++}
++sub vmrhf {
++	vmrh(@_,2);
++}
++sub vmrhg {
++	vmrh(@_,3);
++}
++
++sub vmrl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe760,@_);
++}
++sub vmrlb {
++	vmrl(@_,0);
++}
++sub vmrlh {
++	vmrl(@_,1);
++}
++sub vmrlf {
++	vmrl(@_,2);
++}
++sub vmrlg {
++	vmrl(@_,3);
++}
++
++sub vpk {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe794,@_);
++}
++sub vpkh {
++	vpk(@_,1);
++}
++sub vpkf {
++	vpk(@_,2);
++}
++sub vpkg {
++	vpk(@_,3);
++}
++
++sub vpks {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRb(0xe797,@_);
++}
++sub vpksh {
++	vpks(@_,1,0);
++}
++sub vpksf {
++	vpks(@_,2,0);
++}
++sub vpksg {
++	vpks(@_,3,0);
++}
++sub vpkshs {
++	vpks(@_,1,1);
++}
++sub vpksfs {
++	vpks(@_,2,1);
++}
++sub vpksgs {
++	vpks(@_,3,1);
++}
++
++sub vpkls {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRb(0xe795,@_);
++}
++sub vpklsh {
++	vpkls(@_,1,0);
++}
++sub vpklsf {
++	vpkls(@_,2,0);
++}
++sub vpklsg {
++	vpkls(@_,3,0);
++}
++sub vpklshs {
++	vpkls(@_,1,1);
++}
++sub vpklsfs {
++	vpkls(@_,2,1);
++}
++sub vpklsgs {
++	vpkls(@_,3,1);
++}
++
++sub vperm {						# vector permute
++	confess(err("ARGNUM")) unless (@_==4);		# exactly four operands
++	return VRRe(0xe78c,@_);
++}
++
++sub vpdi {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe784,@_);
++}
++
++sub vrep {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRIc(0xe74d,@_);
++}
++sub vrepb {
++	vrep(@_,0);
++}
++sub vreph {
++	vrep(@_,1);
++}
++sub vrepf {
++	vrep(@_,2);
++}
++sub vrepg {
++	vrep(@_,3);
++}
++
++sub vrepi {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIa(0xe745,@_);
++}
++sub vrepib {
++	vrepi(@_,0);
++}
++sub vrepih {
++	vrepi(@_,1);
++}
++sub vrepif {
++	vrepi(@_,2);
++}
++sub vrepig {
++	vrepi(@_,3);
++}
++
++sub vscef {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRV(0xe71b,@_);
++}
++sub vsceg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRV(0xe71a,@_);
++}
++
++sub vsel {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRe(0xe78d,@_);
++}
++
++sub vseg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe75f,@_);
++}
++sub vsegb {
++	vseg(@_,0);
++}
++sub vsegh {
++	vseg(@_,1);
++}
++sub vsegf {
++	vseg(@_,2);
++}
++
++sub vst {
++	confess(err("ARGNUM")) if ($#_<1||$#_>2);
++	VRX(0xe70e,@_);
++}
++
++sub vsteb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe708,@_);
++}
++sub vsteh {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe709,@_);
++}
++sub vstef {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe70b,@_);
++}
++sub vsteg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRX(0xe70a,@_);
++}
++
++sub vstm {						# vector store multiple
++	confess(err("ARGNUM")) unless (@_==3||@_==4);	# three operands, fourth optional
++	return VRSa(0xe73e,@_);
++}
++
++sub vstl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRSb(0xe73f,@_);
++}
++
++sub vuph {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7d7,@_);
++}
++sub vuphb {
++	vuph(@_,0);
++}
++sub vuphh {
++	vuph(@_,1);
++}
++sub vuphf {
++	vuph(@_,2);
++}
++
++sub vuplh {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7d5,@_);
++}
++sub vuplhb {
++	vuplh(@_,0);
++}
++sub vuplhh {
++	vuplh(@_,1);
++}
++sub vuplhf {
++	vuplh(@_,2);
++}
++
++sub vupl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7d6,@_);
++}
++sub vuplb {
++	vupl(@_,0);
++}
++sub vuplhw {
++	vupl(@_,1);
++}
++sub vuplf {
++	vupl(@_,2);
++}
++
++sub vupll {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7d4,@_);
++}
++sub vupllb {
++	vupll(@_,0);
++}
++sub vupllh {
++	vupll(@_,1);
++}
++sub vupllf {
++	vupll(@_,2);
++}
++
++# VX - Integer Instructions
++
++sub va {						# vector add: three registers plus a size code
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f3,@_);
++}
++sub vab {						# va with size code 0 appended
++	va(@_,0);
++}
++sub vah {						# va with size code 1 appended
++	va(@_,1);
++}
++sub vaf {						# va with size code 2 appended
++	va(@_,2);
++}
++sub vag {						# va with size code 3 appended
++	va(@_,3);
++}
++sub vaq {						# va with size code 4 appended
++	va(@_,4);
++}
++
++sub vacc {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f1,@_);
++}
++sub vaccb {
++	vacc(@_,0);
++}
++sub vacch {
++	vacc(@_,1);
++}
++sub vaccf {
++	vacc(@_,2);
++}
++sub vaccg {
++	vacc(@_,3);
++}
++sub vaccq {
++	vacc(@_,4);
++}
++
++sub vac {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7bb,@_);
++}
++sub vacq {
++	vac(@_,4);
++}
++
++sub vaccc {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7b9,@_);
++}
++sub vacccq {
++	vaccc(@_,4);
++}
++
++sub vn {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe768,@_);
++}
++
++sub vnc {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe769,@_);
++}
++
++sub vavg {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f2,@_);
++}
++sub vavgb {
++	vavg(@_,0);
++}
++sub vavgh {
++	vavg(@_,1);
++}
++sub vavgf {
++	vavg(@_,2);
++}
++sub vavgg {
++	vavg(@_,3);
++}
++
++sub vavgl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f0,@_);
++}
++sub vavglb {
++	vavgl(@_,0);
++}
++sub vavglh {
++	vavgl(@_,1);
++}
++sub vavglf {
++	vavgl(@_,2);
++}
++sub vavglg {
++	vavgl(@_,3);
++}
++
++sub vcksm {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe766,@_);
++}
++
++sub vec_ {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7db,@_);
++}
++sub vecb {
++	vec_(@_,0);
++}
++sub vech {
++	vec_(@_,1);
++}
++sub vecf {
++	vec_(@_,2);
++}
++sub vecg {
++	vec_(@_,3);
++}
++
++sub vecl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7d9,@_);
++}
++sub veclb {
++	vecl(@_,0);
++}
++sub veclh {
++	vecl(@_,1);
++}
++sub veclf {
++	vecl(@_,2);
++}
++sub veclg {
++	vecl(@_,3);
++}
++
++sub vceq {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRb(0xe7f8,@_);
++}
++sub vceqb {
++	vceq(@_,0,0);
++}
++sub vceqh {
++	vceq(@_,1,0);
++}
++sub vceqf {
++	vceq(@_,2,0);
++}
++sub vceqg {
++	vceq(@_,3,0);
++}
++sub vceqbs {
++	vceq(@_,0,1);
++}
++sub vceqhs {
++	vceq(@_,1,1);
++}
++sub vceqfs {
++	vceq(@_,2,1);
++}
++sub vceqgs {
++	vceq(@_,3,1);
++}
++
++sub vch {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRb(0xe7fb,@_);
++}
++sub vchb {
++	vch(@_,0,0);
++}
++sub vchh {
++	vch(@_,1,0);
++}
++sub vchf {
++	vch(@_,2,0);
++}
++sub vchg {
++	vch(@_,3,0);
++}
++sub vchbs {
++	vch(@_,0,1);
++}
++sub vchhs {
++	vch(@_,1,1);
++}
++sub vchfs {
++	vch(@_,2,1);
++}
++sub vchgs {
++	vch(@_,3,1);
++}
++
++sub vchl {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRb(0xe7f9,@_);
++}
++sub vchlb {
++	vchl(@_,0,0);
++}
++sub vchlh {
++	vchl(@_,1,0);
++}
++sub vchlf {
++	vchl(@_,2,0);
++}
++sub vchlg {
++	vchl(@_,3,0);
++}
++sub vchlbs {
++	vchl(@_,0,1);
++}
++sub vchlhs {
++	vchl(@_,1,1);
++}
++sub vchlfs {
++	vchl(@_,2,1);
++}
++sub vchlgs {
++	vchl(@_,3,1);
++}
++
++sub vclz {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe753,@_);
++}
++sub vclzb {
++	vclz(@_,0);
++}
++sub vclzh {
++	vclz(@_,1);
++}
++sub vclzf {
++	vclz(@_,2);
++}
++sub vclzg {
++	vclz(@_,3);
++}
++
++sub vctz {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe752,@_);
++}
++sub vctzb {
++	vctz(@_,0);
++}
++sub vctzh {
++	vctz(@_,1);
++}
++sub vctzf {
++	vctz(@_,2);
++}
++sub vctzg {
++	vctz(@_,3);
++}
++
++sub vx {						# vector exclusive or
++	confess(err("ARGNUM")) unless (@_==3);		# exactly three operands
++	return VRRc(0xe76d,@_);
++}
++
++sub vgfm {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7b4,@_);
++}
++sub vgfmb {
++	vgfm(@_,0);
++}
++sub vgfmh {
++	vgfm(@_,1);
++}
++sub vgfmf {
++	vgfm(@_,2);
++}
++sub vgfmg {
++	vgfm(@_,3);
++}
++
++sub vgfma {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7bc,@_);
++}
++sub vgfmab {
++	vgfma(@_,0);
++}
++sub vgfmah {
++	vgfma(@_,1);
++}
++sub vgfmaf {
++	vgfma(@_,2);
++}
++sub vgfmag {
++	vgfma(@_,3);
++}
++
++sub vlc {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7de,@_);
++}
++sub vlcb {
++	vlc(@_,0);
++}
++sub vlch {
++	vlc(@_,1);
++}
++sub vlcf {
++	vlc(@_,2);
++}
++sub vlcg {
++	vlc(@_,3);
++}
++
++sub vlp {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe7df,@_);
++}
++sub vlpb {
++	vlp(@_,0);
++}
++sub vlph {
++	vlp(@_,1);
++}
++sub vlpf {
++	vlp(@_,2);
++}
++sub vlpg {
++	vlp(@_,3);
++}
++
++sub vmx {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7ff,@_);
++}
++sub vmxb {
++	vmx(@_,0);
++}
++sub vmxh {
++	vmx(@_,1);
++}
++sub vmxf {
++	vmx(@_,2);
++}
++sub vmxg {
++	vmx(@_,3);
++}
++
++sub vmxl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7fd,@_);
++}
++sub vmxlb {
++	vmxl(@_,0);
++}
++sub vmxlh {
++	vmxl(@_,1);
++}
++sub vmxlf {
++	vmxl(@_,2);
++}
++sub vmxlg {
++	vmxl(@_,3);
++}
++
++sub vmn {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7fe,@_);
++}
++sub vmnb {
++	vmn(@_,0);
++}
++sub vmnh {
++	vmn(@_,1);
++}
++sub vmnf {
++	vmn(@_,2);
++}
++sub vmng {
++	vmn(@_,3);
++}
++
++sub vmnl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7fc,@_);
++}
++sub vmnlb {
++	vmnl(@_,0);
++}
++sub vmnlh {
++	vmnl(@_,1);
++}
++sub vmnlf {
++	vmnl(@_,2);
++}
++sub vmnlg {
++	vmnl(@_,3);
++}
++
++sub vmal {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7aa,@_);
++}
++sub vmalb {
++	vmal(@_,0);
++}
++sub vmalhw {
++	vmal(@_,1);
++}
++sub vmalf {
++	vmal(@_,2);
++}
++
++sub vmah {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7ab,@_);
++}
++sub vmahb {
++	vmah(@_,0);
++}
++sub vmahh {
++	vmah(@_,1);
++}
++sub vmahf {
++	vmah(@_,2);
++}
++
++sub vmalh {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7a9,@_);
++}
++sub vmalhb {
++	vmalh(@_,0);
++}
++sub vmalhh {
++	vmalh(@_,1);
++}
++sub vmalhf {
++	vmalh(@_,2);
++}
++
++sub vmae {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7ae,@_);
++}
++sub vmaeb {
++	vmae(@_,0);
++}
++sub vmaeh {
++	vmae(@_,1);
++}
++sub vmaef {
++	vmae(@_,2);
++}
++
++sub vmale {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7ac,@_);
++}
++sub vmaleb {
++	vmale(@_,0);
++}
++sub vmaleh {
++	vmale(@_,1);
++}
++sub vmalef {
++	vmale(@_,2);
++}
++
++sub vmao {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7af,@_);
++}
++sub vmaob {
++	vmao(@_,0);
++}
++sub vmaoh {
++	vmao(@_,1);
++}
++sub vmaof {
++	vmao(@_,2);
++}
++
++sub vmalo {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7ad,@_);
++}
++sub vmalob {
++	vmalo(@_,0);
++}
++sub vmaloh {
++	vmalo(@_,1);
++}
++sub vmalof {
++	vmalo(@_,2);
++}
++
++sub vmh {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a3,@_);
++}
++sub vmhb {
++	vmh(@_,0);
++}
++sub vmhh {
++	vmh(@_,1);
++}
++sub vmhf {
++	vmh(@_,2);
++}
++
++sub vmlh {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a1,@_);
++}
++sub vmlhb {
++	vmlh(@_,0);
++}
++sub vmlhh {
++	vmlh(@_,1);
++}
++sub vmlhf {
++	vmlh(@_,2);
++}
++
++sub vml {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a2,@_);
++}
++sub vmlb {
++	vml(@_,0);
++}
++sub vmlhw {
++	vml(@_,1);
++}
++sub vmlf {
++	vml(@_,2);
++}
++
++sub vme {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a6,@_);
++}
++sub vmeb {
++	vme(@_,0);
++}
++sub vmeh {
++	vme(@_,1);
++}
++sub vmef {
++	vme(@_,2);
++}
++
++sub vmle {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a4,@_);
++}
++sub vmleb {
++	vmle(@_,0);
++}
++sub vmleh {
++	vmle(@_,1);
++}
++sub vmlef {
++	vmle(@_,2);
++}
++
++sub vmo {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a7,@_);
++}
++sub vmob {
++	vmo(@_,0);
++}
++sub vmoh {
++	vmo(@_,1);
++}
++sub vmof {
++	vmo(@_,2);
++}
++
++sub vmlo {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7a5,@_);
++}
++sub vmlob {
++	vmlo(@_,0);
++}
++sub vmloh {
++	vmlo(@_,1);
++}
++sub vmlof {
++	vmlo(@_,2);
++}
++
++sub vno {						# vector nor: exactly three operands
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe76b,@_);
++}
++sub vnot {						# vno with the second operand repeated as the third
++	vno(@_,$_[1]);
++}
++
++sub vo {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe76a,@_);
++}
++
++sub vpopct {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRa(0xe750,@_);
++}
++
++sub verllv {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe773,@_);
++}
++sub verllvb {
++	verllv(@_,0);
++}
++sub verllvh {
++	verllv(@_,1);
++}
++sub verllvf {
++	verllv(@_,2);
++}
++sub verllvg {
++	verllv(@_,3);
++}
++
++sub verll {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSa(0xe733,@_);
++}
++sub verllb {
++	verll(@_,0);
++}
++sub verllh {
++	verll(@_,1);
++}
++sub verllf {
++	verll(@_,2);
++}
++sub verllg {
++	verll(@_,3);
++}
++
++sub verim {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRId(0xe772,@_);
++}
++sub verimb {
++	verim(@_,0);
++}
++sub verimh {
++	verim(@_,1);
++}
++sub verimf {
++	verim(@_,2);
++}
++sub verimg {
++	verim(@_,3);
++}
++
++sub veslv {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe770,@_);
++}
++sub veslvb {
++	veslv(@_,0);
++}
++sub veslvh {
++	veslv(@_,1);
++}
++sub veslvf {
++	veslv(@_,2);
++}
++sub veslvg {
++	veslv(@_,3);
++}
++
++sub vesl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSa(0xe730,@_);
++}
++sub veslb {
++	vesl(@_,0);
++}
++sub veslh {
++	vesl(@_,1);
++}
++sub veslf {
++	vesl(@_,2);
++}
++sub veslg {
++	vesl(@_,3);
++}
++
++sub vesrav {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe77a,@_);
++}
++sub vesravb {
++	vesrav(@_,0);
++}
++sub vesravh {
++	vesrav(@_,1);
++}
++sub vesravf {
++	vesrav(@_,2);
++}
++sub vesravg {
++	vesrav(@_,3);
++}
++
++sub vesra {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSa(0xe73a,@_);
++}
++sub vesrab {
++	vesra(@_,0);
++}
++sub vesrah {
++	vesra(@_,1);
++}
++sub vesraf {
++	vesra(@_,2);
++}
++sub vesrag {
++	vesra(@_,3);
++}
++
++sub vesrlv {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe778,@_);
++}
++sub vesrlvb {
++	vesrlv(@_,0);
++}
++sub vesrlvh {
++	vesrlv(@_,1);
++}
++sub vesrlvf {
++	vesrlv(@_,2);
++}
++sub vesrlvg {
++	vesrlv(@_,3);
++}
++
++sub vesrl {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRSa(0xe738,@_);
++}
++sub vesrlb {
++	vesrl(@_,0);
++}
++sub vesrlh {
++	vesrl(@_,1);
++}
++sub vesrlf {
++	vesrl(@_,2);
++}
++sub vesrlg {
++	vesrl(@_,3);
++}
++
++sub vsl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe774,@_);
++}
++
++sub vslb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe775,@_);
++}
++
++sub vsldb {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRId(0xe777,@_);
++}
++
++sub vsra {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe77e,@_);
++}
++
++sub vsrab {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe77f,@_);
++}
++
++sub vsrl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe77c,@_);
++}
++
++sub vsrlb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe77d,@_);
++}
++
++sub vs {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f7,@_);
++}
++sub vsb {
++	vs(@_,0);
++}
++sub vsh {
++	vs(@_,1);
++}
++sub vsf {
++	vs(@_,2);
++}
++sub vsg {
++	vs(@_,3);
++}
++sub vsq {
++	vs(@_,4);
++}
++
++sub vscbi {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe7f5,@_);
++}
++sub vscbib {
++	vscbi(@_,0);
++}
++sub vscbih {
++	vscbi(@_,1);
++}
++sub vscbif {
++	vscbi(@_,2);
++}
++sub vscbig {
++	vscbi(@_,3);
++}
++sub vscbiq {
++	vscbi(@_,4);
++}
++
++sub vsbi {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7bf,@_);
++}
++sub vsbiq {
++	vsbi(@_,4);
++}
++
++sub vsbcbi {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRd(0xe7bd,@_);
++}
++sub vsbcbiq {
++	vsbcbi(@_,4);
++}
++
++sub vsumg {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe765,@_);
++}
++sub vsumgh {
++	vsumg(@_,1);
++}
++sub vsumgf {
++	vsumg(@_,2);
++}
++
++sub vsumq {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe767,@_);
++}
++sub vsumqf {
++	vsumq(@_,2);
++}
++sub vsumqg {
++	vsumq(@_,3);
++}
++
++sub vsum {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRc(0xe764,@_);
++}
++sub vsumb {
++	vsum(@_,0);
++}
++sub vsumh {
++	vsum(@_,1);
++}
++
++sub vtm {
++	confess(err("ARGNUM")) if ($#_!=1);
++	VRRa(0xe7d8,@_);
++}
++
++# VX - String Instructions
++
++sub vfae {
++	confess(err("ARGNUM")) if ($#_<3||$#_>4);
++	VRRb(0xe782,@_);
++}
++sub vfaeb {
++	vfae(@_[0..2],0,$_[3]);
++}
++sub vfaeh {
++	vfae(@_[0..2],1,$_[3]);
++}
++sub vfaef {
++	vfae(@_[0..2],2,$_[3]);
++}
++sub vfaebs {						# size code 0, flag 0x1 OR-ed into the optional 4th argument
++	my $m5=defined($_[3])?$_[3]:0;			# default the flags locally; never write through @_
++	vfae(@_[0..2],0,0x1|$m5);
++}
++sub vfaehs {						# size code 1, flag 0x1
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],1,0x1|$m5);
++}
++sub vfaefs {						# size code 2, flag 0x1
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],2,0x1|$m5);
++}
++sub vfaezb {						# size code 0, flag 0x2
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],0,0x2|$m5);
++}
++sub vfaezh {						# size code 1, flag 0x2
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],1,0x2|$m5);
++}
++sub vfaezf {						# size code 2, flag 0x2
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],2,0x2|$m5);
++}
++sub vfaezbs {						# size code 0, flags 0x3
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],0,0x3|$m5);
++}
++sub vfaezhs {						# size code 1, flags 0x3
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],1,0x3|$m5);
++}
++sub vfaezfs {						# size code 2, flags 0x3
++	my $m5=defined($_[3])?$_[3]:0;
++	vfae(@_[0..2],2,0x3|$m5);
++}
++
++sub vfee {
++	confess(err("ARGNUM")) if ($#_<3||$#_>4);
++	VRRb(0xe780,@_);
++}
++sub vfeeb {
++	vfee(@_[0..2],0,$_[3]);
++}
++sub vfeeh {
++	vfee(@_[0..2],1,$_[3]);
++}
++sub vfeef {
++	vfee(@_[0..2],2,$_[3]);
++}
++sub vfeebs {
++	vfee(@_,0,1);
++}
++sub vfeehs {
++	vfee(@_,1,1);
++}
++sub vfeefs {
++	vfee(@_,2,1);
++}
++sub vfeezb {
++	vfee(@_,0,2);
++}
++sub vfeezh {
++	vfee(@_,1,2);
++}
++sub vfeezf {
++	vfee(@_,2,2);
++}
++sub vfeezbs {
++	vfee(@_,0,3);
++}
++sub vfeezhs {
++	vfee(@_,1,3);
++}
++sub vfeezfs {
++	vfee(@_,2,3);
++}
++
++sub vfene {
++	confess(err("ARGNUM")) if ($#_<3||$#_>4);
++	VRRb(0xe781,@_);
++}
++sub vfeneb {
++	vfene(@_[0..2],0,$_[3]);
++}
++sub vfeneh {
++	vfene(@_[0..2],1,$_[3]);
++}
++sub vfenef {
++	vfene(@_[0..2],2,$_[3]);
++}
++sub vfenebs {
++	vfene(@_,0,1);
++}
++sub vfenehs {
++	vfene(@_,1,1);
++}
++sub vfenefs {
++	vfene(@_,2,1);
++}
++sub vfenezb {
++	vfene(@_,0,2);
++}
++sub vfenezh {
++	vfene(@_,1,2);
++}
++sub vfenezf {
++	vfene(@_,2,2);
++}
++sub vfenezbs {
++	vfene(@_,0,3);
++}
++sub vfenezhs {
++	vfene(@_,1,3);
++}
++sub vfenezfs {
++	vfene(@_,2,3);
++}
++
++sub vistr {
++	confess(err("ARGNUM")) if ($#_<2||$#_>3);
++	VRRa(0xe75c,@_[0..2],0,$_[3]);
++}
++sub vistrb {
++	vistr(@_[0..1],0,$_[2]);
++}
++sub vistrh {
++	vistr(@_[0..1],1,$_[2]);
++}
++sub vistrf {
++	vistr(@_[0..1],2,$_[2]);
++}
++sub vistrbs {
++	vistr(@_,0,1);
++}
++sub vistrhs {
++	vistr(@_,1,1);
++}
++sub vistrfs {
++	vistr(@_,2,1);
++}
++
++sub vstrc {
++	confess(err("ARGNUM")) if ($#_<4||$#_>5);
++	VRRd(0xe78a,@_);
++}
++sub vstrcb {
++	vstrc(@_[0..3],0,$_[4]);
++}
++sub vstrch {
++	vstrc(@_[0..3],1,$_[4]);
++}
++sub vstrcf {
++	vstrc(@_[0..3],2,$_[4]);
++}
++sub vstrcbs {						# size code 0, flag 0x1 OR-ed into the optional 5th argument
++	my $m6=defined($_[4])?$_[4]:0;			# default the flags locally; never write through @_
++	vstrc(@_[0..3],0,0x1|$m6);
++}
++sub vstrchs {						# size code 1, flag 0x1
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],1,0x1|$m6);
++}
++sub vstrcfs {						# size code 2, flag 0x1
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],2,0x1|$m6);
++}
++sub vstrczb {						# size code 0, flag 0x2
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],0,0x2|$m6);
++}
++sub vstrczh {						# size code 1, flag 0x2
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],1,0x2|$m6);
++}
++sub vstrczf {						# size code 2, flag 0x2
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],2,0x2|$m6);
++}
++sub vstrczbs {						# size code 0, flags 0x3
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],0,0x3|$m6);
++}
++sub vstrczhs {						# size code 1, flags 0x3
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],1,0x3|$m6);
++}
++sub vstrczfs {						# size code 2, flags 0x3
++	my $m6=defined($_[4])?$_[4]:0;
++	vstrc(@_[0..3],2,0x3|$m6);
++}
++
++# VX - Floating-point Instructions
++
++sub vfa {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRc(0xe7e3,@_);
++}
++sub vfadb {
++	vfa(@_,3,0);
++}
++sub wfadb {
++	vfa(@_,3,8);
++}
++
++sub wfc {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRa(0xe7cb,@_);
++}
++sub wfcdb {
++	wfc(@_,3,0);
++}
++
++sub wfk {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRa(0xe7ca,@_);
++}
++sub wfksb {
++	wfk(@_,2,0);
++}
++sub wfkdb {
++	wfk(@_,3,0);
++}
++sub wfkxb {
++	wfk(@_,4,0);
++}
++
++sub vfce {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRc(0xe7e8,@_);
++}
++sub vfcedb {
++	vfce(@_,3,0,0);
++}
++sub vfcedbs {
++	vfce(@_,3,0,1);
++}
++sub wfcedb {
++	vfce(@_,3,8,0);
++}
++sub wfcedbs {
++	vfce(@_,3,8,1);
++}
++
++sub vfch {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRc(0xe7eb,@_);
++}
++sub vfchdb {
++	vfch(@_,3,0,0);
++}
++sub vfchdbs {
++	vfch(@_,3,0,1);
++}
++sub wfchdb {
++	vfch(@_,3,8,0);
++}
++sub wfchdbs {
++	vfch(@_,3,8,1);
++}
++
++sub vfche {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRc(0xe7ea,@_);
++}
++sub vfchedb {
++	vfche(@_,3,0,0);
++}
++sub vfchedbs {
++	vfche(@_,3,0,1);
++}
++sub wfchedb {
++	vfche(@_,3,8,0);
++}
++sub wfchedbs {
++	vfche(@_,3,8,1);
++}
++
++sub vcdg {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c3,@_);
++}
++sub vcdgb {
++	vcdg(@_[0..1],3,@_[2..3]);
++}
++sub wcdgb {
++	vcdg(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vcdlg {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c1,@_);
++}
++sub vcdlgb {
++	vcdlg(@_[0..1],3,@_[2..3]);
++}
++sub wcdlgb {
++	vcdlg(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vcgd {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c2,@_);
++}
++sub vcgdb {
++	vcgd(@_[0..1],3,@_[2..3]);
++}
++sub wcgdb {
++	vcgd(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vclgd {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c0,@_);
++}
++sub vclgdb {
++	vclgd(@_[0..1],3,@_[2..3]);
++}
++sub wclgdb {
++	vclgd(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vfd {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRc(0xe7e5,@_);
++}
++sub vfddb {
++	vfd(@_,3,0);
++}
++sub wfddb {
++	vfd(@_,3,8);
++}
++
++sub vfi {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c7,@_);
++}
++sub vfidb {
++	vfi(@_[0..1],3,@_[2..3]);
++}
++sub wfidb {
++	vfi(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vlde {	# deprecated, use vfll
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRa(0xe7c4,@_);
++}
++sub vldeb {	# deprecated, use vflls
++	vlde(@_,2,0);
++}
++sub wldeb {	# deprecated, use wflls
++	vlde(@_,2,8);
++}
++
++sub vled {	# deprecated, use vflr
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7c5,@_);
++}
++sub vledb {	# deprecated, use vflrd
++	vled(@_[0..1],3,@_[2..3]);
++}
++sub wledb {	# deprecated, use wflrd
++	vled(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++
++sub vfm {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRc(0xe7e7,@_);
++}
++sub vfmdb {
++	vfm(@_,3,0);
++}
++sub wfmdb {
++	vfm(@_,3,8);
++}
++
++sub vfma {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRe(0xe78f,@_);
++}
++sub vfmadb {
++	vfma(@_,0,3);
++}
++sub wfmadb {
++	vfma(@_,8,3);
++}
++
++sub vfms {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRe(0xe78e,@_);
++}
++sub vfmsdb {
++	vfms(@_,0,3);
++}
++sub wfmsdb {
++	vfms(@_,8,3);
++}
++
++sub vfpso {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRa(0xe7cc,@_);
++}
++sub vfpsodb {
++	vfpso(@_[0..1],3,0,$_[2]);
++}
++sub wfpsodb {
++	vfpso(@_[0..1],3,8,$_[2]);
++}
++sub vflcdb {
++	vfpso(@_,3,0,0);
++}
++sub wflcdb {
++	vfpso(@_,3,8,0);
++}
++sub vflndb {
++	vfpso(@_,3,0,1);
++}
++sub wflndb {
++	vfpso(@_,3,8,1);
++}
++sub vflpdb {
++	vfpso(@_,3,0,2);
++}
++sub wflpdb {
++	vfpso(@_,3,8,2);
++}
++
++sub vfsq {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRRa(0xe7ce,@_);
++}
++sub vfsqdb {
++	vfsq(@_,3,0);
++}
++sub wfsqdb {
++	vfsq(@_,3,8);
++}
++
++sub vfs {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRRc(0xe7e2,@_);
++}
++sub vfsdb {
++	vfs(@_,3,0);
++}
++sub wfsdb {
++	vfs(@_,3,8);
++}
++
++sub vftci {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIe(0xe74a,@_);
++}
++sub vftcidb {
++	vftci(@_,3,0);
++}
++sub wftcidb {
++	vftci(@_,3,8);
++}
++
++# VXE - Support Instructions
++
++sub vbperm {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe785,@_);
++}
++
++sub vllezlf {
++	vllez(@_,6);
++}
++
++# VXE - Integer Instructions
++
++sub vmsl {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRd(0xe7b8,@_);
++}
++sub vmslg {
++	vmsl(@_[0..3],3,$_[4]);
++}
++
++sub vnx {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe76c,@_);
++}
++
++sub vnn {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe76e,@_);
++}
++
++sub voc {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRc(0xe76f,@_);
++}
++
++sub vpopctb {
++	vpopct(@_,0);
++}
++sub vpopcth {
++	vpopct(@_,1);
++}
++sub vpopctf {
++	vpopct(@_,2);
++}
++sub vpopctg {
++	vpopct(@_,3);
++}
++
++# VXE - Floating-Point Instructions
++
++sub vfasb {
++	vfa(@_,2,0);
++}
++sub wfasb {
++	vfa(@_,2,8);
++}
++sub wfaxb {
++	vfa(@_,4,8);
++}
++
++sub wfcsb {
++	wfc(@_,2,0);
++}
++sub wfcxb {
++	wfc(@_,4,0);
++}
++
++sub vfcesb {
++	vfce(@_,2,0,0);
++}
++sub vfcesbs {
++	vfce(@_,2,0,1);
++}
++sub wfcesb {
++	vfce(@_,2,8,0);
++}
++sub wfcesbs {
++	vfce(@_,2,8,1);
++}
++sub wfcexb {
++	vfce(@_,4,8,0);
++}
++sub wfcexbs {
++	vfce(@_,4,8,1);
++}
++
++sub vfchsb {
++	vfch(@_,2,0,0);
++}
++sub vfchsbs {
++	vfch(@_,2,0,1);
++}
++sub wfchsb {
++	vfch(@_,2,8,0);
++}
++sub wfchsbs {
++	vfch(@_,2,8,1);
++}
++sub wfchxb {
++	vfch(@_,4,8,0);
++}
++sub wfchxbs {
++	vfch(@_,4,8,1);
++}
++
++sub vfchesb {
++	vfche(@_,2,0,0);
++}
++sub vfchesbs {
++	vfche(@_,2,0,1);
++}
++sub wfchesb {
++	vfche(@_,2,8,0);
++}
++sub wfchesbs {
++	vfche(@_,2,8,1);
++}
++sub wfchexb {
++	vfche(@_,4,8,0);
++}
++sub wfchexbs {
++	vfche(@_,4,8,1);
++}
++
++sub vfdsb {
++	vfd(@_,2,0);
++}
++sub wfdsb {
++	vfd(@_,2,8);
++}
++sub wfdxb {
++	vfd(@_,4,8);
++}
++
++sub vfisb {
++	vfi(@_[0..1],2,@_[2..3]);
++}
++sub wfisb {
++	vfi(@_[0..1],2,0x8|$_[2],$_[3]);
++}
++sub wfixb {
++	vfi(@_[0..1],4,0x8|$_[2],$_[3]);
++}
++
++sub vfll {
++	vlde(@_);
++}
++sub vflls {
++	vfll(@_,2,0);
++}
++sub wflls {
++	vfll(@_,2,8);
++}
++sub wflld {
++	vfll(@_,3,8);
++}
++
++sub vflr {
++	vled(@_);
++}
++sub vflrd {
++	vflr(@_[0..1],3,@_[2..3]);
++}
++sub wflrd {
++	vflr(@_[0..1],3,0x8|$_[2],$_[3]);
++}
++sub wflrx {
++	vflr(@_[0..1],4,0x8|$_[2],$_[3]);
++}
++
++sub vfmax {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRc(0xe7ef,@_);
++}
++sub vfmaxsb {
++	vfmax(@_[0..2],2,0,$_[3]);
++}
++sub vfmaxdb {
++	vfmax(@_[0..2],3,0,$_[3]);
++}
++sub wfmaxsb {
++	vfmax(@_[0..2],2,8,$_[3]);
++}
++sub wfmaxdb {
++	vfmax(@_[0..2],3,8,$_[3]);
++}
++sub wfmaxxb {
++	vfmax(@_[0..2],4,8,$_[3]);
++}
++
++sub vfmin {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRc(0xe7ee,@_);
++}
++sub vfminsb {	# vfmin with m4=2, m5=0; caller args: v1,v2,v3,m6
++	vfmin(@_[0..2],2,0,$_[3]);	# m6 is the 4th caller arg (was $_[5]: always undef, encoded as 0)
++}
++sub vfmindb {	# vfmin with m4=3, m5=0; caller args: v1,v2,v3,m6
++	vfmin(@_[0..2],3,0,$_[3]);	# m6 is the 4th caller arg (was $_[5]: always undef, encoded as 0)
++}
++sub wfminsb {	# vfmin with m4=2, m5=8; caller args: v1,v2,v3,m6
++	vfmin(@_[0..2],2,8,$_[3]);	# m6 is the 4th caller arg (was $_[5]: always undef, encoded as 0)
++}
++sub wfmindb {	# vfmin with m4=3, m5=8; caller args: v1,v2,v3,m6
++	vfmin(@_[0..2],3,8,$_[3]);	# m6 is the 4th caller arg (was $_[5]: always undef, encoded as 0)
++}
++sub wfminxb {	# vfmin with m4=4, m5=8; caller args: v1,v2,v3,m6
++	vfmin(@_[0..2],4,8,$_[3]);	# m6 is the 4th caller arg (was $_[5]: always undef, encoded as 0)
++}
++
++sub vfmsb {
++	vfm(@_,2,0);
++}
++sub wfmsb {
++	vfm(@_,2,8);
++}
++sub wfmxb {
++	vfm(@_,4,8);
++}
++
++sub vfmasb {
++	vfma(@_,0,2);
++}
++sub wfmasb {
++	vfma(@_,8,2);
++}
++sub wfmaxb {
++	vfma(@_,8,4);
++}
++
++sub vfmssb {
++	vfms(@_,0,2);
++}
++sub wfmssb {
++	vfms(@_,8,2);
++}
++sub wfmsxb {
++	vfms(@_,8,4);
++}
++
++sub vfnma {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRe(0xe79f,@_);
++}
++sub vfnmasb {
++	vfnma(@_,0,2);
++}
++sub vfnmadb {
++	vfnma(@_,0,3);
++}
++sub wfnmasb {
++	vfnma(@_,8,2);
++}
++sub wfnmadb {
++	vfnma(@_,8,3);
++}
++sub wfnmaxb {
++	vfnma(@_,8,4);
++}
++
++sub vfnms {
++	confess(err("ARGNUM")) if ($#_!=5);
++	VRRe(0xe79e,@_);
++}
++sub vfnmssb {
++	vfnms(@_,0,2);
++}
++sub vfnmsdb {
++	vfnms(@_,0,3);
++}
++sub wfnmssb {
++	vfnms(@_,8,2);
++}
++sub wfnmsdb {
++	vfnms(@_,8,3);
++}
++sub wfnmsxb {
++	vfnms(@_,8,4);
++}
++
++sub vfpsosb {
++	vfpso(@_[0..1],2,0,$_[2]);
++}
++sub wfpsosb {
++	vfpso(@_[0..1],2,8,$_[2]);
++}
++sub vflcsb {
++	vfpso(@_,2,0,0);
++}
++sub wflcsb {
++	vfpso(@_,2,8,0);
++}
++sub vflnsb {
++	vfpso(@_,2,0,1);
++}
++sub wflnsb {
++	vfpso(@_,2,8,1);
++}
++sub vflpsb {
++	vfpso(@_,2,0,2);
++}
++sub wflpsb {
++	vfpso(@_,2,8,2);
++}
++sub vfpsoxb {
++	vfpso(@_[0..1],4,0,$_[2]);
++}
++sub wfpsoxb {
++	vfpso(@_[0..1],4,8,$_[2]);
++}
++sub vflcxb {
++	vfpso(@_,4,0,0);
++}
++sub wflcxb {
++	vfpso(@_,4,8,0);
++}
++sub vflnxb {
++	vfpso(@_,4,0,1);
++}
++sub wflnxb {
++	vfpso(@_,4,8,1);
++}
++sub vflpxb {
++	vfpso(@_,4,0,2);
++}
++sub wflpxb {
++	vfpso(@_,4,8,2);
++}
++
++sub vfsqsb {
++	vfsq(@_,2,0);
++}
++sub wfsqsb {
++	vfsq(@_,2,8);
++}
++sub wfsqxb {
++	vfsq(@_,4,8);
++}
++
++sub vfssb {
++	vfs(@_,2,0);
++}
++sub wfssb {
++	vfs(@_,2,8);
++}
++sub wfsxb {
++	vfs(@_,4,8);
++}
++
++sub vftcisb {
++	vftci(@_,2,0);
++}
++sub wftcisb {
++	vftci(@_,2,8);
++}
++sub wftcixb {
++	vftci(@_,4,8);
++}
++
++# VXD - Support Instructions
++
++sub vlrlr {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRSd(0xe637,@_);
++}
++
++sub vlrl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VSI(0xe635,@_);
++}
++
++sub vstrlr {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRSd(0xe63f,@_);
++}
++
++sub vstrl {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VSI(0xe63d,@_);
++}
++
++sub vap {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe671,@_);
++}
++
++sub vcp {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRh(0xe677,@_);
++}
++
++sub vcvb {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRi(0xe650,@_);
++}
++
++sub vcvbg {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRRi(0xe652,@_);
++}
++
++sub vcvd {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRIi(0xe658,@_);
++}
++
++sub vcvdg {
++	confess(err("ARGNUM")) if ($#_!=3);
++	VRIi(0xe65a,@_);
++}
++
++sub vdp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe67a,@_);
++}
++
++sub vlip {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VRIh(0xe649,@_);
++}
++
++sub vmp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe678,@_);
++}
++
++sub vmsp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe679,@_);
++}
++
++sub vpkz {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VSI(0xe634,@_);
++}
++
++sub vpsop {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIg(0xe65b,@_);
++}
++
++sub vrp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe67b,@_);
++}
++
++sub vsdp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe67e,@_);
++}
++
++sub vsrp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIg(0xe659,@_);
++}
++
++sub vsp {
++	confess(err("ARGNUM")) if ($#_!=4);
++	VRIf(0xe673,@_);
++}
++
++sub vtp {
++	confess(err("ARGNUM")) if ($#_!=0);
++	VRRg(0xe65f,@_);
++}
++
++sub vupkz {
++	confess(err("ARGNUM")) if ($#_!=2);
++	VSI(0xe63c,@_);
++}
++
++#
++# Instruction Formats
++#
++
++sub RIEf {
++	confess(err("ARGNUM")) if ($#_<4||5<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$r2,$i3,$i4,$i5)=(shift,get_R(shift),get_R(shift),
++					  get_I(shift,8),get_I(shift,8),
++					  get_I(shift,8));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",(($opcode>>8)<<8|$r1<<4|$r2)).",";
++	$out.=sprintf("%#06x",($i3<<8)|$i4).",";
++	$out.=sprintf("%#06x",($i5<<8)|($opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub RILa {
++	confess(err("ARGNUM")) if ($#_!=2);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$i2)=(shift,get_R(shift),get_I(shift,32));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",(($opcode>>4)<<8|$r1<<4|($opcode&0xf))).",";
++	$out.=sprintf("%#06x",($i2>>16)).",";
++	$out.=sprintf("%#06x",($i2&0xffff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub RRE {
++	confess(err("ARGNUM")) if ($#_<0||2<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$r2)=(shift,get_R(shift),get_R(shift));
++
++	$out.="\t.long\t".sprintf("%#010x",($opcode<<16|$r1<<4|$r2));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub RRFb {
++	confess(err("ARGNUM")) if ($#_<3||4<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$r3,$r2,$m4)=(shift,get_R(shift),get_R(shift)
++	    ,get_R(shift),get_M(shift));
++
++	$out.="\t.long\t"
++	    .sprintf("%#010x",($opcode<<16|$r3<<12|$m4<<8|$r1<<4|$r2));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub RXYa {
++	confess(err("ARGNUM")) if ($#_!=2);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$d2,$x2,$b2)=(shift,get_R(shift),get_DXB(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",(($opcode>>8)<<8|$r1<<4|$x2)).",";
++	$out.=sprintf("%#06x",($b2<<12|($d2&0xfff))).",";
++	$out.=sprintf("%#06x",(($d2>>12)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub S {
++	confess(err("ARGNUM")) if ($#_<0||1<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$d2,$b2)=(shift,get_DB(shift));
++
++	$out.="\t.long\t".sprintf("%#010x",($opcode<<16|$b2<<12|$d2));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIa {
++	confess(err("ARGNUM")) if ($#_<2||3<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$i2,$m3)=(shift,get_V(shift),get_I(shift,16),
++	    get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";
++	$out.=sprintf("%#06x",$i2).",";
++	$out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIb {
++	confess(err("ARGNUM")) if ($#_!=4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$i2,$i3,$m4)=(shift,get_V(shift),get_I(shift,8),
++	    ,get_I(shift,8),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";
++	$out.=sprintf("%#06x",($i2<<8|$i3)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIc {
++	confess(err("ARGNUM")) if ($#_!=4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v3,$i2,$m4)=(shift,get_V(shift),get_V(shift),
++	    ,get_I(shift,16),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v3&0xf)).",";
++	$out.=sprintf("%#06x",$i2).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRId {
++	confess(err("ARGNUM")) if ($#_<4||$#_>5);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
++	    ,get_V(shift),get_I(shift,8),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$i4)).",";
++	$out.=sprintf("%#06x",($m5<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIe {
++	confess(err("ARGNUM")) if ($#_!=5);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$i3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
++	    ,get_I(shift,12),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";
++	$out.=sprintf("%#06x",($i3<<4|$m5)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIf {
++	confess(err("ARGNUM")) if ($#_!=5);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
++	    ,get_V(shift),get_I(shift,8),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)|$i4>>4).",";
++	$out.=sprintf("%#06x",(($i4&0xf)<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIg {
++	confess(err("ARGNUM")) if ($#_!=5);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$i3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
++	    ,get_I(shift,8),get_I(shift,8),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";
++	$out.=sprintf("%#06x",($i4<<8|$m5<<4|$i3>>4)).",";
++	$out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIh {
++	confess(err("ARGNUM")) if ($#_!=3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$i2,$i3)=(shift,get_V(shift),get_I(shift,16),
++	    get_I(shift,4));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";
++	$out.=sprintf("%#06x",$i2).",";
++	$out.=sprintf("%#06x",($i3<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRIi {
++	confess(err("ARGNUM")) if ($#_!=4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$r2,$i3,$m4)=(shift,get_V(shift),get_R(shift),
++	    ,get_I(shift,8),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|$r2).",";
++	$out.=sprintf("%#06x",($m4<<4|$i3>>4)).",";
++	$out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRa {
++	confess(err("ARGNUM")) if ($#_<2||5<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$m3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
++	    get_M(shift),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",($m5<<4|$m4)).",";
++	$out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRb {
++	confess(err("ARGNUM")) if ($#_<3||5<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
++	    get_V(shift),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRc {
++	confess(err("ARGNUM")) if ($#_<3||6<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$m4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
++	    get_V(shift),get_M(shift),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<4|$m5)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRd {
++	confess(err("ARGNUM")) if ($#_<4||6<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
++	    get_V(shift),get_V(shift),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<8|$m6<<4)).",";
++	$out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRe {
++	confess(err("ARGNUM")) if ($#_<4||6<$#_);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
++	    get_V(shift),get_V(shift),get_M(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<8|$m5)).",";
++	$out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRf {
++	confess(err("ARGNUM")) if ($#_!=3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$r2,$r3)=(shift,get_V(shift),get_R(shift),
++	    get_R(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r2)).",";
++	$out.=sprintf("%#06x",($r3<<12)).",";
++	$out.=sprintf("%#06x",(RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRg {
++	confess(err("ARGNUM")) if ($#_!=1);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1)=(shift,get_V(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).",";
++	$out.=sprintf("%#06x",0x0000).",";
++	$out.=sprintf("%#06x",(RXB(0,$v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRh {
++	confess(err("ARGNUM")) if ($#_<2||$#_>3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v2,$m3)=(shift,get_V(shift),get_V(shift),
++	    get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).",";
++	$out.=sprintf("%#06x",(($v2&0xf)<<12|$m3<<4)).",";
++	$out.=sprintf("%#06x",(RXB(0,$v1,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRRi {
++	confess(err("ARGNUM")) if ($#_!=3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$v2,$m3)=(shift,get_R(shift),get_V(shift),
++	    get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",($m3<<4))."\,";
++	$out.=sprintf("%#06x",(RXB(0,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRSa {
++	confess(err("ARGNUM")) if ($#_<3||$#_>4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$v3,$d2,$b2,$m4)=(shift,get_V(shift),get_V(shift),
++	    get_DB(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v3&0xf))).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRSb {
++	confess(err("ARGNUM")) if ($#_<3||$#_>4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$r3,$d2,$b2,$m4)=(shift,get_V(shift),get_R(shift),
++	    get_DB(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r3)).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRSc {
++	confess(err("ARGNUM")) if ($#_!=4);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$r1,$v3,$d2,$b2,$m4)=(shift,get_R(shift),get_V(shift),
++	    get_DB(shift),get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v3&0xf))).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",($m4<<12|RXB(0,$v3)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRSd {
++	confess(err("ARGNUM")) if ($#_!=3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$r3,$d2,$b2)=(shift,get_V(shift),get_R(shift),
++	    get_DB(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|$r3)).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRV {
++	confess(err("ARGNUM")) if ($#_<2||$#_>3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$d2,$v2,$b2,$m3)=(shift,get_V(shift),get_DVB(shift),
++	    get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VRX {
++	confess(err("ARGNUM")) if ($#_<2||$#_>3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$d2,$x2,$b2,$m3)=(shift,get_V(shift),get_DXB(shift),
++	    get_M(shift));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($x2))).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++sub VSI {
++	confess(err("ARGNUM")) if ($#_!=3);
++	my $ops=join(',',@_[1..$#_]);
++	my $memn=(caller(1))[3];
++	$memn=~s/^.*:://;
++	my ($opcode,$v1,$d2,$b2,$i3)=(shift,get_V(shift),get_DB(shift),
++	    get_I(shift,8));
++
++	$out.="\t.word\t";
++	$out.=sprintf("%#06x",($opcode&0xff00|$i3)).",";
++	$out.=sprintf("%#06x",($b2<<12|$d2)).",";
++	$out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff));
++	$out.="\t# $memn\t$ops\n"
++}
++
++#
++# Internal
++#
++
++sub get_R {	# parse one general-register operand and return its number
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my $r;
++
++	for (shift) {	# alias the single argument to $_
++		if (!defined) {
++			$r=0;	# omitted operand encodes as 0
++		} elsif (/^$GR$/) {	# $GR is defined outside this chunk; relied on to capture the number in $1
++			$r=$1;
++		} else {
++			confess(err("PARSE"));
++		}
++	}
++	confess(err("ARGRANGE")) if ($r&~0xf);	# must fit a 4-bit field
++
++	return $r;
++}
++
++sub get_V {	# parse one vector-register operand and return its number
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my $v;
++
++	for (shift) {	# alias the single argument to $_
++		if (!defined) {
++			$v=0;	# omitted operand encodes as 0
++		} elsif (/^$VR$/) {	# $VR is defined outside this chunk; relied on to capture the number in $1
++			$v=$1;
++		} else {
++			confess(err("PARSE"));
++		}
++	}
++	confess(err("ARGRANGE")) if ($v&~0x1f);	# 5 bits allowed; callers put the low 4 in the field and bit 0x10 into RXB
++
++	return $v;
++}
++
++sub get_I {	# parse an immediate operand; args: (expression, field width in bits)
++	confess(err("ARGNUM")) if ($#_!=1);
++	my ($i,$bits)=(shift,shift);
++
++	$i=defined($i)?(eval($i)):(0);	# operand is evaluated as a Perl expression; undef means 0
++	confess(err("PARSE")) if (!defined($i));	# eval yielded undef
++	confess(err("ARGRANGE")) if (abs($i)&~(2**$bits-1));	# magnitude must fit in $bits bits
++
++	return $i&(2**$bits-1);	# masked to the low $bits bits (applies to negative values too)
++}
++
++sub get_M {	# parse a mask operand and return its value
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my $m=shift;
++
++	$m=defined($m)?(eval($m)):(0);	# operand is evaluated as a Perl expression; undef means 0
++	confess(err("PARSE")) if (!defined($m));	# eval yielded undef
++	confess(err("ARGRANGE")) if ($m&~0xf);	# must fit a 4-bit field
++
++	return $m;
++}
++
++sub get_DB	# parse a "disp(base)" or bare "disp" operand; returns ($d,$b)
++{
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my ($d,$b);
++
++	for (shift) {	# alias the single argument to $_
++		if (!defined) {
++			($d,$b)=(0,0);	# omitted operand encodes as all zeros
++		} elsif (/^(.+)\($GR\)$/) {	# disp(base); $GR (defined outside this chunk) is relied on to capture as $2
++			($d,$b)=(eval($1),$2);	# displacement is evaluated as a Perl expression
++			confess(err("PARSE")) if (!defined($d));
++		} elsif (/^(.+)$/) {	# bare displacement, base 0
++			($d,$b)=(eval($1),0);
++			confess(err("PARSE")) if (!defined($d));
++		} else {
++			confess(err("PARSE"));
++		}
++	}
++	confess(err("ARGRANGE")) if ($d&~0xfff||$b&~0xf);	# 12-bit displacement, 4-bit base
++
++	return ($d,$b);
++}
++
++sub get_DVB	# parse "disp(vreg,base)", "disp(base)" or bare "disp"; returns ($d,$v,$b)
++{
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my ($d,$v,$b);
++
++	for (shift) {	# alias the single argument to $_
++		if (!defined) {
++			($d,$v,$b)=(0,0,0);	# omitted operand encodes as all zeros
++		} elsif (/^(.+)\($VR,$GR\)$/) {	# $VR/$GR (defined outside this chunk) are relied on to capture as $2/$3
++			($d,$v,$b)=(eval($1),$2,$3);	# displacement is evaluated as a Perl expression
++			confess(err("PARSE")) if (!defined($d));
++		} elsif (/^(.+)\($GR\)$/) {	# disp(base), vector index 0
++			($d,$v,$b)=(eval($1),0,$2);
++			confess(err("PARSE")) if (!defined($d));
++		} elsif (/^(.+)$/) {	# bare displacement, index and base 0
++			($d,$v,$b)=(eval($1),0,0);
++			confess(err("PARSE")) if (!defined($d));
++		} else {
++			confess(err("PARSE"));
++		}
++	}
++	confess(err("ARGRANGE")) if ($d&~0xfff||$v&~0x1f||$b&~0xf);	# 12-bit disp, 5-bit vreg, 4-bit base
++
++	return ($d,$v,$b);
++}
++
++sub get_DXB	# parse "disp(index,base)", "disp(base)" or bare "disp"; returns ($d,$x,$b)
++{
++	confess(err("ARGNUM")) if ($#_!=0);	# exactly one argument (may be undef)
++	my ($d,$x,$b);
++
++	for (shift) {	# alias the single argument to $_
++		if (!defined) {
++			($d,$x,$b)=(0,0,0);	# omitted operand encodes as all zeros
++		} elsif (/^(.+)\($GR,$GR\)$/) {	# $GR (defined outside this chunk) is relied on to capture as $2 and $3
++			($d,$x,$b)=(eval($1),$2,$3);	# displacement is evaluated as a Perl expression
++			confess(err("PARSE")) if (!defined($d));
++		} elsif (/^(.+)\($GR\)$/) {	# disp(base), index 0
++			($d,$x,$b)=(eval($1),0,$2);
++			confess(err("PARSE")) if (!defined($d));
++		} elsif (/^(.+)$/) {	# bare displacement, index and base 0
++			($d,$x,$b)=(eval($1),0,0);
++			confess(err("PARSE")) if (!defined($d));
++		} else {
++			confess(err("PARSE"));
++		}
++	}
++	confess(err("ARGRANGE")) if ($d&~0xfff||$x&~0xf||$b&~0xf);	# 12-bit disp only, 4-bit index and base
++
++	return ($d,$x,$b);
++}
++
++sub RXB	# build the 4-bit RXB value from bit 0x10 of up to four register numbers
++{
++	confess(err("ARGNUM")) if ($#_<0||3<$#_);	# one to four arguments
++	my $rxb=0;
++
++	$rxb|=0x08 if (defined($_[0])&&($_[0]&0x10));	# 1st position
++	$rxb|=0x04 if (defined($_[1])&&($_[1]&0x10));	# 2nd position
++	$rxb|=0x02 if (defined($_[2])&&($_[2]&0x10));	# 3rd position
++	$rxb|=0x01 if (defined($_[3])&&($_[3]&0x10));	# 4th position
++
++	return $rxb;
++}
++
++sub err {	# map an error key (ARGNUM, ARGRANGE, PARSE) to its message text
++	my %ERR		=
++	(
++		ARGNUM	=>	'Wrong number of arguments',
++		ARGRANGE=>	'Argument out of range',
++		PARSE	=>	'Parse error',
++	);
++	confess($ERR{ARGNUM}) if ($#_!=0);	# err() itself takes exactly one key
++
++	return $ERR{$_[0]};	# undef for an unknown key
++}
++
++1;
+diff -up openssl-1.1.1b/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update openssl-1.1.1b/crypto/poly1305/asm/poly1305-s390x.pl
+--- openssl-1.1.1b/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update	2019-02-26 15:15:30.000000000 +0100
++++ openssl-1.1.1b/crypto/poly1305/asm/poly1305-s390x.pl	2019-05-06 10:59:30.860784805 +0200
+@@ -24,204 +24,961 @@
+ #
+ # On side note, z13 enables vector base 2^26 implementation...
+ 
+-$flavour = shift;
++#
++# January 2019
++#
++# Add vx code path (base 2^26).
++#
++# Copyright IBM Corp. 2019
++# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
++
++#
++# January 2019
++#
++# Add vector base 2^26 implementation. It's problematic to accurately
++# measure performance, because reference system is hardly idle. But
++# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
++# >=20% faster than IBM's submission on long inputs, and much faster on
++# short ones, because calculation of key powers is postponed till we
++# know that input is long enough to justify the additional overhead.
++
++use strict;
++use FindBin qw($Bin);
++use lib "$Bin/../..";
++use perlasm::s390x qw(:DEFAULT :LD :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);
++
++my $flavour = shift;
+ 
++my ($z,$SIZE_T);
+ if ($flavour =~ /3[12]/) {
++	$z=0;	# S/390 ABI
+ 	$SIZE_T=4;
+-	$g="";
+ } else {
++	$z=1;	# zSeries ABI
+ 	$SIZE_T=8;
+-	$g="g";
+ }
+ 
++my $output;
+ while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+-open STDOUT,">$output";
+ 
+-$sp="%r15";
++my $stdframe=16*$SIZE_T+4*8;
++my $sp="%r15";
+ 
+ my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
+ 
+-$code.=<<___;
+-.text
++PERLASM_BEGIN($output);
+ 
+-.globl	poly1305_init
+-.type	poly1305_init,\@function
+-.align	16
+-poly1305_init:
+-	lghi	%r0,0
+-	lghi	%r1,-1
+-	stg	%r0,0($ctx)		# zero hash value
+-	stg	%r0,8($ctx)
+-	stg	%r0,16($ctx)
+-
+-	cl${g}r	$inp,%r0
+-	je	.Lno_key
+-
+-	lrvg	%r4,0($inp)		# load little-endian key
+-	lrvg	%r5,8($inp)
+-
+-	nihl	%r1,0xffc0		# 0xffffffc0ffffffff
+-	srlg	%r0,%r1,4		# 0x0ffffffc0fffffff
+-	srlg	%r1,%r1,4
+-	nill	%r1,0xfffc		# 0x0ffffffc0ffffffc
+-
+-	ngr	%r4,%r0
+-	ngr	%r5,%r1
+-
+-	stg	%r4,32($ctx)
+-	stg	%r5,40($ctx)
+-
+-.Lno_key:
+-	lghi	%r2,0
+-	br	%r14
+-.size	poly1305_init,.-poly1305_init
+-___
++INCLUDE	("s390x_arch.h");
++TEXT	();
++
++################
++# int poly1305_init(void *ctx, const unsigned char key[16], void *func[2])
++{
++GLOBL	("poly1305_init");
++TYPE	("poly1305_init","\@function");
++ALIGN	(16);
++LABEL	("poly1305_init");
++	lghi	("%r0",0);
++	lghi	("%r1",-1);
++	stg	("%r0","0($ctx)");		# zero hash value
++	stg	("%r0","8($ctx)");
++	stg	("%r0","16($ctx)");
++	st	("%r0","24($ctx)");		# clear is_base2_26
++	lgr	("%r5",$ctx);			# reassign $ctx
++	lghi	("%r2",0);
++
++&{$z?	\&clgr:\&clr}	($inp,"%r0");
++	je	(".Lno_key");
++
++	lrvg	("%r2","0($inp)");		# load little-endian key
++	lrvg	("%r3","8($inp)");
++
++	nihl	("%r1",0xffc0);			# 0xffffffc0ffffffff
++	srlg	("%r0","%r1",4);		# 0x0ffffffc0fffffff
++	srlg	("%r1","%r1",4);
++	nill	("%r1",0xfffc);			# 0x0ffffffc0ffffffc
++
++	ngr	("%r2","%r0");
++	ngr	("%r3","%r1");
++
++	stmg	("%r2","%r3","32(%r5)");
++
++	larl	("%r1","OPENSSL_s390xcap_P");
++	lg	("%r0","16(%r1)");
++	srlg	("%r0","%r0",62);
++	nill	("%r0",1);			# extract vx bit
++	lcgr	("%r0","%r0");
++	larl	("%r1",".Lpoly1305_blocks");
++	larl	("%r2",".Lpoly1305_blocks_vx");
++	larl	("%r3",".Lpoly1305_emit");
++&{$z?	\&xgr:\&xr}	("%r2","%r1");		# select between scalar and vector
++&{$z?	\&ngr:\&nr}	("%r2","%r0");
++&{$z?	\&xgr:\&xr}	("%r2","%r1");
++&{$z?	\&stmg:\&stm}	("%r2","%r3","0(%r4)");
++	lghi	("%r2",1);
++LABEL	(".Lno_key");
++	br	("%r14");
++SIZE	("poly1305_init",".-poly1305_init");
++}
++
++################
++# static void poly1305_blocks(void *ctx, const unsigned char *inp,
++#                             size_t len, u32 padbit)
+ {
+ my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
+ my ($r0,$r1,$s1) = map("%r$_",(0..2));
+ 
+-$code.=<<___;
+-.globl	poly1305_blocks
+-.type	poly1305_blocks,\@function
+-.align	16
+-poly1305_blocks:
+-	srl${g}	$len,4			# fixed-up in 64-bit build
+-	lghi	%r0,0
+-	cl${g}r	$len,%r0
+-	je	.Lno_data
+-
+-	stm${g}	%r6,%r14,`6*$SIZE_T`($sp)
+-
+-	llgfr   $padbit,$padbit		# clear upper half, much needed with
+-					# non-64-bit ABI
+-	lg	$r0,32($ctx)		# load key
+-	lg	$r1,40($ctx)
+-
+-	lg	$h0,0($ctx)		# load hash value
+-	lg	$h1,8($ctx)
+-	lg	$h2,16($ctx)
+-
+-	st$g	$ctx,`2*$SIZE_T`($sp)	# off-load $ctx
+-	srlg	$s1,$r1,2
+-	algr	$s1,$r1			# s1 = r1 + r1>>2
+-	j	.Loop
+-
+-.align	16
+-.Loop:
+-	lrvg	$d0lo,0($inp)		# load little-endian input
+-	lrvg	$d1lo,8($inp)
+-	la	$inp,16($inp)
+-
+-	algr	$d0lo,$h0		# accumulate input
+-	alcgr	$d1lo,$h1
+-
+-	lgr	$h0,$d0lo
+-	mlgr	$d0hi,$r0		# h0*r0	  -> $d0hi:$d0lo
+-	lgr	$h1,$d1lo
+-	mlgr	$d1hi,$s1		# h1*5*r1 -> $d1hi:$d1lo
+-
+-	mlgr	$t0,$r1			# h0*r1   -> $t0:$h0
+-	mlgr	$t1,$r0			# h1*r0   -> $t1:$h1
+-	alcgr	$h2,$padbit
+-
+-	algr	$d0lo,$d1lo
+-	lgr	$d1lo,$h2
+-	alcgr	$d0hi,$d1hi
+-	lghi	$d1hi,0
+-
+-	algr	$h1,$h0
+-	alcgr	$t1,$t0
+-
+-	msgr	$d1lo,$s1		# h2*s1
+-	msgr	$h2,$r0			# h2*r0
+-
+-	algr	$h1,$d1lo
+-	alcgr	$t1,$d1hi		# $d1hi is zero
+-
+-	algr	$h1,$d0hi
+-	alcgr	$h2,$t1
+-
+-	lghi	$h0,-4			# final reduction step
+-	ngr	$h0,$h2
+-	srlg	$t0,$h2,2
+-	algr	$h0,$t0
+-	lghi	$t1,3
+-	ngr	$h2,$t1
+-
+-	algr	$h0,$d0lo
+-	alcgr	$h1,$d1hi		# $d1hi is still zero
+-	alcgr	$h2,$d1hi		# $d1hi is still zero
+-
+-	brct$g	$len,.Loop
+-
+-	l$g	$ctx,`2*$SIZE_T`($sp)	# restore $ctx
+-
+-	stg	$h0,0($ctx)		# store hash value
+-	stg	$h1,8($ctx)
+-	stg	$h2,16($ctx)
+-
+-	lm${g}	%r6,%r14,`6*$SIZE_T`($sp)
+-.Lno_data:
+-	br	%r14
+-.size	poly1305_blocks,.-poly1305_blocks
+-___
++GLOBL	("poly1305_blocks");
++TYPE	("poly1305_blocks","\@function");
++ALIGN	(16);
++LABEL	("poly1305_blocks");
++LABEL	(".Lpoly1305_blocks");
++&{$z?	\&ltgr:\&ltr}	("%r0",$len);		# nothing to do for zero length
++	jz	(".Lno_data");
++
++&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");	# reloaded before returning
++
++	lg	($h0,"0($ctx)");		# load hash value
++	lg	($h1,"8($ctx)");
++	lg	($h2,"16($ctx)");
++
++LABEL	(".Lpoly1305_blocks_entry");		# poly1305_blocks_vx jumps here with $h0-$h2 set
++if ($z) {
++	srlg	($len,$len,4);			# $len = number of 16-byte blocks
++} else {
++	srl	($len,4);
++}
++	llgfr   ($padbit,$padbit);		# clear upper half, much needed with
++						# non-64-bit ABI
++	lg	($r0,"32($ctx)");		# load key
++	lg	($r1,"40($ctx)");
++
++&{$z?	\&stg:\&st}	($ctx,"2*$SIZE_T($sp)");	# off-load $ctx, $s1 shares its register
++	srlg	($s1,$r1,2);
++	algr	($s1,$r1);			# s1 = r1 + r1>>2
++	j	(".Loop");
++
++ALIGN	(16);
++LABEL	(".Loop");
++	lrvg	($d0lo,"0($inp)");		# load little-endian input
++	lrvg	($d1lo,"8($inp)");
++	la	($inp,"16($inp)");		# advance to the next block
++
++	algr	($d0lo,$h0);			# accumulate input
++	alcgr	($d1lo,$h1);
++	alcgr	($h2,$padbit);			# padbit plus carry into the top limb
++
++	lgr	($h0,$d0lo);
++	mlgr	($d0hi,$r0);			# h0*r0	  -> $d0hi:$d0lo
++	lgr	($h1,$d1lo);
++	mlgr	($d1hi,$s1);			# h1*5*r1 -> $d1hi:$d1lo
++
++	mlgr	($t0,$r1);			# h0*r1   -> $t0:$h0
++	mlgr	($t1,$r0);			# h1*r0   -> $t1:$h1
++
++	algr	($d0lo,$d1lo);
++	lgr	($d1lo,$h2);
++	alcgr	($d0hi,$d1hi);
++	lghi	($d1hi,0);
++
++	algr	($h1,$h0);
++	alcgr	($t1,$t0);
++
++	msgr	($d1lo,$s1);			# h2*s1
++	msgr	($h2,$r0);			# h2*r0
++
++	algr	($h1,$d1lo);
++	alcgr	($t1,$d1hi);			# $d1hi is zero
++
++	algr	($h1,$d0hi);
++	alcgr	($h2,$t1);
++
++	lghi	($h0,-4);			# final reduction step
++	ngr	($h0,$h2);
++	srlg	($t0,$h2,2);
++	algr	($h0,$t0);
++	lghi	($t1,3);
++	ngr	($h2,$t1);			# keep the low two bits of the top limb
++
++	algr	($h0,$d0lo);
++	alcgr	($h1,$d1hi);			# $d1hi is still zero
++	alcgr	($h2,$d1hi);			# $d1hi is still zero
++
++&{$z?	\&brctg:\&brct}	($len,".Loop");		# count down the remaining blocks
++
++&{$z?	\&lg:\&l}	($ctx,"2*$SIZE_T($sp)");# restore $ctx
++
++	stg	($h0,"0($ctx)");		# store hash value
++	stg	($h1,"8($ctx)");
++	stg	($h2,"16($ctx)");
++
++&{$z?	\&lmg:\&lm}	("%r6","%r14","6*$SIZE_T($sp)");
++LABEL	(".Lno_data");
++	br	("%r14");
++SIZE	("poly1305_blocks",".-poly1305_blocks");
+ }
++
++################
++# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
++#                                size_t len, u32 padbit)
++{
++my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
++my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
++my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
++my      ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
++my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
++my      ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
++my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));
++
++my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));
++
++TYPE	("poly1305_blocks_vx","\@function");
++ALIGN	(16);
++LABEL	("poly1305_blocks_vx");
++LABEL	(".Lpoly1305_blocks_vx");
++&{$z?	\&clgfi:\&clfi} ($len,128);
++	jhe	("__poly1305_blocks_vx");	# 128 bytes or more: use the vector code
++
++&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");	# reloaded at the end of the scalar loop
++
++	lg	($d0,"0($ctx)");		# stored hash, either radix
++	lg	($d1,"8($ctx)");
++	lg	($d2,"16($ctx)");
++
++	llgfr	("%r0",$d0);			# base 2^26 -> base 2^64
++	srlg	($h0,$d0,32);			# word at ctx+0
++	llgfr	("%r1",$d1);			# word at ctx+12
++	srlg	($h1,$d1,32);			# word at ctx+8
++	srlg	($h2,$d2,32);			# word at ctx+16
++
++	sllg	("%r0","%r0",26);
++	algr	($h0,"%r0");
++	sllg	("%r0",$h1,52);
++	srlg	($h1,$h1,12);
++	sllg	("%r1","%r1",14);
++	algr	($h0,"%r0");
++	alcgr	($h1,"%r1");
++	sllg	("%r0",$h2,40);
++	srlg	($h2,$h2,24);
++	lghi	("%r1",0);
++	algr	($h1,"%r0");
++	alcgr	($h2,"%r1");			# %r1 is zero, this only adds the carry
++
++	llgf	("%r0","24($ctx)");		# is_base2_26
++	lcgr	("%r0","%r0");			# 0 stays 0, 1 becomes an all-ones mask
++
++	xgr	($h0,$d0);			# choose between radixes
++	xgr	($h1,$d1);
++	xgr	($h2,$d2);
++	ngr	($h0,"%r0");
++	ngr	($h1,"%r0");
++	ngr	($h2,"%r0");
++	xgr	($h0,$d0);			# converted value if is_base2_26, else value as loaded
++	xgr	($h1,$d1);
++	xgr	($h2,$d2);
++
++	lhi	("%r0",0);
++	st	("%r0","24($ctx)");		# clear is_base2_26
++
++	j	(".Lpoly1305_blocks_entry");	# finish in the scalar code
++SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");
++
++TYPE	("__poly1305_mul","\@function");
++ALIGN	(16);
++LABEL	("__poly1305_mul");
++	vmlof		($ACC0,$H0,$R0);	# $H0 times $R0..$R4, odd word elements
++	vmlof		($ACC1,$H0,$R1);
++	vmlof		($ACC2,$H0,$R2);
++	vmlof		($ACC3,$H0,$R3);
++	vmlof		($ACC4,$H0,$R4);
++
++	vmalof		($ACC0,$H1,$S4,$ACC0);	# add $H1 times $S4,$R0..$R3
++	vmalof		($ACC1,$H1,$R0,$ACC1);
++	vmalof		($ACC2,$H1,$R1,$ACC2);
++	vmalof		($ACC3,$H1,$R2,$ACC3);
++	vmalof		($ACC4,$H1,$R3,$ACC4);
++
++	vmalof		($ACC0,$H2,$S3,$ACC0);	# add $H2 times $S3,$S4,$R0..$R2
++	vmalof		($ACC1,$H2,$S4,$ACC1);
++	vmalof		($ACC2,$H2,$R0,$ACC2);
++	vmalof		($ACC3,$H2,$R1,$ACC3);
++	vmalof		($ACC4,$H2,$R2,$ACC4);
++
++	vmalof		($ACC0,$H3,$S2,$ACC0);	# add $H3 times $S2..$S4,$R0,$R1
++	vmalof		($ACC1,$H3,$S3,$ACC1);
++	vmalof		($ACC2,$H3,$S4,$ACC2);
++	vmalof		($ACC3,$H3,$R0,$ACC3);
++	vmalof		($ACC4,$H3,$R1,$ACC4);
++
++	vmalof		($ACC0,$H4,$S1,$ACC0);	# add $H4 times $S1..$S4,$R0
++	vmalof		($ACC1,$H4,$S2,$ACC1);
++	vmalof		($ACC2,$H4,$S3,$ACC2);
++	vmalof		($ACC3,$H4,$S4,$ACC3);
++	vmalof		($ACC4,$H4,$R0,$ACC4);
++
++	################################################################
++	# lazy reduction
++
++	vesrlg		($H4,$ACC3,26);		# carries out of limbs 3 and 0
++	vesrlg		($H1,$ACC0,26);
++	vn		($H3,$ACC3,$mask26);
++	vn		($H0,$ACC0,$mask26);
++	vag		($H4,$H4,$ACC4);	# h3 -> h4
++	vag		($H1,$H1,$ACC1);	# h0 -> h1
++
++	vesrlg		($ACC4,$H4,26);
++	vesrlg		($ACC1,$H1,26);
++	vn		($H4,$H4,$mask26);
++	vn		($H1,$H1,$mask26);
++	vag		($H0,$H0,$ACC4);	# carry*1 here, carry*4 added below: *5 in total
++	vag		($H2,$ACC2,$ACC1);	# h1 -> h2
++
++	veslg		($ACC4,$ACC4,2);	# <<2
++	vesrlg		($ACC2,$H2,26);
++	vn		($H2,$H2,$mask26);
++	vag		($H0,$H0,$ACC4);	# h4 -> h0
++	vag		($H3,$H3,$ACC2);	# h2 -> h3
++
++	vesrlg		($ACC0,$H0,26);
++	vesrlg		($ACC3,$H3,26);
++	vn		($H0,$H0,$mask26);
++	vn		($H3,$H3,$mask26);
++	vag		($H1,$H1,$ACC0);	# h0 -> h1
++	vag		($H4,$H4,$ACC3);	# h3 -> h4
++	br		("%r14");		# result is left in $H0..$H4
++SIZE	("__poly1305_mul",".-__poly1305_mul");
++
++TYPE	("__poly1305_blocks_vx","\@function");
++ALIGN	(16);
++LABEL	("__poly1305_blocks_vx");
++&{$z?	\&lgr:\&lr}	("%r0",$sp);		# caller's $sp, stored as back-chain below
++&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");	# reloaded at .Ldone
++if (!$z) {
++	std	("%f4","16*$SIZE_T+2*8($sp)");	# %f4 and %f6 are reloaded at .Ldone
++	std	("%f6","16*$SIZE_T+3*8($sp)");
++	ahi	($sp,-$stdframe);		# allocate a new frame
++	st	("%r0","0($sp)");		# back-chain
++
++	llgfr	($len,$len);			# so that srlg works on $len
++} else {
++	aghi	($sp,"-($stdframe+8*8)");	# new frame plus room for %f8-%f15
++	stg	("%r0","0($sp)");		# back-chain
++
++	std	("%f8","$stdframe+0*8($sp)");	# %f8-%f15 are reloaded at .Ldone
++	std	("%f9","$stdframe+1*8($sp)");
++	std	("%f10","$stdframe+2*8($sp)");
++	std	("%f11","$stdframe+3*8($sp)");
++	std	("%f12","$stdframe+4*8($sp)");
++	std	("%f13","$stdframe+5*8($sp)");
++	std	("%f14","$stdframe+6*8($sp)");
++	std	("%f15","$stdframe+7*8($sp)");
++}
++	larl	("%r1",".Lconst");		# %r1 = address of the constant table
++	vgmg	($mask26,38,63);		# low 26 bits of each doubleword
++	vlm	($bswaplo,$bswapmi,"16(%r1)");	# three byte swap masks from .Lconst+16
++
++	&lt	("%r0","24($ctx)");		# is_base2_26?
++	jnz	(".Lskip_init");		# yes: key powers and hash are already in base 2^26
++
++	lg	($h0,"32($ctx)");		# load key base 2^64
++	lg	($h1,"40($ctx)");
++
++	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
++	srlg	($d1,$h0,52);			# top 12 bits of the low key word
++	risbg	($h0,$h0,38,0x80+63,0);		# limb 0: key bits 0..25
++	vlvgg	($R0,$h0,0);
++	risbg	($d1,$h1,38,51,12);		# limb 2: key bits 52..77
++	vlvgg	($R1,$d0,0);			# limb 1: key bits 26..51
++	risbg	($d0,$h1,38,63,50);		# limb 3: key bits 78..103
++	vlvgg	($R2,$d1,0);
++	srlg	($d1,$h1,40);			# limb 4: key bits 104..127
++	vlvgg	($R3,$d0,0);
++	vlvgg	($R4,$d1,0);
++
++	veslg	($S1,$R1,2);			# S = 4*R here, R is added below
++	veslg	($S2,$R2,2);
++	veslg	($S3,$R3,2);
++	veslg	($S4,$R4,2);
++	vlr	($H0,$R0);			# multiplicand for __poly1305_mul
++	vlr	($H1,$R1);
++	vlr	($H2,$R2);
++	vlr	($H3,$R3);
++	vlr	($H4,$R4);
++	vag	($S1,$S1,$R1);			# * 5
++	vag	($S2,$S2,$R2);
++	vag	($S3,$S3,$R3);
++	vag	($S4,$S4,$R4);
++
++	brasl	("%r14","__poly1305_mul");	# r^1:- * r^1:-
++
++	vpdi	($R0,$H0,$R0,0);		# r^2:r^1
++	vpdi	($R1,$H1,$R1,0);
++	vpdi	($R2,$H2,$R2,0);
++	vpdi	($R3,$H3,$R3,0);
++	vpdi	($R4,$H4,$R4,0);
++	vpdi	($H0,$H0,$H0,0);		# r^2:r^2
++	vpdi	($H1,$H1,$H1,0);
++	vpdi	($H2,$H2,$H2,0);
++	vpdi	($H3,$H3,$H3,0);
++	vpdi	($H4,$H4,$H4,0);
++	veslg	($S1,$R1,2);
++	veslg	($S2,$R2,2);
++	veslg	($S3,$R3,2);
++	veslg	($S4,$R4,2);
++	vag	($S1,$S1,$R1);			# * 5
++	vag	($S2,$S2,$R2);
++	vag	($S3,$S3,$R3);
++	vag	($S4,$S4,$R4);
++
++	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1
++
++	vl	($I0,"0(%r1)");			# borrow $I0
++	vperm	($R0,$R0,$H0,$I0);		# r^2:r^4:r^1:r^3
++	vperm	($R1,$R1,$H1,$I0);
++	vperm	($R2,$R2,$H2,$I0);
++	vperm	($R3,$R3,$H3,$I0);
++	vperm	($R4,$R4,$H4,$I0);
++	veslf	($S1,$R1,2);			# word-sized shifts: four powers per vector now
++	veslf	($S2,$R2,2);
++	veslf	($S3,$R3,2);
++	veslf	($S4,$R4,2);
++	vaf	($S1,$S1,$R1);			# * 5
++	vaf	($S2,$S2,$R2);
++	vaf	($S3,$S3,$R3);
++	vaf	($S4,$S4,$R4);
++
++	lg	($h0,"0($ctx)");		# load hash base 2^64
++	lg	($h1,"8($ctx)");
++	lg	($h2,"16($ctx)");
++
++	vzero	($H0);				# vlvgg below only sets doubleword 0
++	vzero	($H1);
++	vzero	($H2);
++	vzero	($H3);
++	vzero	($H4);
++
++	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
++	srlg	($d1,$h0,52);
++	risbg	($h0,$h0,38,0x80+63,0);		# limb 0: hash bits 0..25
++	vlvgg	($H0,$h0,0);
++	risbg	($d1,$h1,38,51,12);		# limb 2: hash bits 52..77
++	vlvgg	($H1,$d0,0);			# limb 1: hash bits 26..51
++	risbg	($d0,$h1,38,63,50);		# limb 3: hash bits 78..103
++	vlvgg	($H2,$d1,0);
++	srlg	($d1,$h1,40);
++	vlvgg	($H3,$d0,0);
++	risbg	($d1,$h2,37,39,24);		# limb 4: top of $h1 plus low three bits of $h2
++	vlvgg	($H4,$d1,0);
++
++	lhi	("%r0",1);
++	st	("%r0","24($ctx)");		# set is_base2_26
++
++	vstm	($R0,$S4,"48($ctx)");		# save key schedule base 2^26
++
++	vpdi	($R0,$R0,$R0,0);		# broadcast r^2:r^4
++	vpdi	($R1,$R1,$R1,0);
++	vpdi	($S1,$S1,$S1,0);
++	vpdi	($R2,$R2,$R2,0);
++	vpdi	($S2,$S2,$S2,0);
++	vpdi	($R3,$R3,$R3,0);
++	vpdi	($S3,$S3,$S3,0);
++	vpdi	($R4,$R4,$R4,0);
++	vpdi	($S4,$S4,$S4,0);
++
++	j	(".Loaded_hash");		# skip the reload from context
++
++ALIGN	(16);
++LABEL	(".Lskip_init");			# reached when is_base2_26 is already set
++	vllezf	($H0,"0($ctx)");		# load hash base 2^26
++	vllezf	($H1,"4($ctx)");		# one 32-bit limb per register, rest zeroed
++	vllezf	($H2,"8($ctx)");
++	vllezf	($H3,"12($ctx)");
++	vllezf	($H4,"16($ctx)");
++
++	vlrepg	($R0,"0x30($ctx)");		# broadcast r^2:r^4
++	vlrepg	($R1,"0x40($ctx)");		# first doubleword of each vector saved at ctx+48
++	vlrepg	($S1,"0x50($ctx)");
++	vlrepg	($R2,"0x60($ctx)");
++	vlrepg	($S2,"0x70($ctx)");
++	vlrepg	($R3,"0x80($ctx)");
++	vlrepg	($S3,"0x90($ctx)");
++	vlrepg	($R4,"0xa0($ctx)");
++	vlrepg	($S4,"0xb0($ctx)");
++
++LABEL	(".Loaded_hash");
++	vzero	($I1);				# verimg below only inserts selected bits
++	vzero	($I3);
++
++	vlm	($T1,$T4,"0x00($inp)");		# load first input block
++	la	($inp,"0x40($inp)");		# advance input by 64 bytes
++	vgmg	($mask26,6,31);			# 26-bit mask moved to the upper word
++	vgmf	($I4,5,5);			# padbit<<2
++
++	vperm	($I0,$T3,$T4,$bswaplo);		# byte-swap the second pair of 16-byte blocks
++	vperm	($I2,$T3,$T4,$bswapmi);
++	vperm	($T3,$T3,$T4,$bswaphi);
++
++	verimg	($I1,$I0,$mask26,6);		# >>26
++	veslg	($I0,$I0,32);
++	veslg	($I2,$I2,28);			# >>4
++	verimg	($I3,$T3,$mask26,18);		# >>14
++	verimg	($I4,$T3,$mask26,58);		# >>38
++	vn	($I0,$I0,$mask26);
++	vn	($I2,$I2,$mask26);
++	vesrlf	($I4,$I4,2);			# >>2
++
++	vgmg	($mask26,38,63);		# mask back to the low 26 bits
++	vperm	($T3,$T1,$T2,$bswaplo);		# byte-swap the first pair of 16-byte blocks
++	vperm	($T4,$T1,$T2,$bswaphi);
++	vperm	($T2,$T1,$T2,$bswapmi);
++
++	verimg	($I0,$T3,$mask26,0);
++	verimg	($I1,$T3,$mask26,38);		# >>26
++	verimg	($I2,$T2,$mask26,60);		# >>4
++	verimg	($I3,$T4,$mask26,50);		# >>14
++	vesrlg	($T4,$T4,40);
++	vo	($I4,$I4,$T4);
++
++	srlg	("%r0",$len,6);			# number of whole 64-byte chunks
++&{$z?	\&aghi:\&ahi}	("%r0",-1);		# .Loop_vx count: one chunk is already loaded
++
++ALIGN	(16);
++LABEL	(".Loop_vx");				# multiplies are interleaved with the next chunk's load
++	vmlef		($ACC0,$I0,$R0);	# even word elements: input limbs only
++	vmlef		($ACC1,$I0,$R1);
++	vmlef		($ACC2,$I0,$R2);
++	vmlef		($ACC3,$I0,$R3);
++	vmlef		($ACC4,$I0,$R4);
++
++	vmalef		($ACC0,$I1,$S4,$ACC0);
++	vmalef		($ACC1,$I1,$R0,$ACC1);
++	vmalef		($ACC2,$I1,$R1,$ACC2);
++	vmalef		($ACC3,$I1,$R2,$ACC3);
++	vmalef		($ACC4,$I1,$R3,$ACC4);
++
++	 vaf		($H2,$H2,$I2);		# hash += input, used by the vmalof passes below
++	 vaf		($H0,$H0,$I0);
++	 vaf		($H3,$H3,$I3);
++	 vaf		($H1,$H1,$I1);
++	 vaf		($H4,$H4,$I4);
++
++	vmalef		($ACC0,$I2,$S3,$ACC0);
++	vmalef		($ACC1,$I2,$S4,$ACC1);
++	vmalef		($ACC2,$I2,$R0,$ACC2);
++	vmalef		($ACC3,$I2,$R1,$ACC3);
++	vmalef		($ACC4,$I2,$R2,$ACC4);
++
++	 vlm		($T1,$T4,"0x00($inp)");	# load next input block
++	 la		($inp,"0x40($inp)");
++	 vgmg		($mask26,6,31);
++
++	vmalef		($ACC0,$I3,$S2,$ACC0);
++	vmalef		($ACC1,$I3,$S3,$ACC1);
++	vmalef		($ACC2,$I3,$S4,$ACC2);
++	vmalef		($ACC3,$I3,$R0,$ACC3);
++	vmalef		($ACC4,$I3,$R1,$ACC4);
++
++	 vperm		($I0,$T3,$T4,$bswaplo);
++	 vperm		($I2,$T3,$T4,$bswapmi);
++	 vperm		($T3,$T3,$T4,$bswaphi);
++
++	vmalef		($ACC0,$I4,$S1,$ACC0);
++	vmalef		($ACC1,$I4,$S2,$ACC1);
++	vmalef		($ACC2,$I4,$S3,$ACC2);
++	vmalef		($ACC3,$I4,$S4,$ACC3);
++	vmalef		($ACC4,$I4,$R0,$ACC4);
++
++	 verimg		($I1,$I0,$mask26,6);	# >>26
++	 veslg		($I0,$I0,32);
++	 veslg		($I2,$I2,28);		# >>4
++	 verimg		($I3,$T3,$mask26,18);	# >>14
++
++	vmalof		($ACC0,$H0,$R0,$ACC0);	# odd word elements: hash plus input
++	vmalof		($ACC1,$H0,$R1,$ACC1);
++	vmalof		($ACC2,$H0,$R2,$ACC2);
++	vmalof		($ACC3,$H0,$R3,$ACC3);
++	vmalof		($ACC4,$H0,$R4,$ACC4);
++
++	 vgmf		($I4,5,5);		# padbit<<2
++	 verimg		($I4,$T3,$mask26,58);	# >>38
++	 vn		($I0,$I0,$mask26);
++	 vn		($I2,$I2,$mask26);
++	 vesrlf		($I4,$I4,2);		# >>2
++
++	vmalof		($ACC0,$H1,$S4,$ACC0);
++	vmalof		($ACC1,$H1,$R0,$ACC1);
++	vmalof		($ACC2,$H1,$R1,$ACC2);
++	vmalof		($ACC3,$H1,$R2,$ACC3);
++	vmalof		($ACC4,$H1,$R3,$ACC4);
++
++	 vgmg		($mask26,38,63);
++	 vperm		($T3,$T1,$T2,$bswaplo);
++	 vperm		($T4,$T1,$T2,$bswaphi);
++	 vperm		($T2,$T1,$T2,$bswapmi);
++
++	vmalof		($ACC0,$H2,$S3,$ACC0);
++	vmalof		($ACC1,$H2,$S4,$ACC1);
++	vmalof		($ACC2,$H2,$R0,$ACC2);
++	vmalof		($ACC3,$H2,$R1,$ACC3);
++	vmalof		($ACC4,$H2,$R2,$ACC4);
++
++	 verimg		($I0,$T3,$mask26,0);
++	 verimg		($I1,$T3,$mask26,38);	# >>26
++	 verimg		($I2,$T2,$mask26,60);	# >>4
++
++	vmalof		($ACC0,$H3,$S2,$ACC0);
++	vmalof		($ACC1,$H3,$S3,$ACC1);
++	vmalof		($ACC2,$H3,$S4,$ACC2);
++	vmalof		($ACC3,$H3,$R0,$ACC3);
++	vmalof		($ACC4,$H3,$R1,$ACC4);
++
++	 verimg		($I3,$T4,$mask26,50);	# >>14
++	 vesrlg		($T4,$T4,40);
++	 vo		($I4,$I4,$T4);
++
++	vmalof		($ACC0,$H4,$S1,$ACC0);
++	vmalof		($ACC1,$H4,$S2,$ACC1);
++	vmalof		($ACC2,$H4,$S3,$ACC2);
++	vmalof		($ACC3,$H4,$S4,$ACC3);
++	vmalof		($ACC4,$H4,$R0,$ACC4);
++
++	################################################################
++	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++	# and P. Schwabe
++
++	vesrlg		($H4,$ACC3,26);
++	vesrlg		($H1,$ACC0,26);
++	vn		($H3,$ACC3,$mask26);
++	vn		($H0,$ACC0,$mask26);
++	vag		($H4,$H4,$ACC4);	# h3 -> h4
++	vag		($H1,$H1,$ACC1);	# h0 -> h1
++
++	vesrlg		($ACC4,$H4,26);
++	vesrlg		($ACC1,$H1,26);
++	vn		($H4,$H4,$mask26);
++	vn		($H1,$H1,$mask26);
++	vag		($H0,$H0,$ACC4);	# carry*1 here, carry*4 added below: *5 in total
++	vag		($H2,$ACC2,$ACC1);	# h1 -> h2
++
++	veslg		($ACC4,$ACC4,2);	# <<2
++	vesrlg		($ACC2,$H2,26);
++	vn		($H2,$H2,$mask26);
++	vag		($H0,$H0,$ACC4);	# h4 -> h0
++	vag		($H3,$H3,$ACC2);	# h2 -> h3
++
++	vesrlg		($ACC0,$H0,26);
++	vesrlg		($ACC3,$H3,26);
++	vn		($H0,$H0,$mask26);
++	vn		($H3,$H3,$mask26);
++	vag		($H1,$H1,$ACC0);	# h0 -> h1
++	vag		($H4,$H4,$ACC3);	# h3 -> h4
++
++&{$z?	\&brctg:\&brct}	("%r0",".Loop_vx");	# count down the remaining 64-byte chunks
++
++	vlm	($R0,$S4,"48($ctx)");		# load all powers
++
++	lghi	("%r0",0x30);
++&{$z?	\&lcgr:\&lcr}	($len,$len);
++&{$z?	\&ngr:\&nr}	($len,"%r0");		# $len = (-$len) & 0x30
++&{$z?	\&slgr:\&slr}	($inp,$len);		# step $inp back by that many bytes
++
++LABEL	(".Last");				# run again from the tail code with $len = 0
++	vmlef	($ACC0,$I0,$R0);		# even word elements: input limbs only
++	vmlef	($ACC1,$I0,$R1);
++	vmlef	($ACC2,$I0,$R2);
++	vmlef	($ACC3,$I0,$R3);
++	vmlef	($ACC4,$I0,$R4);
++
++	vmalef	($ACC0,$I1,$S4,$ACC0);
++	vmalef	($ACC1,$I1,$R0,$ACC1);
++	vmalef	($ACC2,$I1,$R1,$ACC2);
++	vmalef	($ACC3,$I1,$R2,$ACC3);
++	vmalef	($ACC4,$I1,$R3,$ACC4);
++
++	 vaf	($H0,$H0,$I0);			# hash += input
++	 vaf	($H1,$H1,$I1);
++	 vaf	($H2,$H2,$I2);
++	 vaf	($H3,$H3,$I3);
++	 vaf	($H4,$H4,$I4);
++
++	vmalef	($ACC0,$I2,$S3,$ACC0);
++	vmalef	($ACC1,$I2,$S4,$ACC1);
++	vmalef	($ACC2,$I2,$R0,$ACC2);
++	vmalef	($ACC3,$I2,$R1,$ACC3);
++	vmalef	($ACC4,$I2,$R2,$ACC4);
++
++	vmalef	($ACC0,$I3,$S2,$ACC0);
++	vmalef	($ACC1,$I3,$S3,$ACC1);
++	vmalef	($ACC2,$I3,$S4,$ACC2);
++	vmalef	($ACC3,$I3,$R0,$ACC3);
++	vmalef	($ACC4,$I3,$R1,$ACC4);
++
++	vmalef	($ACC0,$I4,$S1,$ACC0);
++	vmalef	($ACC1,$I4,$S2,$ACC1);
++	vmalef	($ACC2,$I4,$S3,$ACC2);
++	vmalef	($ACC3,$I4,$S4,$ACC3);
++	vmalef	($ACC4,$I4,$R0,$ACC4);
++
++	vmalof	($ACC0,$H0,$R0,$ACC0);		# odd word elements: hash plus input
++	vmalof	($ACC1,$H0,$R1,$ACC1);
++	vmalof	($ACC2,$H0,$R2,$ACC2);
++	vmalof	($ACC3,$H0,$R3,$ACC3);
++	vmalof	($ACC4,$H0,$R4,$ACC4);
++
++	vmalof	($ACC0,$H1,$S4,$ACC0);
++	vmalof	($ACC1,$H1,$R0,$ACC1);
++	vmalof	($ACC2,$H1,$R1,$ACC2);
++	vmalof	($ACC3,$H1,$R2,$ACC3);
++	vmalof	($ACC4,$H1,$R3,$ACC4);
++
++	vmalof	($ACC0,$H2,$S3,$ACC0);
++	vmalof	($ACC1,$H2,$S4,$ACC1);
++	vmalof	($ACC2,$H2,$R0,$ACC2);
++	vmalof	($ACC3,$H2,$R1,$ACC3);
++	vmalof	($ACC4,$H2,$R2,$ACC4);
++
++	vmalof	($ACC0,$H3,$S2,$ACC0);
++	vmalof	($ACC1,$H3,$S3,$ACC1);
++	vmalof	($ACC2,$H3,$S4,$ACC2);
++	vmalof	($ACC3,$H3,$R0,$ACC3);
++	vmalof	($ACC4,$H3,$R1,$ACC4);
++
++	vmalof	($ACC0,$H4,$S1,$ACC0);
++	vmalof	($ACC1,$H4,$S2,$ACC1);
++	vmalof	($ACC2,$H4,$S3,$ACC2);
++	vmalof	($ACC3,$H4,$S4,$ACC3);
++	vmalof	($ACC4,$H4,$R0,$ACC4);
++
++	################################################################
++	# horizontal addition
++
++	vzero	($H0);
++	vsumqg	($ACC0,$ACC0,$H0);		# fold both doubleword products into one sum
++	vsumqg	($ACC1,$ACC1,$H0);
++	vsumqg	($ACC2,$ACC2,$H0);
++	vsumqg	($ACC3,$ACC3,$H0);
++	vsumqg	($ACC4,$ACC4,$H0);
++
++	################################################################
++	# lazy reduction
++
++	vesrlg	($H4,$ACC3,26);
++	vesrlg	($H1,$ACC0,26);
++	vn	($H3,$ACC3,$mask26);
++	vn	($H0,$ACC0,$mask26);
++	vag	($H4,$H4,$ACC4);		# h3 -> h4
++	vag	($H1,$H1,$ACC1);		# h0 -> h1
++
++	vesrlg	($ACC4,$H4,26);
++	vesrlg	($ACC1,$H1,26);
++	vn	($H4,$H4,$mask26);
++	vn	($H1,$H1,$mask26);
++	vag	($H0,$H0,$ACC4);		# carry*1 here, carry*4 added below: *5 in total
++	vag	($H2,$ACC2,$ACC1);		# h1 -> h2
++
++	veslg	($ACC4,$ACC4,2);		# <<2
++	vesrlg	($ACC2,$H2,26);
++	vn	($H2,$H2,$mask26);
++	vag	($H0,$H0,$ACC4);		# h4 -> h0
++	vag	($H3,$H3,$ACC2);		# h2 -> h3
++
++	vesrlg	($ACC0,$H0,26);
++	vesrlg	($ACC3,$H3,26);
++	vn	($H0,$H0,$mask26);
++	vn	($H3,$H3,$mask26);
++	vag	($H1,$H1,$ACC0);		# h0 -> h1
++	vag	($H4,$H4,$ACC3);		# h3 -> h4
++
++&{$z?	\&clgfi:\&clfi} ($len,0);
++	je	(".Ldone");			# $len == 0: no tail left to process
++
++	vlm	($T1,$T4,"0x00($inp)");		# load last partial block
++	vgmg	($mask26,6,31);
++	vgmf	($I4,5,5);			# padbit<<2
++
++	vperm	($I0,$T3,$T4,$bswaplo);
++	vperm	($I2,$T3,$T4,$bswapmi);
++	vperm	($T3,$T3,$T4,$bswaphi);
++
++	vl	($ACC0,"0x30($len,%r1)");	# borrow $ACC0,1
++	vl	($ACC1,"0x60($len,%r1)");	# both picked from .Lconst, indexed by $len
++
++	verimg	($I1,$I0,$mask26,6);		# >>26
++	veslg	($I0,$I0,32);
++	veslg	($I2,$I2,28);			# >>4
++	verimg	($I3,$T3,$mask26,18);		# >>14
++	verimg	($I4,$T3,$mask26,58);		# >>38
++	vn	($I0,$I0,$mask26);
++	vn	($I2,$I2,$mask26);
++	vesrlf	($I4,$I4,2);			# >>2
++
++	vgmg	($mask26,38,63);
++	vperm	($T3,$T1,$T2,$bswaplo);
++	vperm	($T4,$T1,$T2,$bswaphi);
++	vperm	($T2,$T1,$T2,$bswapmi);
++
++	verimg	($I0,$T3,$mask26,0);
++	verimg	($I1,$T3,$mask26,38);		# >>26
++	verimg	($I2,$T2,$mask26,60);		# >>4
++	verimg	($I3,$T4,$mask26,50);		# >>14
++	vesrlg	($T4,$T4,40);
++	vo	($I4,$I4,$T4);
++
++	vperm	($H0,$H0,$H0,$ACC0);		# move hash to right lane
++	vn	($I0,$I0,$ACC1);		# mask redundant lane[s]
++	vperm	($H1,$H1,$H1,$ACC0);
++	vn	($I1,$I1,$ACC1);
++	vperm	($H2,$H2,$H2,$ACC0);
++	vn	($I2,$I2,$ACC1);
++	vperm	($H3,$H3,$H3,$ACC0);
++	vn	($I3,$I3,$ACC1);
++	vperm	($H4,$H4,$H4,$ACC0);
++	vn	($I4,$I4,$ACC1);
++
++	vaf	($I0,$I0,$H0);			# accumulate hash
++	vzero	($H0);				# wipe hash value
++	vaf	($I1,$I1,$H1);
++	vzero	($H1);
++	vaf	($I2,$I2,$H2);
++	vzero	($H2);
++	vaf	($I3,$I3,$H3);
++	vzero	($H3);
++	vaf	($I4,$I4,$H4);
++	vzero	($H4);
++
++&{$z?	\&lghi:\&lhi}	($len,0);		# makes the next pass through .Last exit at .Ldone
++	j	(".Last");
++	# I don't bother to tell apart cases when only one multiplication
++	# pass is sufficient, because I argue that mispredicted branch
++	# penalties are comparable to overhead of sometimes redundant
++	# multiplication pass...
++
++LABEL	(".Ldone");
++	vstef	($H0,"0($ctx)",3);		# store hash base 2^26
++	vstef	($H1,"4($ctx)",3);		# word element 3 of each limb register
++	vstef	($H2,"8($ctx)",3);
++	vstef	($H3,"12($ctx)",3);
++	vstef	($H4,"16($ctx)",3);
++
++if ($z) {
++	ld	("%f8","$stdframe+0*8($sp)");	# reload what the prologue stored
++	ld	("%f9","$stdframe+1*8($sp)");
++	ld	("%f10","$stdframe+2*8($sp)");
++	ld	("%f11","$stdframe+3*8($sp)");
++	ld	("%f12","$stdframe+4*8($sp)");
++	ld	("%f13","$stdframe+5*8($sp)");
++	ld	("%f14","$stdframe+6*8($sp)");
++	ld	("%f15","$stdframe+7*8($sp)");
++&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");	# also reloads $sp
++} else {
++	ld	("%f4","$stdframe+16*$SIZE_T+2*8($sp)");	# reload what the prologue stored
++	ld	("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
++&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+10*$SIZE_T($sp)");	# also reloads $sp
++}
++	br	("%r14");
++SIZE	("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
++}
++
++################
++# static void poly1305_emit(void *ctx, unsigned char mac[16],
++#                           const u32 nonce[4])
+ {
+ my ($mac,$nonce)=($inp,$len);
+-my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
++my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));
+ 
+-$code.=<<___;
+-.globl	poly1305_emit
+-.type	poly1305_emit,\@function
+-.align	16
+-poly1305_emit:
+-	stm${g}	%r6,%r9,`6*$SIZE_T`($sp)
+-
+-	lg	$h0,0($ctx)
+-	lg	$h1,8($ctx)
+-	lg	$h2,16($ctx)
+-
+-	lghi	%r0,5
+-	lghi	%r1,0
+-	lgr	$d0,$h0
+-	lgr	$d1,$h1
+-
+-	algr	$h0,%r0			# compare to modulus
+-	alcgr	$h1,%r1
+-	alcgr	$h2,%r1
+-
+-	srlg	$h2,$h2,2		# did it borrow/carry?
+-	slgr	%r1,$h2			# 0-$h2>>2
+-	lg	$h2,0($nonce)		# load nonce
+-	lghi	%r0,-1
+-	lg	$ctx,8($nonce)
+-	xgr	%r0,%r1			# ~%r1
+-
+-	ngr	$h0,%r1
+-	ngr	$d0,%r0
+-	ngr	$h1,%r1
+-	ngr	$d1,%r0
+-	ogr	$h0,$d0
+-	rllg	$d0,$h2,32		# flip nonce words
+-	ogr	$h1,$d1
+-	rllg	$d1,$ctx,32
+-
+-	algr	$h0,$d0			# accumulate nonce
+-	alcgr	$h1,$d1
+-
+-	strvg	$h0,0($mac)		# write little-endian result
+-	strvg	$h1,8($mac)
+-
+-	lm${g}	%r6,%r9,`6*$SIZE_T`($sp)
+-	br	%r14
+-.size	poly1305_emit,.-poly1305_emit
+-
+-.string	"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+-___
++GLOBL	("poly1305_emit");
++TYPE	("poly1305_emit","\@function");
++ALIGN	(16);
++LABEL	("poly1305_emit");
++LABEL	(".Lpoly1305_emit");
++&{$z?	\&stmg:\&stm}	("%r6","%r10","6*$SIZE_T($sp)");	# reloaded before returning
++
++	lg	($d0,"0($ctx)");		# stored hash, either radix
++	lg	($d1,"8($ctx)");
++	lg	($d2,"16($ctx)");
++
++	llgfr	("%r0",$d0);			# base 2^26 -> base 2^64
++	srlg	($h0,$d0,32);			# word at ctx+0
++	llgfr	("%r1",$d1);			# word at ctx+12
++	srlg	($h1,$d1,32);			# word at ctx+8
++	srlg	($h2,$d2,32);			# word at ctx+16
++
++	sllg	("%r0","%r0",26);
++	algr	($h0,"%r0");
++	sllg	("%r0",$h1,52);
++	srlg	($h1,$h1,12);
++	sllg	("%r1","%r1",14);
++	algr	($h0,"%r0");
++	alcgr	($h1,"%r1");
++	sllg	("%r0",$h2,40);
++	srlg	($h2,$h2,24);
++	lghi	("%r1",0);
++	algr	($h1,"%r0");
++	alcgr	($h2,"%r1");			# %r1 is zero, this only adds the carry
++
++	llgf	("%r0","24($ctx)");		# is_base2_26
++	lcgr	("%r0","%r0");			# 0 stays 0, 1 becomes an all-ones mask
++
++	xgr	($h0,$d0);			# choose between radixes
++	xgr	($h1,$d1);
++	xgr	($h2,$d2);
++	ngr	($h0,"%r0");
++	ngr	($h1,"%r0");
++	ngr	($h2,"%r0");
++	xgr	($h0,$d0);			# converted value if is_base2_26, else value as loaded
++	xgr	($h1,$d1);
++	xgr	($h2,$d2);
++
++	lghi	("%r0",5);
++	lgr	($d0,$h0);			# keep the unmodified hash for the select below
++	lgr	($d1,$h1);
++
++	algr	($h0,"%r0");			# compare to modulus
++	alcgr	($h1,"%r1");			# %r1 is still zero
++	alcgr	($h2,"%r1");
++
++	srlg	($h2,$h2,2);			# did it borrow/carry?
++	slgr	("%r1",$h2);				# 0-$h2>>2
++	lg	($d2,"0($nonce)");		# load nonce
++	lg	($ctx,"8($nonce)");		# $ctx is reused as scratch from here on
++
++	xgr	($h0,$d0);			# take hash+5 if %r1 is all-ones, else the saved hash
++	xgr	($h1,$d1);
++	ngr	($h0,"%r1");
++	ngr	($h1,"%r1");
++	xgr	($h0,$d0);
++	rllg	($d0,$d2,32);			# flip nonce words
++	xgr	($h1,$d1);
++	rllg	($d1,$ctx,32);
++
++	algr	($h0,$d0);			# accumulate nonce
++	alcgr	($h1,$d1);
++
++	strvg	($h0,"0($mac)");		# write little-endian result
++	strvg	($h1,"8($mac)");
++
++&{$z?	\&lmg:\&lm}	("%r6","%r10","6*$SIZE_T($sp)");
++	br	("%r14");
++SIZE	("poly1305_emit",".-poly1305_emit");
+ }
+ 
+-$code =~ s/\`([^\`]*)\`/eval $1/gem;
+-$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
++################
++
++ALIGN	(16);
++LABEL	(".Lconst");				# %r1 points here inside __poly1305_blocks_vx
++LONG	(0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f);	# merge odd
++LONG	(0x07060504,0x03020100,0x17161514,0x13121110);	# byte swap masks: $bswaplo
++LONG	(0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);	# $bswaphi
++LONG	(0x00000000,0x09080706,0x00000000,0x19181716);	# $bswapmi
++
++LONG	(0x00000000,0x00000000,0x00000000,0x0c0d0e0f);	# magic tail masks
++LONG	(0x0c0d0e0f,0x00000000,0x00000000,0x00000000);	# one of these three is loaded at 0x30+$len
++LONG	(0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
++
++LONG	(0xffffffff,0x00000000,0xffffffff,0xffffffff);	# lane masks, one is loaded at 0x60+$len
++LONG	(0xffffffff,0x00000000,0xffffffff,0x00000000);
++LONG	(0x00000000,0x00000000,0xffffffff,0x00000000);
++
++STRING	("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ 
+-print $code;
+-close STDOUT;
++PERLASM_END();
+diff -up openssl-1.1.1b/crypto/poly1305/build.info.s390x-update openssl-1.1.1b/crypto/poly1305/build.info
+--- openssl-1.1.1b/crypto/poly1305/build.info.s390x-update	2019-05-06 10:54:00.036367588 +0200
++++ openssl-1.1.1b/crypto/poly1305/build.info	2019-05-06 10:56:14.964105164 +0200
+@@ -18,6 +18,7 @@ INCLUDE[poly1305-armv8.o]=..
+ GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME)
+ INCLUDE[poly1305-mips.o]=..
+ GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME)
++INCLUDE[poly1305-s390x.o]=..
+ 
+ BEGINRAW[Makefile(unix)]
+ {- $builddir -}/poly1305-%.S:	{- $sourcedir -}/asm/poly1305-%.pl
diff --git a/openssl.spec b/openssl.spec
index 5c0c014..2e3cbdb 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -22,7 +22,7 @@
 Summary: Utilities from the general purpose cryptography library with TLS implementation
 Name: openssl
 Version: 1.1.1b
-Release: 6%{?dist}
+Release: 7%{?dist}
 Epoch: 1
 # We have to remove certain patented algorithms from the openssl source
 # tarball with the hobble-openssl script which is included below.
@@ -63,6 +63,7 @@ Patch49: openssl-1.1.1-evp-kdf.patch
 Patch50: openssl-1.1.1-ssh-kdf.patch
 # Backported fixes including security fixes
 Patch51: openssl-1.1.1-upstream-sync.patch
+Patch52: openssl-1.1.1-s390x-update.patch
 
 License: OpenSSL
 URL: http://www.openssl.org/
@@ -160,6 +161,7 @@ cp %{SOURCE13} test/
 %patch49 -p1 -b .evp-kdf
 %patch50 -p1 -b .ssh-kdf
 %patch51 -p1 -b .upstream-sync
+%patch52 -p1 -b .s390x-update
 
 
 %build
@@ -446,6 +448,9 @@ export LD_LIBRARY_PATH
 %ldconfig_scriptlets libs
 
 %changelog
+* Mon May  6 2019 Tomáš Mráz <tmraz@redhat.com> 1.1.1b-7
+- add S390x chacha20-poly1305 assembler support from master branch
+
 * Fri May  3 2019 Tomáš Mráz <tmraz@redhat.com> 1.1.1b-6
 - apply new bugfixes from upstream 1.1.1 branch