dnl  IA-64 mpn_Xshift.

dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl  MA 02111-1307, USA.

include(`../config.m4')

C This code runs at 2 cycles/limb for large operands on the Itanium.  It needs
C a very deep software pipeline, since shl/shr.u have a 4 cycle latency.  The
C main loop here is not great; it is overscheduled with respect to the shr.u
C instructions, and this actually turns out to give considerably more complex
C wind down code.  The code runs slowly for operands with <= 8 limbs, since we
C have a non-scheduled loop for that case.  We also have a primitive loop for
C the unrolling edge, and as a consequence of the main loop stupidity it is
C executed 1-4 steps instead of 0-3 steps.

C By having 63 separate loops using the shrp instruction, we could easily reach
C 1 cycle/limb.  Such loops would require a less deep software pipeline, since
C shrp, unlike shl/shr.u, has a plain one cycle latency.

C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34
C cnt = r35

C Per-operation parameters.  Exactly one of OPERATION_lshift/OPERATION_rshift
C is expected to be defined by the build; the body below is written in terms
C of these four macros only:
C   func  exported function name
C   UPD   post-increment (in bytes) applied to both pointers after each limb
C   FSH   shift applied to the current limb, by cnt
C   BSH   shift applied to the neighbouring limb, by 64-cnt
ifdef(`OPERATION_lshift',`
	define(`func',`mpn_lshift')
	define(`UPD',`-8')
	define(`FSH',`shl')
	define(`BSH',`shr.u')
')
ifdef(`OPERATION_rshift',`
	define(`func',`mpn_rshift')
	define(`UPD',`8')
	define(`FSH',`shr.u')
	define(`BSH',`shl')
')

ASM_START()
C func is mpn_lshift or mpn_rshift, depending on which OPERATION_* symbol
C selected the FSH/BSH/UPD definitions.
C
C Each stored limb is FSH(current limb, cnt) | BSH(next limb, 64-cnt), where
C "next" is the limb reached by stepping sp by UPD bytes.  The very last limb
C stored is just FSH(last limb, cnt).  The value returned in r8 is
C BSH(first limb read, 64-cnt).
C
C Register use:
C   r32  rp, store pointer (post-incremented by UPD)
C   r33  sp, load pointer (post-incremented by UPD)
C   r34  n-1 once the prologue has run
C   r35  cnt
C   r31  64-cnt
C   r2   saved ar.lc, restored before every return
C   r8   return value
C
C NOTE(review): n = 0 would wrap r34 to all ones and take the .Lbig path, so
C callers presumably guarantee n >= 1 -- confirm against the mpn interface.
PROLOGUE(func)
	.prologue
C With a 32-bit ABI, widen the two pointers, n (sign-extended) and cnt
C (zero-extended) to 64 bits before they are used.
ifdef(`HAVE_ABI_32',
`	addp4	r32 = 0, r32
	addp4	r33 = 0, r33
	sxt4	r34 = r34
	zxt4	r35 = r35
	;;
')
	add	r34 = -1, r34		C r34 = n-1
	sub	r31 = 64, r35		C r31 = 64-cnt, the shift count used with BSH
	.save	ar.lc, r2
	mov	r2 = ar.lc;;		C keep the incoming ar.lc so it can be restored
	.body
	cmp.leu	p6, p7 = 8,r34		C p6 = (8 <= n-1), i.e. n > 8
C For lshift, start both pointers at the last limb (offset 8*(n-1)) so that
C the UPD = -8 post-increments walk towards lower addresses.
ifdef(`OPERATION_lshift',`
	shladd	r33 = r34, 3, r33
	shladd	r32 = r34, 3, r32;;
')
	ld8	r19 = [r33], UPD	;;	C r19 = first limb
	BSH	r8 = r19, r31		C function return value
   (p6) br.dptk	.Lbig

C
C Code for small operands.  Not an optimization for the Itanium, it is here
C just to simplify the general case.
C
C ar.lc = n-1, so for n = 1 the first br.cloop falls through and the single
C limb is shifted and stored directly.  Otherwise .Loops runs n-1 times,
C storing one merged limb per pass, and the code after it stores the final
C limb.
C
	mov	ar.lc = r34;;
	br.cloop.dptk .Loops
	FSH	r26 = r19, r35	;;
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0
.Loops:
	ld8	r16 = [r33], UPD		C r16 = next limb
	FSH	r26 = r19, r35	;;		C part taken from the current limb
	BSH	r27 = r16, r31	;;		C part taken from the next limb
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	or	r27 = r27, r26
	mov	r19 = r16	;;		C next limb becomes the current limb
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loops
	FSH	r26 = r19, r35	;;		C final limb has no next limb to merge
	st8	[r32] = r26
	mov	ar.lc = r2
	br.ret.sptk.many b0

C
C Code for operands with >8 limbs.  An edge loop and a very deep software
C pipeline.
C
C r15 = (n-1) mod 4 and r14 = (n-1) / 4.  .Loop0 is the edge loop; with
C ar.lc = r15 it runs r15+1 times (1 to 4), storing one limb per pass and
C leaving 4*r14 limbs to be stored by the pipelined code.
C
.Lbig:	and	r15 = 3, r34
	shr.u	r14 = r34, 2	;;
	mov	ar.lc = r15
.Loop0:
	ld8	r16 = [r33], UPD		C r16 = next limb
	FSH	r26 = r19, r35	;;		C part taken from the current limb
	BSH	r27 = r16, r31	;;		C part taken from the next limb
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	{ .mib;	nop.b 0;; }			C delay to save 6 cycles...
	or	r27 = r27, r26
	mov	r19 = r16	;;		C next limb becomes the current limb
	st8	[r32] = r27, UPD
	br.cloop.dptk .Loop0

C The pipelined part stores 4 limbs in each pass of .Loop, 4 in .Lend2 and 4
C in the drain code after it, so .Loop has to run r14-2 times.  r14 >= 2 here
C because this path is only taken when n-1 >= 8.
.Lunroll:
	add	r14 = -2, r14	;;
	mov	ar.lc = r14

C Pipeline fill, part 1: load four limbs and start the shifts for the first
C output limb.  r26 is computed from the r19 left by .Loop0, one instruction
C group before r19 is reloaded.
.Lphase1:
  { .mmi
	ld8	r16 = [r33], UPD	;;
} { .mmi
	ld8	r17 = [r33], UPD	;;
} { .mmi
	ld8	r18 = [r33], UPD
	FSH	r26 = r19, r35	;;
} { .mmi
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31	;;
} { .mib
	FSH	r20 = r16, r35
}

C Pipeline fill, part 2: two more loads, and the first output limb is
C completed in r27.  When ar.lc is zero (r14 = 2) the br.cloop falls through
C and .Loop is skipped entirely.
.Lphase2:
  { .mmi
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
} { .mib
	FSH	r22 = r17, r35	;;
} { .mmi
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.dptk .Loop
}
	br.sptk	.Lend2
C Steady state: each of the four instruction groups stores one finished
C limb, loads one new limb, issues the BSH/FSH pair for a limb loaded
C earlier and merges one pair of previously shifted halves.
.Loop:
  { .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
} { .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35	;;
} { .mmi
	st8	[r32] = r21, UPD
	ld8	r19 = [r33], UPD
	BSH	r27 = r16, r31
} { .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35	;;
} { .mmi
	st8	[r32] = r23, UPD
	ld8	r16 = [r33], UPD
	BSH	r21 = r17, r31
} { .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35	;;
} { .mmi
	st8	[r32] = r25, UPD
	ld8	r17 = [r33], UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35
	br.cloop.sptk .Loop;;
}
C Wind down: same schedule as .Loop, but only the first group still loads a
C limb; the remaining groups work on limbs that are already in registers.
.Lend2:
  { .mmi
	st8	[r32] = r27, UPD
	ld8	r18 = [r33], UPD
	BSH	r25 = r19, r31
} { .mib
	or	r21 = r21, r20
	FSH	r26 = r19, r35	;;
} { .mmi
	st8	[r32] = r21, UPD
	BSH	r27 = r16, r31
} { .mib
	or	r23 = r23, r22
	FSH	r20 = r16, r35	;;
} { .mmi
	st8	[r32] = r23, UPD
	BSH	r21 = r17, r31
} { .mib
	or	r25 = r25, r24
	FSH	r22 = r17, r35	;;
} { .mmi
	st8	[r32] = r25, UPD
	BSH	r23 = r18, r31
} { .mib
	or	r27 = r27, r26
	FSH	r24 = r18, r35	;;
}

C Drain the pipeline: store three more merged limbs, then the final limb,
C which is FSH of the last loaded limb alone (r24) and is stored without a
C pointer update.
  { .mmi
	st8	[r32] = r27, UPD
} { .mib
	or	r21 = r21, r20	;;
} { .mmi
	st8	[r32] = r21, UPD
} { .mib
	or	r23 = r23, r22	;;
} { .mmi
	st8	[r32] = r23, UPD;;
} { .mmi
	st8	[r32] = r24
}
	mov	ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE(func)
ASM_END()