Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/rshift.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K6-2 mpn_rshift -- mpn right shift.
! 2: dnl
! 3: dnl K6-2: 1.75 cycles/limb
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 30: C unsigned shift);
! 31: C
! 32:
! 33: defframe(PARAM_SHIFT,16) C shift, 4th argument in the prototype
! 34: defframe(PARAM_SIZE, 12) C size, 3rd argument
! 35: defframe(PARAM_SRC, 8) C src, 2nd argument
! 36: defframe(PARAM_DST, 4) C dst, 1st argument
! 37: deflit(`FRAME',0)
! 38:
! 39: dnl Minimum 9, because the unrolled loop can't handle less.
! 40: dnl Sizes below this threshold are handled by the L(simple) loop.
! 41: deflit(UNROLL_THRESHOLD, 9)
! 42:
! 43: .text
! 44: ALIGN(32)
! 45:
! 46: PROLOGUE(mpn_rshift)
! 47: deflit(`FRAME',0)
! 48: C Entry: load size, src and shift, then branch on size-1.
! 49: C The 1 limb case can be done without the push %ebx, but it's then
! 50: C still the same speed. The push is left as a free helping hand for
! 51: C the two_or_more code.
! 52:
! 53: movl PARAM_SIZE, %eax
! 54: pushl %ebx FRAME_pushl()
! 55:
! 56: movl PARAM_SRC, %ebx
! 57: decl %eax C eax = size-1, ZF set when size is 1
! 58:
! 59: movl PARAM_SHIFT, %ecx C movl leaves the flags from decl intact
! 60: jnz L(two_or_more) C taken when size-1 is non-zero
! 61: C size == 1: single limb, integer shifts only, no MMX state touched.
! 62: movl (%ebx), %edx C src limb
! 63: movl PARAM_DST, %ebx
! 64:
! 65: shrdl( %cl, %edx, %eax) C return value (eax is 0 here, as size-1 is 0)
! 66:
! 67: shrl %cl, %edx
! 68:
! 69: movl %edx, (%ebx) C dst limb
! 70: popl %ebx
! 71:
! 72: ret
! 73:
! 74:
! 75: C -----------------------------------------------------------------------------
! 76: ALIGN(16) C avoid offset 0x1f
! 77: L(two_or_more):
! 78: C eax size-1
! 79: C ebx src
! 80: C ecx shift
! 81: C edx
! 82:
! 83: movl (%ebx), %edx C src low limb
! 84: negl %ecx
! 85:
! 86: addl $32, %ecx C ecx = 32-shift
! 87: movd PARAM_SHIFT, %mm6
! 88:
! 89: shll %cl, %edx C retval: low limb shifted left by cl, ecx being 32-shift
! 90: cmpl $UNROLL_THRESHOLD-1, %eax
! 91:
! 92: jae L(unroll) C unsigned test: size-1 >= UNROLL_THRESHOLD-1
! 93:
! 94:
! 95: C eax size-1
! 96: C ebx src
! 97: C ecx 32-shift
! 98: C edx retval
! 99: C
! 100: C mm6 shift
! 101:
! 102: movl PARAM_DST, %ecx
! 103: leal (%ebx,%eax,4), %ebx C ebx = &src[size-1]
! 104:
! 105: leal -4(%ecx,%eax,4), %ecx C ecx = &dst[size-2]
! 106: negl %eax C eax = -(size-1)
! 107:
! 108: C This loop runs at about 3 cycles/limb, which is the amount of
! 109: C decoding, and this is despite every second access being unaligned.
! 110:
! 111: L(simple):
! 112: C eax counter, -(size-1) to -1
! 113: C ebx &src[size-1]
! 114: C ecx &dst[size-2]
! 115: C edx retval
! 116: C
! 117: C mm0 scratch
! 118: C mm6 shift
! 119:
! 120: Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
! 121: incl %eax C sets ZF for the jnz below; psrlq and movd leave flags alone
! 122:
! 123: psrlq %mm6, %mm0
! 124:
! 125: Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
! 126: jnz L(simple)
! 127:
! 128: C mm0 still holds the last shifted qword, built from src[size-2] and src[size-1].
! 129: movq %mm0, (%ecx) C writes dst[size-2] and dst[size-1]
! 130: movl %edx, %eax C retval
! 131:
! 132: popl %ebx
! 133:
! 134: femms C leave MMX state before returning
! 135: ret
! 136:
! 137:
! 138: C -----------------------------------------------------------------------------
! 139: ALIGN(16)
! 140: L(unroll):
! 141: C eax size-1
! 142: C ebx src
! 143: C ecx 32-shift
! 144: C edx retval
! 145: C
! 146: C mm6 shift
! 147:
! 148: addl $32, %ecx C ecx = 64-shift
! 149: subl $7, %eax C size-8
! 150:
! 151: movd %ecx, %mm7
! 152: movl PARAM_DST, %ecx
! 153:
! 154: movq (%ebx), %mm2 C src low qword
! 155: leal (%ebx,%eax,4), %ebx C src end - 32
! 156:
! 157: testb $4, %cl C test bit 2 of the dst address
! 158: leal (%ecx,%eax,4), %ecx C dst end - 32
! 159:
! 160: notl %eax C -(size-7)
! 161: jz L(dst_aligned) C flags are from testb; leal and notl do not change them
! 162: C Bit 2 of dst set: write dst[0] alone so the qword stores start at dst[1].
! 163: psrlq %mm6, %mm2
! 164: incl %eax
! 165:
! 166: Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
! 167: movq 4(%ebx,%eax,4), %mm2 C new src low qword
! 168: L(dst_aligned):
! 169:
! 170: movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
! 171: nop C avoid bad cache line crossing
! 172:
! 173:
! 174: C This loop is the important bit, the rest is just support for it.
! 175: C Four src limbs are held at the start, and four more will be read.
! 176: C Four dst limbs will be written. This schedule seems necessary for
! 177: C full speed.
! 178: C
! 179: C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
! 180: C leaves 0 to 3 which can be tested with test $1 and $2.
! 181:
! 182: L(top):
! 183: C eax counter, -(size-7) step by +4 until >=0
! 184: C ebx src end - 32
! 185: C ecx dst end - 32
! 186: C edx retval
! 187: C
! 188: C mm0 src next qword
! 189: C mm1 scratch
! 190: C mm2 src prev qword
! 191: C mm6 shift
! 192: C mm7 64-shift
! 193:
! 194: psrlq %mm6, %mm2
! 195: addl $4, %eax C CF/ZF from this add are what the ja at the loop end tests
! 196:
! 197: movq %mm0, %mm1
! 198: psllq %mm7, %mm0
! 199:
! 200: por %mm0, %mm2
! 201: movq 4(%ebx,%eax,4), %mm0
! 202:
! 203: psrlq %mm6, %mm1
! 204: movq %mm2, -12(%ecx,%eax,4)
! 205:
! 206: movq %mm0, %mm2
! 207: psllq %mm7, %mm0
! 208:
! 209: por %mm0, %mm1
! 210: movq 12(%ebx,%eax,4), %mm0
! 211:
! 212: movq %mm1, -4(%ecx,%eax,4)
! 213: ja L(top) C jump if no carry and not zero
! 214:
! 215:
! 216:
! 217: C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
! 218: C to 3 representing respectively 3 to 0 further limbs.
! 219:
! 220: testl $2, %eax C testl to avoid bad cache line crossings
! 221: jnz L(finish_nottwo) C eax is 2 or 3: only 1 or 0 further limbs
! 222:
! 223: C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
! 224: C becomes new mm2 and a new mm0 is loaded.
! 225:
! 226: psrlq %mm6, %mm2
! 227: movq %mm0, %mm1
! 228:
! 229: psllq %mm7, %mm0
! 230: addl $2, %eax
! 231:
! 232: por %mm0, %mm2
! 233: movq 12(%ebx,%eax,4), %mm0
! 234:
! 235: movq %mm2, -4(%ecx,%eax,4)
! 236: movq %mm1, %mm2
! 237: L(finish_nottwo):
! 238:
! 239:
! 240: testb $1, %al C flags are kept for the jnz below; the MMX ops between do not change them
! 241: psrlq %mm6, %mm2
! 242:
! 243: movq %mm0, %mm1
! 244: psllq %mm7, %mm0
! 245:
! 246: por %mm0, %mm2
! 247: psrlq %mm6, %mm1
! 248:
! 249: movq %mm2, 4(%ecx,%eax,4)
! 250: jnz L(finish_even) C bit 0 of eax set: no further limb
! 251:
! 252:
! 253: C one further extra limb to process
! 254:
! 255: movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
! 256: popl %ebx
! 257:
! 258: movq %mm0, %mm2
! 259: psllq %mm7, %mm0
! 260:
! 261: por %mm0, %mm1
! 262: psrlq %mm6, %mm2
! 263:
! 264: movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
! 265: movd %mm2, 32-4(%ecx) C dst[size-1]
! 266:
! 267: movl %edx, %eax C retval
! 268:
! 269: femms C leave MMX state before returning
! 270: ret
! 271:
! 272:
! 273: nop C avoid bad cache line crossing
! 274: L(finish_even):
! 275: C no further extra limbs
! 276:
! 277: movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
! 278: movl %edx, %eax C retval
! 279:
! 280: popl %ebx
! 281:
! 282: femms C leave MMX state before returning
! 283: ret
! 284:
! 285: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>