Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/mod_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
! 2: dnl
! 3: dnl K7: 17.0 cycles/limb.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 30: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 31: C mp_limb_t carry);
! 32: C
! 33: C The code here is the same as mpn_divrem_1, but with the quotient
! 34: C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments.
! 35:
! 36:
! 37: dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
! 38: dnl used, rather than plain "divl"s. Minimum value 2.
! 39: dnl
! 40: dnl The inverse takes about 50 cycles to calculate, but after that the
! 41: dnl multiply is 17 c/l versus division at 41 c/l.
! 42: dnl
! 43: dnl Using mul or div is about the same speed at 3 limbs, so the threshold
! 44: dnl is set to 4 to get the smaller div code used at 3.
! 45:
! 46: deflit(MUL_THRESHOLD, 4)
! 47:
! 48:
! 49: defframe(PARAM_CARRY, 16)
! 50: defframe(PARAM_DIVISOR,12)
! 51: defframe(PARAM_SIZE, 8)
! 52: defframe(PARAM_SRC, 4)
! 53:
! 54: defframe(SAVE_EBX, -4)
! 55: defframe(SAVE_ESI, -8)
! 56: defframe(SAVE_EDI, -12)
! 57: defframe(SAVE_EBP, -16)
! 58:
! 59: defframe(VAR_NORM, -20)
! 60: defframe(VAR_INVERSE, -24)
! 61: defframe(VAR_SRC_STOP,-28)
! 62:
! 63: deflit(STACK_SPACE, 28)
! 64:
! 65: .text
! 66: ALIGN(32)
! 67:
! 68: PROLOGUE(mpn_mod_1c)
! 69: deflit(`FRAME',0)
! 70: movl PARAM_CARRY, %edx
! 71: movl PARAM_SIZE, %ecx
! 72: subl $STACK_SPACE, %esp
! 73: deflit(`FRAME',STACK_SPACE)
! 74:
! 75: movl %ebp, SAVE_EBP
! 76: movl PARAM_DIVISOR, %ebp
! 77:
! 78: movl %esi, SAVE_ESI
! 79: movl PARAM_SRC, %esi
! 80: jmp LF(mpn_mod_1,start_1c)
! 81:
! 82: EPILOGUE()
! 83:
! 84:
! 85: ALIGN(32)
! 86: PROLOGUE(mpn_mod_1)
! 87: deflit(`FRAME',0)
! 88:
! 89: movl PARAM_SIZE, %ecx
! 90: movl $0, %edx C initial carry (if can't skip a div)
! 91: subl $STACK_SPACE, %esp
! 92: deflit(`FRAME',STACK_SPACE)
! 93:
! 94: movl %esi, SAVE_ESI
! 95: movl PARAM_SRC, %esi
! 96:
! 97: movl %ebp, SAVE_EBP
! 98: movl PARAM_DIVISOR, %ebp
! 99:
! 100: orl %ecx, %ecx
! 101: jz L(divide_done)
! 102:
! 103: movl -4(%esi,%ecx,4), %eax C src high limb
! 104:
! 105: cmpl %ebp, %eax C carry flag if high<divisor
! 106:
! 107: cmovc( %eax, %edx) C src high limb as initial carry
! 108: sbbl $0, %ecx C size-1 to skip one div
! 109: jz L(divide_done)
! 110:
! 111:
! 112: ALIGN(16)
! 113: L(start_1c):
! 114: C eax
! 115: C ebx
! 116: C ecx size
! 117: C edx carry
! 118: C esi src
! 119: C edi
! 120: C ebp divisor
! 121:
! 122: cmpl $MUL_THRESHOLD, %ecx
! 123: jae L(mul_by_inverse)
! 124:
! 125:
! 126:
! 127: C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
! 128: C but it's already fast and compact, and there's nothing to gain by
! 129: C expanding it out.
! 130: C
! 131: C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
! 132:
! 133: orl %ecx, %ecx
! 134: jz L(divide_done)
! 135:
! 136:
! 137: L(divide_top):
! 138: C eax scratch (quotient)
! 139: C ebx
! 140: C ecx counter, limbs, decrementing
! 141: C edx scratch (remainder)
! 142: C esi src
! 143: C edi
! 144: C ebp
! 145:
! 146: movl -4(%esi,%ecx,4), %eax
! 147:
! 148: divl PARAM_DIVISOR
! 149:
! 150: decl %ecx
! 151: jnz L(divide_top)
! 152:
! 153:
! 154: L(divide_done):
! 155: movl SAVE_ESI, %esi
! 156: movl SAVE_EBP, %ebp
! 157: addl $STACK_SPACE, %esp
! 158:
! 159: movl %edx, %eax
! 160:
! 161: ret
! 162:
! 163:
! 164:
! 165: C -----------------------------------------------------------------------------
! 166:
! 167: L(mul_by_inverse):
! 168: C eax
! 169: C ebx
! 170: C ecx size
! 171: C edx carry
! 172: C esi src
! 173: C edi
! 174: C ebp divisor
! 175:
! 176: bsrl %ebp, %eax C 31-l
! 177:
! 178: movl %ebx, SAVE_EBX
! 179: leal -4(%esi), %ebx
! 180:
! 181: movl %ebx, VAR_SRC_STOP
! 182: movl %edi, SAVE_EDI
! 183:
! 184: movl %ecx, %ebx C size
! 185: movl $31, %ecx
! 186:
! 187: movl %edx, %edi C carry
! 188: movl $-1, %edx
! 189:
! 190: C
! 191:
! 192: xorl %eax, %ecx C l
! 193: incl %eax C 32-l
! 194:
! 195: shll %cl, %ebp C d normalized
! 196: movl %ecx, VAR_NORM
! 197:
! 198: movd %eax, %mm7
! 199:
! 200: movl $-1, %eax
! 201: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
! 202:
! 203: divl %ebp C floor (b*(b-d)-1) / d
! 204:
! 205: C
! 206:
! 207: movl %eax, VAR_INVERSE
! 208: leal -12(%esi,%ebx,4), %eax C &src[size-3]
! 209:
! 210: movl 8(%eax), %esi C src high limb
! 211: movl 4(%eax), %edx C src second highest limb
! 212:
! 213: shldl( %cl, %esi, %edi) C n2 = carry,high << l
! 214:
! 215: shldl( %cl, %edx, %esi) C n10 = high,second << l
! 216:
! 217: movl %eax, %ecx C &src[size-3]
! 218:
! 219:
! 220: ifelse(MUL_THRESHOLD,2,`
! 221: cmpl $2, %ebx
! 222: je L(inverse_two_left)
! 223: ')
! 224:
! 225:
! 226: C The dependent chain here is the same as in mpn_divrem_1, but a few
! 227: C instructions are saved by not needing to store the quotient limbs.
! 228: C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
! 229: C
! 230: C There's four dummy instructions in the loop, all of which are necessary
! 231: C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
! 232: C or changed from load to store or vice versa. They're not completely
! 233: C random, since they correspond to what mpn_divrem_1 has, but there's no
! 234: C obvious reason why they're necessary. Presumably they induce something
! 235: C good in the out of order execution, perhaps through some load/store
! 236: C ordering and/or decoding effects.
! 237: C
! 238: C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
! 239: C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
! 240: C about 13.5 c/l.
! 241:
! 242: ALIGN(32)
! 243: L(inverse_top):
! 244: C eax scratch
! 245: C ebx scratch (nadj, q1)
! 246: C ecx src pointer, decrementing
! 247: C edx scratch
! 248: C esi n10
! 249: C edi n2
! 250: C ebp divisor
! 251: C
! 252: C mm0 scratch (src qword)
! 253: C mm7 rshift for normalization
! 254:
! 255: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 256: movl %edi, %eax C n2
! 257: movl PARAM_SIZE, %ebx C dummy
! 258:
! 259: leal (%ebp,%esi), %ebx
! 260: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 261: sbbl $-1, %eax C n2+n1
! 262:
! 263: mull VAR_INVERSE C m*(n2+n1)
! 264:
! 265: movq (%ecx), %mm0 C next src limb and the one below it
! 266: subl $4, %ecx
! 267:
! 268: movl %ecx, PARAM_SIZE C dummy
! 269:
! 270: C
! 271:
! 272: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 273: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 274: movl %ebp, %eax C d
! 275:
! 276: C
! 277:
! 278: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 279: jz L(q1_ff)
! 280: nop C dummy
! 281:
! 282: mull %ebx C (q1+1)*d
! 283:
! 284: psrlq %mm7, %mm0
! 285: leal 0(%ecx), %ecx C dummy
! 286:
! 287: C
! 288:
! 289: C
! 290:
! 291: subl %eax, %esi
! 292: movl VAR_SRC_STOP, %eax
! 293:
! 294: C
! 295:
! 296: sbbl %edx, %edi C n - (q1+1)*d
! 297: movl %esi, %edi C remainder -> n2
! 298: leal (%ebp,%esi), %edx
! 299:
! 300: movd %mm0, %esi
! 301:
! 302: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 303: cmpl %eax, %ecx
! 304: jne L(inverse_top)
! 305:
! 306:
! 307: L(inverse_loop_done):
! 308:
! 309:
! 310: C -----------------------------------------------------------------------------
! 311:
! 312: L(inverse_two_left):
! 313: C eax scratch
! 314: C ebx scratch (nadj, q1)
! 315: C ecx &src[-1]
! 316: C edx scratch
! 317: C esi n10
! 318: C edi n2
! 319: C ebp divisor
! 320: C
! 321: C mm0 scratch (src dword)
! 322: C mm7 rshift
! 323:
! 324: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 325: movl %edi, %eax C n2
! 326:
! 327: leal (%ebp,%esi), %ebx
! 328: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 329: sbbl $-1, %eax C n2+n1
! 330:
! 331: mull VAR_INVERSE C m*(n2+n1)
! 332:
! 333: movd 4(%ecx), %mm0 C src low limb
! 334:
! 335: C
! 336:
! 337: C
! 338:
! 339: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 340: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 341: movl %ebp, %eax C d
! 342:
! 343: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 344:
! 345: sbbl $0, %ebx
! 346:
! 347: mull %ebx C (q1+1)*d
! 348:
! 349: psllq $32, %mm0
! 350:
! 351: psrlq %mm7, %mm0
! 352:
! 353: C
! 354:
! 355: subl %eax, %esi
! 356:
! 357: C
! 358:
! 359: sbbl %edx, %edi C n - (q1+1)*d
! 360: movl %esi, %edi C remainder -> n2
! 361: leal (%ebp,%esi), %edx
! 362:
! 363: movd %mm0, %esi
! 364:
! 365: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 366:
! 367:
! 368: C One limb left
! 369:
! 370: C eax scratch
! 371: C ebx scratch (nadj, q1)
! 372: C ecx
! 373: C edx scratch
! 374: C esi n10
! 375: C edi n2
! 376: C ebp divisor
! 377: C
! 378: C mm0 src limb, shifted
! 379: C mm7 rshift
! 380:
! 381: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 382: movl %edi, %eax C n2
! 383:
! 384: leal (%ebp,%esi), %ebx
! 385: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 386: sbbl $-1, %eax C n2+n1
! 387:
! 388: mull VAR_INVERSE C m*(n2+n1)
! 389:
! 390: movl VAR_NORM, %ecx C for final denorm
! 391:
! 392: C
! 393:
! 394: C
! 395:
! 396: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 397: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 398: movl %ebp, %eax C d
! 399:
! 400: C
! 401:
! 402: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 403:
! 404: sbbl $0, %ebx
! 405:
! 406: mull %ebx C (q1+1)*d
! 407:
! 408: movl SAVE_EBX, %ebx
! 409:
! 410: C
! 411:
! 412: C
! 413:
! 414: subl %eax, %esi
! 415:
! 416: movl %esi, %eax C remainder
! 417: movl SAVE_ESI, %esi
! 418:
! 419: sbbl %edx, %edi C n - (q1+1)*d
! 420: leal (%ebp,%eax), %edx
! 421: movl SAVE_EBP, %ebp
! 422:
! 423: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
! 424: movl SAVE_EDI, %edi
! 425:
! 426: shrl %cl, %eax C denorm remainder
! 427: addl $STACK_SPACE, %esp
! 428: emms
! 429:
! 430: ret
! 431:
! 432:
! 433: C -----------------------------------------------------------------------------
! 434: C
! 435: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 436: C of q*d is simply -d and the remainder n-q*d = n10+d
! 437:
! 438: L(q1_ff):
! 439: C eax (divisor)
! 440: C ebx (q1+1 == 0)
! 441: C ecx src pointer
! 442: C edx
! 443: C esi n10
! 444: C edi (n2)
! 445: C ebp divisor
! 446:
! 447: movl VAR_SRC_STOP, %edx
! 448: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
! 449: psrlq %mm7, %mm0
! 450:
! 451: movd %mm0, %esi C next n10
! 452:
! 453: cmpl %ecx, %edx
! 454: jne L(inverse_top)
! 455: jmp L(inverse_loop_done)
! 456:
! 457: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>