Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mmx/mod_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
! 2: dnl
! 3: dnl P6MMX: 24.0 cycles/limb.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 30: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 31: C mp_limb_t carry);
! 32: C
! 33: C The code here very similar to mpn_divrem_1, but with the quotient
! 34: C discarded. What's here probably isn't optimal.
! 35: C
! 36: C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
! 37: C comments.
! 38:
! 39:
! 40: dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
! 41: dnl used, rather than plain "divl"s. Minimum value 2.
! 42:
! 43: deflit(MUL_THRESHOLD, 4)
! 44:
! 45:
! 46: defframe(PARAM_CARRY, 16)
! 47: defframe(PARAM_DIVISOR,12)
! 48: defframe(PARAM_SIZE, 8)
! 49: defframe(PARAM_SRC, 4)
! 50:
! 51: defframe(SAVE_EBX, -4)
! 52: defframe(SAVE_ESI, -8)
! 53: defframe(SAVE_EDI, -12)
! 54: defframe(SAVE_EBP, -16)
! 55:
! 56: defframe(VAR_NORM, -20)
! 57: defframe(VAR_INVERSE, -24)
! 58: defframe(VAR_SRC_STOP,-28)
! 59:
! 60: deflit(STACK_SPACE, 28)
! 61:
! 62: .text
! 63: ALIGN(16)
! 64:
! 65: PROLOGUE(mpn_mod_1c)
! 66: deflit(`FRAME',0)
! 67: movl PARAM_CARRY, %edx
! 68: movl PARAM_SIZE, %ecx
! 69: subl $STACK_SPACE, %esp
! 70: deflit(`FRAME',STACK_SPACE)
! 71:
! 72: movl %ebp, SAVE_EBP
! 73: movl PARAM_DIVISOR, %ebp
! 74:
! 75: movl %esi, SAVE_ESI
! 76: movl PARAM_SRC, %esi
! 77: jmp LF(mpn_mod_1,start_1c)
! 78:
! 79: EPILOGUE()
! 80:
! 81:
! 82: ALIGN(16)
! 83: PROLOGUE(mpn_mod_1)
! 84: deflit(`FRAME',0)
! 85:
! 86: movl $0, %edx C initial carry (if can't skip a div)
! 87: movl PARAM_SIZE, %ecx
! 88: subl $STACK_SPACE, %esp
! 89: deflit(`FRAME',STACK_SPACE)
! 90:
! 91: movl %esi, SAVE_ESI
! 92: movl PARAM_SRC, %esi
! 93:
! 94: movl %ebp, SAVE_EBP
! 95: movl PARAM_DIVISOR, %ebp
! 96:
! 97: orl %ecx, %ecx
! 98: jz L(divide_done)
! 99:
! 100: movl -4(%esi,%ecx,4), %eax C src high limb
! 101:
! 102: cmpl %ebp, %eax C carry flag if high<divisor
! 103:
! 104: cmovc( %eax, %edx) C src high limb as initial carry
! 105: sbbl $0, %ecx C size-1 to skip one div
! 106: jz L(divide_done)
! 107:
! 108:
! 109: ALIGN(16)
! 110: L(start_1c):
! 111: C eax
! 112: C ebx
! 113: C ecx size
! 114: C edx carry
! 115: C esi src
! 116: C edi
! 117: C ebp divisor
! 118:
! 119: cmpl $MUL_THRESHOLD, %ecx
! 120: jae L(mul_by_inverse)
! 121:
! 122:
! 123: orl %ecx, %ecx
! 124: jz L(divide_done)
! 125:
! 126:
! 127: L(divide_top):
! 128: C eax scratch (quotient)
! 129: C ebx
! 130: C ecx counter, limbs, decrementing
! 131: C edx scratch (remainder)
! 132: C esi src
! 133: C edi
! 134: C ebp
! 135:
! 136: movl -4(%esi,%ecx,4), %eax
! 137:
! 138: divl %ebp
! 139:
! 140: decl %ecx
! 141: jnz L(divide_top)
! 142:
! 143:
! 144: L(divide_done):
! 145: movl SAVE_ESI, %esi
! 146: movl %edx, %eax
! 147:
! 148: movl SAVE_EBP, %ebp
! 149: addl $STACK_SPACE, %esp
! 150:
! 151: ret
! 152:
! 153:
! 154:
! 155: C -----------------------------------------------------------------------------
! 156:
! 157: L(mul_by_inverse):
! 158: C eax
! 159: C ebx
! 160: C ecx size
! 161: C edx carry
! 162: C esi src
! 163: C edi
! 164: C ebp divisor
! 165:
! 166: movl %ebx, SAVE_EBX
! 167: leal -4(%esi), %ebx
! 168:
! 169: movl %ebx, VAR_SRC_STOP
! 170: movl %ecx, %ebx C size
! 171:
! 172: movl %edi, SAVE_EDI
! 173: movl %edx, %edi C carry
! 174:
! 175: bsrl %ebp, %ecx C 31-l
! 176: movl $-1, %edx
! 177:
! 178: leal 1(%ecx), %eax C 32-l
! 179: xorl $31, %ecx C l
! 180:
! 181: movl %ecx, VAR_NORM
! 182: shll %cl, %ebp C d normalized
! 183:
! 184: movd %eax, %mm7
! 185: movl $-1, %eax
! 186: subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
! 187:
! 188: divl %ebp C floor (b*(b-d)-1) / d
! 189:
! 190: C
! 191:
! 192: movl %eax, VAR_INVERSE
! 193: leal -12(%esi,%ebx,4), %eax C &src[size-3]
! 194:
! 195: movl 8(%eax), %esi C src high limb
! 196: movl 4(%eax), %edx C src second highest limb
! 197:
! 198: shldl( %cl, %esi, %edi) C n2 = carry,high << l
! 199:
! 200: shldl( %cl, %edx, %esi) C n10 = high,second << l
! 201:
! 202: movl %eax, %ecx C &src[size-3]
! 203:
! 204:
! 205: ifelse(MUL_THRESHOLD,2,`
! 206: cmpl $2, %ebx
! 207: je L(inverse_two_left)
! 208: ')
! 209:
! 210:
! 211: C The dependent chain here is the same as in mpn_divrem_1, but a few
! 212: C instructions are saved by not needing to store the quotient limbs. This
! 213: C gets it down to 24 c/l, which is still a bit away from a theoretical 19
! 214: C c/l.
! 215:
! 216: ALIGN(16)
! 217: L(inverse_top):
! 218: C eax scratch
! 219: C ebx scratch (nadj, q1)
! 220: C ecx src pointer, decrementing
! 221: C edx scratch
! 222: C esi n10
! 223: C edi n2
! 224: C ebp divisor
! 225: C
! 226: C mm0 scratch (src qword)
! 227: C mm7 rshift for normalization
! 228:
! 229:
! 230: movl %esi, %eax
! 231: movl %ebp, %ebx
! 232:
! 233: sarl $31, %eax C -n1
! 234:
! 235: andl %eax, %ebx C -n1 & d
! 236: negl %eax C n1
! 237:
! 238: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 239: addl %edi, %eax C n2+n1
! 240:
! 241: mull VAR_INVERSE C m*(n2+n1)
! 242:
! 243: movq (%ecx), %mm0 C next src limb and the one below it
! 244: subl $4, %ecx
! 245:
! 246: C
! 247:
! 248: C
! 249:
! 250: C
! 251:
! 252: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 253: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 254: movl %ebp, %eax C d
! 255:
! 256: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 257: jz L(q1_ff)
! 258:
! 259: mull %ebx C (q1+1)*d
! 260:
! 261: psrlq %mm7, %mm0
! 262: movl VAR_SRC_STOP, %ebx
! 263:
! 264: C
! 265:
! 266: C
! 267:
! 268: C
! 269:
! 270: subl %eax, %esi
! 271:
! 272: sbbl %edx, %edi C n - (q1+1)*d
! 273: movl %esi, %edi C remainder -> n2
! 274: leal (%ebp,%esi), %edx
! 275:
! 276: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 277: movd %mm0, %esi
! 278: cmpl %ebx, %ecx
! 279:
! 280: jne L(inverse_top)
! 281:
! 282:
! 283: L(inverse_loop_done):
! 284:
! 285:
! 286: C -----------------------------------------------------------------------------
! 287:
! 288: L(inverse_two_left):
! 289: C eax scratch
! 290: C ebx scratch (nadj, q1)
! 291: C ecx &src[-1]
! 292: C edx scratch
! 293: C esi n10
! 294: C edi n2
! 295: C ebp divisor
! 296: C
! 297: C mm0 scratch (src dword)
! 298: C mm7 rshift
! 299:
! 300: movl %esi, %eax
! 301: movl %ebp, %ebx
! 302:
! 303: sarl $31, %eax C -n1
! 304:
! 305: andl %eax, %ebx C -n1 & d
! 306: negl %eax C n1
! 307:
! 308: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 309: addl %edi, %eax C n2+n1
! 310:
! 311: mull VAR_INVERSE C m*(n2+n1)
! 312:
! 313: movd 4(%ecx), %mm0 C src low limb
! 314:
! 315: C
! 316:
! 317: C
! 318:
! 319: C
! 320:
! 321: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 322: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 323:
! 324: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 325:
! 326: sbbl $0, %ebx
! 327: movl %ebp, %eax C d
! 328:
! 329: mull %ebx C (q1+1)*d
! 330:
! 331: psllq $32, %mm0
! 332:
! 333: psrlq %mm7, %mm0
! 334:
! 335: C
! 336:
! 337: C
! 338:
! 339: subl %eax, %esi
! 340:
! 341: sbbl %edx, %edi C n - (q1+1)*d
! 342: movl %esi, %edi C remainder -> n2
! 343: leal (%ebp,%esi), %edx
! 344:
! 345: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 346: movd %mm0, %esi
! 347:
! 348:
! 349: C One limb left
! 350:
! 351: C eax scratch
! 352: C ebx scratch (nadj, q1)
! 353: C ecx
! 354: C edx scratch
! 355: C esi n10
! 356: C edi n2
! 357: C ebp divisor
! 358: C
! 359: C mm0 src limb, shifted
! 360: C mm7 rshift
! 361:
! 362: movl %esi, %eax
! 363: movl %ebp, %ebx
! 364:
! 365: sarl $31, %eax C -n1
! 366:
! 367: andl %eax, %ebx C -n1 & d
! 368: negl %eax C n1
! 369:
! 370: addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 371: addl %edi, %eax C n2+n1
! 372:
! 373: mull VAR_INVERSE C m*(n2+n1)
! 374:
! 375: movl VAR_NORM, %ecx C for final denorm
! 376:
! 377: C
! 378:
! 379: C
! 380:
! 381: C
! 382:
! 383: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 384: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 385:
! 386: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 387:
! 388: sbbl $0, %ebx
! 389: movl %ebp, %eax C d
! 390:
! 391: mull %ebx C (q1+1)*d
! 392:
! 393: movl SAVE_EBX, %ebx
! 394:
! 395: C
! 396:
! 397: C
! 398:
! 399: C
! 400:
! 401: subl %eax, %esi
! 402:
! 403: sbbl %edx, %edi C n - (q1+1)*d
! 404: leal (%ebp,%esi), %edx
! 405: movl SAVE_EBP, %ebp
! 406:
! 407: movl %esi, %eax C remainder
! 408: movl SAVE_ESI, %esi
! 409:
! 410: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
! 411: movl SAVE_EDI, %edi
! 412:
! 413: shrl %cl, %eax C denorm remainder
! 414: addl $STACK_SPACE, %esp
! 415: emms
! 416:
! 417: ret
! 418:
! 419:
! 420: C -----------------------------------------------------------------------------
! 421: C
! 422: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 423: C of q*d is simply -d and the remainder n-q*d = n10+d
! 424:
! 425: L(q1_ff):
! 426: C eax (divisor)
! 427: C ebx (q1+1 == 0)
! 428: C ecx src pointer
! 429: C edx
! 430: C esi n10
! 431: C edi (n2)
! 432: C ebp divisor
! 433:
! 434: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
! 435: movl VAR_SRC_STOP, %edx
! 436: psrlq %mm7, %mm0
! 437:
! 438: movd %mm0, %esi C next n10
! 439: cmpl %ecx, %edx
! 440: jne L(inverse_top)
! 441:
! 442: jmp L(inverse_loop_done)
! 443:
! 444: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>