Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mod_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.
! 2:
! 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P5: 28.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 29: C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 30: C mp_limb_t carry);
! 31: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 32: C mp_limb_t inverse);
! 33: C
! 34: C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of
! 35: C multiply by inverse without on-the-fly shifts. See that code for some
! 36: C general comments.
! 37: C
! 38: C Alternatives:
! 39: C
! 40: C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
! 41: C slower, probably more depending what it did to register usage. Using MMX
! 42: C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
! 43: C 3 cycles.
! 44:
! 45:
! 46: dnl These thresholds are the sizes where the multiply by inverse method is
! 47: dnl used, rather than plain "divl"s. Minimum value 2.
! 48: dnl
! 49: dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
! 50: dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.
! 51: dnl
! 52: dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
! 53: dnl cycles to setup, the threshold should be about ceil(70/16)==5, which is
! 54: dnl what happens in practice.
! 55: dnl
! 56: dnl An unnormalized divisor gets an extra 40 cycles at the end for the
! 57: dnl final (r*2^n)%(d*2^n) and shift. This increases the threshold by about
! 58: dnl 40/16=3.
! 59: dnl
! 60: dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this
! 61: dnl doesn't change the thresholds.
! 62: dnl
! 63: dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and
! 64: dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
! 65: dnl (branch free) and ensures the choice between div or mul is optimal.
! 66:
! 67: deflit(MUL_NORM_THRESHOLD, ifdef(`PIC',5,5))
! 68: deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))
! 69:
! 70: deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))
! 71:
! 72:
! 73: defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1
! 74: defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
! 75: defframe(PARAM_DIVISOR, 12)
! 76: defframe(PARAM_SIZE, 8)
! 77: defframe(PARAM_SRC, 4)
! 78:
! 79: dnl re-using parameter space
! 80: define(VAR_NORM, `PARAM_DIVISOR')
! 81: define(VAR_INVERSE, `PARAM_SIZE')
! 82:
! 83: TEXT
! 84:
! 85: ALIGN(8)
! 86: PROLOGUE(mpn_preinv_mod_1)
! 87: deflit(`FRAME',0)
! 88:
! 89: pushl %ebp FRAME_pushl()
! 90: pushl %esi FRAME_pushl()
! 91:
! 92: movl PARAM_SRC, %esi
! 93: movl PARAM_SIZE, %edx
! 94:
! 95: pushl %edi FRAME_pushl()
! 96: pushl %ebx FRAME_pushl()
! 97:
! 98: movl PARAM_DIVISOR, %ebp
! 99: movl PARAM_INVERSE, %eax
! 100:
! 101: movl -4(%esi,%edx,4), %edi C src high limb
! 102: leal -8(%esi,%edx,4), %esi C &src[size-2]
! 103:
! 104: movl $0, VAR_NORM
! 105: decl %edx
! 106:
! 107: jnz L(start_preinv)
! 108:
! 109: subl %ebp, %edi C src-divisor
! 110: popl %ebx
! 111:
! 112: sbbl %ecx, %ecx C -1 if underflow
! 113: movl %edi, %eax C src-divisor
! 114:
! 115: andl %ebp, %ecx C d if underflow
! 116: popl %edi
! 117:
! 118: addl %ecx, %eax C remainder, with possible addback
! 119: popl %esi
! 120:
! 121: popl %ebp
! 122:
! 123: ret
! 124:
! 125: EPILOGUE()
! 126:
! 127:
! 128: ALIGN(8)
! 129: PROLOGUE(mpn_mod_1c)
! 130: deflit(`FRAME',0)
! 131:
! 132: movl PARAM_DIVISOR, %eax
! 133: movl PARAM_SIZE, %ecx
! 134:
! 135: sarl $31, %eax C d highbit
! 136: movl PARAM_CARRY, %edx
! 137:
! 138: orl %ecx, %ecx
! 139: jz L(done_edx) C result==carry if size==0
! 140:
! 141: andl $MUL_NORM_DELTA, %eax
! 142: pushl %ebp FRAME_pushl()
! 143:
! 144: addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh
! 145: pushl %esi FRAME_pushl()
! 146:
! 147: movl PARAM_SRC, %esi
! 148: movl PARAM_DIVISOR, %ebp
! 149:
! 150: cmpl %eax, %ecx
! 151: jb L(divide_top)
! 152:
! 153: movl %edx, %eax C carry as pretend src high limb
! 154: leal 1(%ecx), %edx C size+1
! 155:
! 156: cmpl $0x1000000, %ebp
! 157: jmp L(mul_by_inverse_1c)
! 158:
! 159: EPILOGUE()
! 160:
! 161:
! 162: ALIGN(8)
! 163: PROLOGUE(mpn_mod_1)
! 164: deflit(`FRAME',0)
! 165:
! 166: movl PARAM_SIZE, %ecx
! 167: pushl %ebp FRAME_pushl()
! 168:
! 169: orl %ecx, %ecx
! 170: jz L(done_zero)
! 171:
! 172: movl PARAM_SRC, %eax
! 173: movl PARAM_DIVISOR, %ebp
! 174:
! 175: sarl $31, %ebp C -1 if divisor normalized
! 176: movl -4(%eax,%ecx,4), %eax C src high limb
! 177:
! 178: movl PARAM_DIVISOR, %edx
! 179: pushl %esi FRAME_pushl()
! 180:
! 181: andl $MUL_NORM_DELTA, %ebp
! 182: cmpl %edx, %eax C carry flag if high<divisor
! 183:
! 184: sbbl %edx, %edx C -1 if high<divisor
! 185: addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh
! 186:
! 187: addl %edx, %ecx C size-1 if high<divisor
! 188: jz L(done_eax)
! 189:
! 190: cmpl %ebp, %ecx
! 191: movl PARAM_DIVISOR, %ebp
! 192:
! 193: movl PARAM_SRC, %esi
! 194: jae L(mul_by_inverse)
! 195:
! 196: andl %eax, %edx C high as initial carry if high<divisor
! 197:
! 198:
! 199: L(divide_top):
! 200: C eax scratch (quotient)
! 201: C ebx
! 202: C ecx counter, limbs, decrementing
! 203: C edx scratch (remainder)
! 204: C esi src
! 205: C edi
! 206: C ebp divisor
! 207:
! 208: movl -4(%esi,%ecx,4), %eax
! 209:
! 210: divl %ebp
! 211:
! 212: decl %ecx
! 213: jnz L(divide_top)
! 214:
! 215:
! 216: popl %esi
! 217: popl %ebp
! 218:
! 219: L(done_edx):
! 220: movl %edx, %eax
! 221:
! 222: ret
! 223:
! 224:
! 225: L(done_zero):
! 226: xorl %eax, %eax
! 227: popl %ebp
! 228:
! 229: ret
! 230:
! 231:
! 232: C -----------------------------------------------------------------------------
! 233: C
! 234: C The divisor is normalized using the same code as the pentium
! 235: C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a
! 236: C couple of cycles, but is more or less unavoidable.
! 237:
! 238:
! 239: ALIGN(8)
! 240: L(mul_by_inverse):
! 241: C eax src high limb
! 242: C ebx
! 243: C ecx size or size-1
! 244: C edx
! 245: C esi src
! 246: C edi
! 247: C ebp divisor
! 248:
! 249: movl PARAM_SIZE, %edx
! 250: cmpl $0x1000000, %ebp
! 251:
! 252: L(mul_by_inverse_1c):
! 253: sbbl %ecx, %ecx
! 254: cmpl $0x10000, %ebp
! 255:
! 256: sbbl $0, %ecx
! 257: cmpl $0x100, %ebp
! 258:
! 259: sbbl $0, %ecx
! 260: pushl %edi FRAME_pushl()
! 261:
! 262: pushl %ebx FRAME_pushl()
! 263: movl %ebp, %ebx C d
! 264:
! 265: ifdef(`PIC',`
! 266: call L(here)
! 267: L(here):
! 268: popl %edi
! 269: leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
! 270:
! 271: shrl %cl, %ebx
! 272: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi
! 273:
! 274: C AGI
! 275: movl __clz_tab@GOT(%edi), %edi
! 276: addl $-34, %ecx
! 277:
! 278: C AGI
! 279: movb (%ebx,%edi), %bl
! 280:
! 281: ',`
! 282: leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
! 283:
! 284: shrl %cl, %ebx
! 285: addl $-34, %ecx
! 286:
! 287: C AGI
! 288: movb __clz_tab(%ebx), %bl
! 289: ')
! 290: movl %eax, %edi C carry -> n1
! 291:
! 292: addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1
! 293: leal -8(%esi,%edx,4), %esi C &src[size-2]
! 294:
! 295: xorl $-1, %ecx C clz
! 296: movl $-1, %edx
! 297:
! 298: ASSERT(e,`pushl %eax C clz calculation same as bsrl
! 299: bsrl %ebp, %eax
! 300: xorl $31, %eax
! 301: cmpl %eax, %ecx
! 302: popl %eax')
! 303:
! 304: shll %cl, %ebp C d normalized
! 305: movl %ecx, VAR_NORM
! 306:
! 307: subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1
! 308: movl $-1, %eax
! 309:
! 310: divl %ebp C floor (b*(b-d)-1) / d
! 311:
! 312: L(start_preinv):
! 313: movl %eax, VAR_INVERSE
! 314: movl %ebp, %eax C d
! 315:
! 316: movl %ecx, %edx C fake high, will cancel
! 317:
! 318:
! 319: C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
! 320: C high limb, and this may be greater than the divisor and may need one copy
! 321: C of the divisor subtracted (only one, because the divisor is normalized).
! 322: C This is accomplished by having the initial ecx:edi act as a fake previous
! 323: C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is
! 324: C subtracted from ecx:edi, with the usual addback if it produces an
! 325: C underflow.
! 326:
! 327:
! 328: L(inverse_top):
! 329: C eax scratch (n10, n1, q1, etc)
! 330: C ebx scratch (nadj, src limit)
! 331: C ecx old n2
! 332: C edx scratch
! 333: C esi src pointer, &src[size-2] to &src[0]
! 334: C edi old n10
! 335: C ebp d
! 336:
! 337: subl %eax, %edi C low n - (q1+1)*d
! 338: movl (%esi), %eax C new n10
! 339:
! 340: sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1
! 341: movl %ebp, %ebx C d
! 342:
! 343: sarl $31, %eax C -n1
! 344: andl %ebp, %ecx C d if underflow
! 345:
! 346: addl %edi, %ecx C remainder -> n2, and possible addback
! 347: ASSERT(b,`cmpl %ebp, %ecx')
! 348: andl %eax, %ebx C -n1 & d
! 349:
! 350: movl (%esi), %edi C n10
! 351: andl $1, %eax C n1
! 352:
! 353: addl %ecx, %eax C n2+n1
! 354: addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 355:
! 356: mull VAR_INVERSE C m*(n2+n1)
! 357:
! 358: addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag
! 359: leal 1(%ecx), %eax C 1+n2
! 360:
! 361: adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
! 362: movl PARAM_SRC, %ebx
! 363:
! 364: sbbl $0, %eax C use q1 if q1+1 overflows
! 365: subl $4, %esi C step src ptr
! 366:
! 367: mull %ebp C (q1+1)*d
! 368:
! 369: cmpl %ebx, %esi
! 370: jae L(inverse_top)
! 371:
! 372:
! 373:
! 374: C %edi (after subtract and addback) is the remainder modulo d*2^n
! 375: C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
! 376: C right shifting by n.
! 377: C
! 378: C If d was already normalized on entry so that n==0 then nothing is
! 379: C needed here. This is always the case for preinv_mod_1. For mod_1
! 380: C or mod_1c the chance of n==0 is low, but about 40 cycles can be
! 381: C saved.
! 382:
! 383: subl %eax, %edi C low n - (q1+1)*d
! 384: movl %ecx, %ebx C n2
! 385:
! 386: sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1
! 387: xorl %esi, %esi C next n2
! 388:
! 389: andl %ebp, %ebx C d if underflow
! 390: movl VAR_NORM, %ecx
! 391:
! 392: addl %ebx, %edi C remainder, with possible addback
! 393: orl %ecx, %ecx
! 394:
! 395: jz L(done_mul_edi)
! 396:
! 397:
! 398: C Here using %esi=n2 and %edi=n10, unlike the above
! 399:
! 400: shldl( %cl, %edi, %esi) C n2
! 401:
! 402: shll %cl, %edi C n10
! 403:
! 404: movl %edi, %eax C n10
! 405: movl %edi, %ebx C n10
! 406:
! 407: sarl $31, %ebx C -n1
! 408:
! 409: shrl $31, %eax C n1
! 410: andl %ebp, %ebx C -n1 & d
! 411:
! 412: addl %esi, %eax C n2+n1
! 413: addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
! 414:
! 415: mull VAR_INVERSE C m*(n2+n1)
! 416:
! 417: addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag
! 418: leal 1(%esi), %eax C 1+n2
! 419:
! 420: adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 421:
! 422: sbbl $0, %eax C use q1 if q1+1 overflows
! 423:
! 424: mull %ebp C (q1+1)*d
! 425:
! 426: subl %eax, %edi C low n - (q1+1)*d
! 427: popl %ebx
! 428:
! 429: sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
! 430: movl %edi, %eax
! 431:
! 432: andl %ebp, %esi C d if underflow
! 433: popl %edi
! 434:
! 435: addl %esi, %eax C addback if underflow
! 436: popl %esi
! 437:
! 438: shrl %cl, %eax C denorm remainder
! 439: popl %ebp
! 440:
! 441: ret
! 442:
! 443:
! 444: L(done_mul_edi):
! 445: movl %edi, %eax
! 446: popl %ebx
! 447:
! 448: popl %edi
! 449: L(done_eax):
! 450: popl %esi
! 451:
! 452: popl %ebp
! 453:
! 454: ret
! 455:
! 456: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>