Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_basecase.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
! 2: dnl
! 3: dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
! 4: dnl unrolling).
! 5:
! 6:
! 7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 8: dnl
! 9: dnl This file is part of the GNU MP Library.
! 10: dnl
! 11: dnl The GNU MP Library is free software; you can redistribute it and/or
! 12: dnl modify it under the terms of the GNU Lesser General Public License as
! 13: dnl published by the Free Software Foundation; either version 2.1 of the
! 14: dnl License, or (at your option) any later version.
! 15: dnl
! 16: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 19: dnl Lesser General Public License for more details.
! 20: dnl
! 21: dnl You should have received a copy of the GNU Lesser General Public
! 22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 24: dnl Suite 330, Boston, MA 02111-1307, USA.
! 25:
! 26:
! 27: include(`../config.m4')
! 28:
! 29:
! 30: dnl K6: UNROLL_COUNT cycles/product (approx)
! 31: dnl 8 9.75
! 32: dnl 16 9.3
! 33: dnl 32 9.3
! 34: dnl Maximum possible with the current code is 32.
! 35: dnl
! 36: dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
! 37: dnl might explain it's good performance.
! 38:
! 39: deflit(UNROLL_COUNT, 16)
! 40:
! 41:
! 42: C void mpn_mul_basecase (mp_ptr wp,
! 43: C mp_srcptr xp, mp_size_t xsize,
! 44: C mp_srcptr yp, mp_size_t ysize);
! 45: C
! 46: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
! 47: C wp,xsize+ysize.
! 48: C
! 49: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
! 50: C it's faster because it does most of the mpn_addmul_1() entry code only
! 51: C once. The saving is about 10-20% on typical sizes coming from the
! 52: C Karatsuba multiply code.
! 53: C
! 54: C Future:
! 55: C
! 56: C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
! 57: C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
! 58: C 256 bytes.
! 59:
! 60: ifdef(`PIC',`
! 61: deflit(UNROLL_THRESHOLD, 8)
! 62: ',`
! 63: deflit(UNROLL_THRESHOLD, 8)
! 64: ')
! 65:
! 66: defframe(PARAM_YSIZE,20)
! 67: defframe(PARAM_YP, 16)
! 68: defframe(PARAM_XSIZE,12)
! 69: defframe(PARAM_XP, 8)
! 70: defframe(PARAM_WP, 4)
! 71:
! 72: .text
! 73: ALIGN(32)
! 74: PROLOGUE(mpn_mul_basecase)
! 75: deflit(`FRAME',0)
! 76:
! 77: movl PARAM_XSIZE, %ecx
! 78: movl PARAM_YP, %eax
! 79:
! 80: movl PARAM_XP, %edx
! 81: movl (%eax), %eax C yp low limb
! 82:
! 83: cmpl $2, %ecx
! 84: ja L(xsize_more_than_two_limbs)
! 85: je L(two_by_something)
! 86:
! 87:
! 88: C one limb by one limb
! 89:
! 90: movl (%edx), %edx C xp low limb
! 91: movl PARAM_WP, %ecx
! 92:
! 93: mull %edx
! 94:
! 95: movl %eax, (%ecx)
! 96: movl %edx, 4(%ecx)
! 97: ret
! 98:
! 99:
! 100: C -----------------------------------------------------------------------------
! 101: L(two_by_something):
! 102: decl PARAM_YSIZE
! 103: pushl %ebx
! 104: deflit(`FRAME',4)
! 105:
! 106: movl PARAM_WP, %ebx
! 107: pushl %esi
! 108: deflit(`FRAME',8)
! 109:
! 110: movl %eax, %ecx C yp low limb
! 111: movl (%edx), %eax C xp low limb
! 112:
! 113: movl %edx, %esi C xp
! 114: jnz L(two_by_two)
! 115:
! 116:
! 117: C two limbs by one limb
! 118:
! 119: mull %ecx
! 120:
! 121: movl %eax, (%ebx)
! 122: movl 4(%esi), %eax
! 123:
! 124: movl %edx, %esi C carry
! 125:
! 126: mull %ecx
! 127:
! 128: addl %eax, %esi
! 129: movl %esi, 4(%ebx)
! 130:
! 131: adcl $0, %edx
! 132:
! 133: movl %edx, 8(%ebx)
! 134: popl %esi
! 135:
! 136: popl %ebx
! 137: ret
! 138:
! 139:
! 140:
! 141: C -----------------------------------------------------------------------------
! 142: ALIGN(16)
! 143: L(two_by_two):
! 144: C eax xp low limb
! 145: C ebx wp
! 146: C ecx yp low limb
! 147: C edx
! 148: C esi xp
! 149: C edi
! 150: C ebp
! 151: deflit(`FRAME',8)
! 152:
! 153: mull %ecx C xp[0] * yp[0]
! 154:
! 155: push %edi
! 156: deflit(`FRAME',12)
! 157: movl %eax, (%ebx)
! 158:
! 159: movl 4(%esi), %eax
! 160: movl %edx, %edi C carry, for wp[1]
! 161:
! 162: mull %ecx C xp[1] * yp[0]
! 163:
! 164: addl %eax, %edi
! 165: movl PARAM_YP, %ecx
! 166:
! 167: adcl $0, %edx
! 168:
! 169: movl %edi, 4(%ebx)
! 170: movl 4(%ecx), %ecx C yp[1]
! 171:
! 172: movl 4(%esi), %eax C xp[1]
! 173: movl %edx, %edi C carry, for wp[2]
! 174:
! 175: mull %ecx C xp[1] * yp[1]
! 176:
! 177: addl %eax, %edi
! 178:
! 179: adcl $0, %edx
! 180:
! 181: movl (%esi), %eax C xp[0]
! 182: movl %edx, %esi C carry, for wp[3]
! 183:
! 184: mull %ecx C xp[0] * yp[1]
! 185:
! 186: addl %eax, 4(%ebx)
! 187: adcl %edx, %edi
! 188: adcl $0, %esi
! 189:
! 190: movl %edi, 8(%ebx)
! 191: popl %edi
! 192:
! 193: movl %esi, 12(%ebx)
! 194: popl %esi
! 195:
! 196: popl %ebx
! 197: ret
! 198:
! 199:
! 200: C -----------------------------------------------------------------------------
! 201: ALIGN(16)
! 202: L(xsize_more_than_two_limbs):
! 203:
! 204: C The first limb of yp is processed with a simple mpn_mul_1 style loop
! 205: C inline. Unrolling this doesn't seem worthwhile since it's only run once
! 206: C (whereas the addmul below is run ysize-1 many times). A call to the
! 207: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
! 208: C popping, and doesn't seem likely to be worthwhile on the typical 10-20
! 209: C limb operations the Karatsuba code calls here with.
! 210:
! 211: C eax yp[0]
! 212: C ebx
! 213: C ecx xsize
! 214: C edx xp
! 215: C esi
! 216: C edi
! 217: C ebp
! 218: deflit(`FRAME',0)
! 219:
! 220: pushl %edi defframe_pushl(SAVE_EDI)
! 221: pushl %ebp defframe_pushl(SAVE_EBP)
! 222:
! 223: movl PARAM_WP, %edi
! 224: pushl %esi defframe_pushl(SAVE_ESI)
! 225:
! 226: movl %eax, %ebp
! 227: pushl %ebx defframe_pushl(SAVE_EBX)
! 228:
! 229: leal (%edx,%ecx,4), %ebx C xp end
! 230: xorl %esi, %esi
! 231:
! 232: leal (%edi,%ecx,4), %edi C wp end of mul1
! 233: negl %ecx
! 234:
! 235:
! 236: L(mul1):
! 237: C eax scratch
! 238: C ebx xp end
! 239: C ecx counter, negative
! 240: C edx scratch
! 241: C esi carry
! 242: C edi wp end of mul1
! 243: C ebp multiplier
! 244:
! 245: movl (%ebx,%ecx,4), %eax
! 246:
! 247: mull %ebp
! 248:
! 249: addl %esi, %eax
! 250: movl $0, %esi
! 251:
! 252: adcl %edx, %esi
! 253:
! 254: movl %eax, (%edi,%ecx,4)
! 255: incl %ecx
! 256:
! 257: jnz L(mul1)
! 258:
! 259:
! 260: movl PARAM_YSIZE, %edx
! 261: movl %esi, (%edi) C final carry
! 262:
! 263: movl PARAM_XSIZE, %ecx
! 264: decl %edx
! 265:
! 266: jnz L(ysize_more_than_one_limb)
! 267:
! 268: popl %ebx
! 269: popl %esi
! 270: popl %ebp
! 271: popl %edi
! 272: ret
! 273:
! 274:
! 275: L(ysize_more_than_one_limb):
! 276: cmpl $UNROLL_THRESHOLD, %ecx
! 277: movl PARAM_YP, %eax
! 278:
! 279: jae L(unroll)
! 280:
! 281:
! 282: C -----------------------------------------------------------------------------
! 283: C Simple addmul loop.
! 284: C
! 285: C Using ebx and edi pointing at the ends of their respective locations saves
! 286: C a couple of instructions in the outer loop. The inner loop is still 11
! 287: C cycles, the same as the simple loop in aorsmul_1.asm.
! 288:
! 289: C eax yp
! 290: C ebx xp end
! 291: C ecx xsize
! 292: C edx ysize-1
! 293: C esi
! 294: C edi wp end of mul1
! 295: C ebp
! 296:
! 297: movl 4(%eax), %ebp C multiplier
! 298: negl %ecx
! 299:
! 300: movl %ecx, PARAM_XSIZE C -xsize
! 301: xorl %esi, %esi C initial carry
! 302:
! 303: leal 4(%eax,%edx,4), %eax C yp end
! 304: negl %edx
! 305:
! 306: movl %eax, PARAM_YP
! 307: movl %edx, PARAM_YSIZE
! 308:
! 309: jmp L(simple_outer_entry)
! 310:
! 311:
! 312: C aligning here saves a couple of cycles
! 313: ALIGN(16)
! 314: L(simple_outer_top):
! 315: C edx ysize counter, negative
! 316:
! 317: movl PARAM_YP, %eax C yp end
! 318: xorl %esi, %esi C carry
! 319:
! 320: movl PARAM_XSIZE, %ecx C -xsize
! 321: movl %edx, PARAM_YSIZE
! 322:
! 323: movl (%eax,%edx,4), %ebp C yp limb multiplier
! 324: L(simple_outer_entry):
! 325: addl $4, %edi
! 326:
! 327:
! 328: L(simple_inner):
! 329: C eax scratch
! 330: C ebx xp end
! 331: C ecx counter, negative
! 332: C edx scratch
! 333: C esi carry
! 334: C edi wp end of this addmul
! 335: C ebp multiplier
! 336:
! 337: movl (%ebx,%ecx,4), %eax
! 338:
! 339: mull %ebp
! 340:
! 341: addl %esi, %eax
! 342: movl $0, %esi
! 343:
! 344: adcl $0, %edx
! 345: addl %eax, (%edi,%ecx,4)
! 346: adcl %edx, %esi
! 347:
! 348: incl %ecx
! 349: jnz L(simple_inner)
! 350:
! 351:
! 352: movl PARAM_YSIZE, %edx
! 353: movl %esi, (%edi)
! 354:
! 355: incl %edx
! 356: jnz L(simple_outer_top)
! 357:
! 358:
! 359: popl %ebx
! 360: popl %esi
! 361: popl %ebp
! 362: popl %edi
! 363: ret
! 364:
! 365:
! 366: C -----------------------------------------------------------------------------
! 367: C Unrolled loop.
! 368: C
! 369: C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
! 370: C some comments.
! 371: C
! 372: C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
! 373: C 0, inclusive.
! 374: C
! 375: C VAR_JMP is the computed jump into the unrolled loop.
! 376: C
! 377: C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
! 378: C is entered.
! 379: C
! 380: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
! 381: C start of the unrolled loop. This can't just be fetched through the xp
! 382: C pointer because of the offset applied to it.
! 383: C
! 384: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
! 385: C inclusive.
! 386: C
! 387: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
! 388: C added to give the location of the next limb of yp, which is the multiplier
! 389: C in the unrolled loop.
! 390: C
! 391: C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
! 392: C to give the starting point in the destination for each unrolled loop (this
! 393: C point is one limb upwards for each limb of yp processed).
! 394: C
! 395: C Having PARAM_YSIZE count negative to zero means it's not necessary to
! 396: C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
! 397: C the stack remain constant and on each loop an leal adjusts them with the
! 398: C PARAM_YSIZE counter value.
! 399:
! 400:
! 401: defframe(VAR_COUNTER, -20)
! 402: defframe(VAR_COUNTER_INIT, -24)
! 403: defframe(VAR_JMP, -28)
! 404: defframe(VAR_XP_LOW, -32)
! 405: deflit(VAR_STACK_SPACE, 16)
! 406:
! 407: dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
! 408: dnl slower in this code, hence the defframe empty-if-zero feature is
! 409: dnl disabled.
! 410: dnl
! 411: dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
! 412: dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
! 413: dnl anything isn't clear.
! 414: dnl
! 415: define(`defframe_empty_if_zero_disabled',1)
! 416:
! 417: L(unroll):
! 418: C eax yp (not used)
! 419: C ebx xp end (not used)
! 420: C ecx xsize
! 421: C edx ysize-1
! 422: C esi
! 423: C edi wp end of mul1 (not used)
! 424: C ebp
! 425: deflit(`FRAME', 16)
! 426:
! 427: leal -2(%ecx), %ebp C one limb processed at start,
! 428: decl %ecx C and ebp is one less
! 429:
! 430: shrl $UNROLL_LOG2, %ebp
! 431: negl %ecx
! 432:
! 433: subl $VAR_STACK_SPACE, %esp
! 434: deflit(`FRAME', 16+VAR_STACK_SPACE)
! 435: andl $UNROLL_MASK, %ecx
! 436:
! 437: movl %ecx, %esi
! 438: shll $4, %ecx
! 439:
! 440: movl %ebp, VAR_COUNTER_INIT
! 441: negl %esi
! 442:
! 443: C 15 code bytes per limb
! 444: ifdef(`PIC',`
! 445: call L(pic_calc)
! 446: L(unroll_here):
! 447: ',`
! 448: leal L(unroll_entry) (%ecx,%esi,1), %ecx
! 449: ')
! 450:
! 451: movl PARAM_XP, %ebx
! 452: movl %ebp, VAR_COUNTER
! 453:
! 454: movl PARAM_WP, %edi
! 455: movl %ecx, VAR_JMP
! 456:
! 457: movl (%ebx), %eax
! 458: leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
! 459:
! 460: leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
! 461:
! 462: movl %eax, VAR_XP_LOW
! 463:
! 464: movl %ebx, PARAM_XP
! 465: movl PARAM_YP, %ebx
! 466:
! 467: leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
! 468: movl 4(%ebx), %ebp C multiplier (yp second limb)
! 469:
! 470: leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
! 471:
! 472: movl %ecx, PARAM_WP
! 473:
! 474: leal 1(%esi), %ecx C adjust parity for decl %ecx above
! 475:
! 476: movl %ebx, PARAM_YP
! 477: negl %edx
! 478:
! 479: movl %edx, PARAM_YSIZE
! 480: jmp L(unroll_outer_entry)
! 481:
! 482:
! 483: ifdef(`PIC',`
! 484: L(pic_calc):
! 485: C See README.family about old gas bugs
! 486: leal (%ecx,%esi,1), %ecx
! 487: addl $L(unroll_entry)-L(unroll_here), %ecx
! 488: addl (%esp), %ecx
! 489: ret
! 490: ')
! 491:
! 492:
! 493: C -----------------------------------------------------------------------------
! 494: C Aligning here saves a couple of cycles per loop. Using 32 doesn't
! 495: C cost any extra space, since the inner unrolled loop below is
! 496: C aligned to 32.
! 497: ALIGN(32)
! 498: L(unroll_outer_top):
! 499: C edx ysize
! 500:
! 501: movl PARAM_YP, %eax
! 502: movl %edx, PARAM_YSIZE C incremented ysize counter
! 503:
! 504: movl PARAM_WP, %edi
! 505:
! 506: movl VAR_COUNTER_INIT, %ebx
! 507: movl (%eax,%edx,4), %ebp C next multiplier
! 508:
! 509: movl PARAM_XSIZE, %ecx
! 510: leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
! 511:
! 512: movl VAR_XP_LOW, %eax
! 513: movl %ebx, VAR_COUNTER
! 514:
! 515: L(unroll_outer_entry):
! 516: mull %ebp
! 517:
! 518: C using testb is a tiny bit faster than testl
! 519: testb $1, %cl
! 520:
! 521: movl %eax, %ecx C low carry
! 522: movl VAR_JMP, %eax
! 523:
! 524: movl %edx, %esi C high carry
! 525: movl PARAM_XP, %ebx
! 526:
! 527: jnz L(unroll_noswap)
! 528: movl %ecx, %esi C high,low carry other way around
! 529:
! 530: movl %edx, %ecx
! 531: L(unroll_noswap):
! 532:
! 533: jmp *%eax
! 534:
! 535:
! 536:
! 537: C -----------------------------------------------------------------------------
! 538: ALIGN(32)
! 539: L(unroll_top):
! 540: C eax scratch
! 541: C ebx xp
! 542: C ecx carry low
! 543: C edx scratch
! 544: C esi carry high
! 545: C edi wp
! 546: C ebp multiplier
! 547: C VAR_COUNTER loop counter
! 548: C
! 549: C 15 code bytes each limb
! 550:
! 551: leal UNROLL_BYTES(%edi), %edi
! 552:
! 553: L(unroll_entry):
! 554: deflit(CHUNK_COUNT,2)
! 555: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 556: deflit(`disp0', eval(i*CHUNK_COUNT*4))
! 557: deflit(`disp1', eval(disp0 + 4))
! 558: deflit(`disp2', eval(disp1 + 4))
! 559:
! 560: movl disp1(%ebx), %eax
! 561: mull %ebp
! 562: Zdisp( addl, %ecx, disp0,(%edi))
! 563: adcl %eax, %esi
! 564: movl %edx, %ecx
! 565: jadcl0( %ecx)
! 566:
! 567: movl disp2(%ebx), %eax
! 568: mull %ebp
! 569: addl %esi, disp1(%edi)
! 570: adcl %eax, %ecx
! 571: movl %edx, %esi
! 572: jadcl0( %esi)
! 573: ')
! 574:
! 575: decl VAR_COUNTER
! 576: leal UNROLL_BYTES(%ebx), %ebx
! 577:
! 578: jns L(unroll_top)
! 579:
! 580:
! 581: movl PARAM_YSIZE, %edx
! 582: addl %ecx, UNROLL_BYTES(%edi)
! 583:
! 584: adcl $0, %esi
! 585:
! 586: incl %edx
! 587: movl %esi, UNROLL_BYTES+4(%edi)
! 588:
! 589: jnz L(unroll_outer_top)
! 590:
! 591:
! 592: movl SAVE_ESI, %esi
! 593: movl SAVE_EBP, %ebp
! 594: movl SAVE_EDI, %edi
! 595: movl SAVE_EBX, %ebx
! 596:
! 597: addl $FRAME, %esp
! 598: ret
! 599:
! 600: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>