Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_basecase.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
! 2: dnl
! 3: dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
! 4: dnl limbs/loop unrolling).
! 5:
! 6:
! 7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 8: dnl
! 9: dnl This file is part of the GNU MP Library.
! 10: dnl
! 11: dnl The GNU MP Library is free software; you can redistribute it and/or
! 12: dnl modify it under the terms of the GNU Lesser General Public License as
! 13: dnl published by the Free Software Foundation; either version 2.1 of the
! 14: dnl License, or (at your option) any later version.
! 15: dnl
! 16: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 19: dnl Lesser General Public License for more details.
! 20: dnl
! 21: dnl You should have received a copy of the GNU Lesser General Public
! 22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 24: dnl Suite 330, Boston, MA 02111-1307, USA.
! 25:
! 26:
! 27: include(`../config.m4')
! 28:
! 29:
! 30: dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
! 31: dnl 8 4.67
! 32: dnl 16 4.59
! 33: dnl 32 4.42
! 34: dnl Maximum possible with the current code is 32.
! 35: dnl
! 36: dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
! 37: dnl done with a straight run through a block of code, no inner loop. Using
! 38: dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
! 39:
! 40: deflit(UNROLL_COUNT, 32) C limbs handled per pass of the unrolled inner loop; 32 is the stated maximum
! 41:
! 42:
! 43: C void mpn_mul_basecase (mp_ptr wp,
! 44: C mp_srcptr xp, mp_size_t xsize,
! 45: C mp_srcptr yp, mp_size_t ysize);
! 46: C
! 47: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
! 48: C wp,xsize+ysize.
! 49: C
! 50: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
! 51: C it's faster because it does most of the mpn_addmul_1() startup
! 52: C calculations only once. The saving is 15-25% on typical sizes coming from
! 53: C the Karatsuba multiply code.
! 54: C UNROLL_THRESHOLD is the xsize at or above which L(unroll) is used rather than the simple addmul loop. PIC and non-PIC currently get the same value.
! 55: ifdef(`PIC',`
! 56: deflit(UNROLL_THRESHOLD, 5)
! 57: ',`
! 58: deflit(UNROLL_THRESHOLD, 5)
! 59: ')
! 60: C Incoming stack arguments, in the order of the prototype above (wp at the lowest offset, ysize at the highest).
! 61: defframe(PARAM_YSIZE,20)
! 62: defframe(PARAM_YP, 16)
! 63: defframe(PARAM_XSIZE,12)
! 64: defframe(PARAM_XP, 8)
! 65: defframe(PARAM_WP, 4)
! 66:
! 67: .text
! 68: ALIGN(32)
! 69: PROLOGUE(mpn_mul_basecase)
! 70: deflit(`FRAME',0)
! 71: C Dispatch on xsize: more than two limbs, exactly two, or (fall through) fewer than two, handled as one.
! 72: movl PARAM_XSIZE, %ecx
! 73: movl PARAM_YP, %eax
! 74:
! 75: movl PARAM_XP, %edx
! 76: movl (%eax), %eax C yp low limb
! 77:
! 78: cmpl $2, %ecx C one compare serves both branches below
! 79: ja L(xsize_more_than_two) C unsigned xsize > 2
! 80: je L(two_by_something) C xsize == 2
! 81:
! 82:
! 83: C one limb by one limb (ysize is not examined on this path; presumably callers guarantee ysize <= xsize)
! 84:
! 85: mull (%edx) C edx:eax = yp[0] * xp[0]
! 86:
! 87: movl PARAM_WP, %ecx
! 88: movl %eax, (%ecx) C wp[0], low half of the product
! 89: movl %edx, 4(%ecx) C wp[1], high half of the product
! 90: ret C nothing was pushed on this path, so nothing to restore
! 91:
! 92:
! 93: C -----------------------------------------------------------------------------
! 94: L(two_by_something):
! 95: deflit(`FRAME',0)
! 96: decl PARAM_YSIZE C ysize-1 written back to the argument slot; ZF set when ysize == 1
! 97: pushl %ebx defframe_pushl(`SAVE_EBX')
! 98: movl %eax, %ecx C yp low limb
! 99:
! 100: movl PARAM_WP, %ebx
! 101: pushl %esi defframe_pushl(`SAVE_ESI')
! 102: movl %edx, %esi C xp
! 103:
! 104: movl (%edx), %eax C xp low limb
! 105: jnz L(two_by_two) C ZF still from the decl above; pushl and movl do not alter flags
! 106:
! 107:
! 108: C two limbs by one limb
! 109:
! 110: mull %ecx C xp[0] * yp[0]
! 111:
! 112: movl %eax, (%ebx) C wp[0]
! 113: movl 4(%esi), %eax C xp[1]
! 114: movl %edx, %esi C carry
! 115:
! 116: mull %ecx C xp[1] * yp[0]
! 117:
! 118: addl %eax, %esi C low limb of second product plus carry from the first
! 119:
! 120: movl %esi, 4(%ebx) C wp[1]
! 121: movl SAVE_ESI, %esi
! 122:
! 123: adcl $0, %edx C picks up CF from the addl; the movls in between leave flags alone
! 124:
! 125: movl %edx, 8(%ebx) C wp[2]
! 126: movl SAVE_EBX, %ebx
! 127: addl $FRAME, %esp C discard the two save slots (registers were reloaded with movl, not popped)
! 128:
! 129: ret
! 130:
! 131:
! 132:
! 133: C -----------------------------------------------------------------------------
! 134: C Could load yp earlier into another register.
! 135:
! 136: ALIGN(16)
! 137: L(two_by_two):
! 138: C eax xp low limb
! 139: C ebx wp
! 140: C ecx yp low limb
! 141: C edx
! 142: C esi xp
! 143: C edi
! 144: C ebp
! 145:
! 146: dnl FRAME carries on from previous
! 147:
! 148: mull %ecx C xp[0] * yp[0]
! 149:
! 150: push %edi defframe_pushl(`SAVE_EDI')
! 151: movl %edx, %edi C carry, for wp[1]
! 152:
! 153: movl %eax, (%ebx) C wp[0]
! 154: movl 4(%esi), %eax C xp[1]
! 155:
! 156: mull %ecx C xp[1] * yp[0]
! 157:
! 158: addl %eax, %edi
! 159: movl PARAM_YP, %ecx C yp[0] is finished with, ecx is reused for yp
! 160:
! 161: adcl $0, %edx C CF from the addl; the movl in between leaves flags alone
! 162: movl 4(%ecx), %ecx C yp[1]
! 163: movl %edi, 4(%ebx) C wp[1] so far; xp[0]*yp[1] is added to it below
! 164:
! 165: movl 4(%esi), %eax C xp[1]
! 166: movl %edx, %edi C carry, for wp[2]
! 167:
! 168: mull %ecx C xp[1] * yp[1]
! 169:
! 170: addl %eax, %edi
! 171:
! 172: adcl $0, %edx
! 173: movl (%esi), %eax C xp[0]
! 174:
! 175: movl %edx, %esi C carry, for wp[3]
! 176:
! 177: mull %ecx C xp[0] * yp[1]
! 178:
! 179: addl %eax, 4(%ebx) C wp[1] final
! 180: adcl %edx, %edi
! 181: movl %edi, 8(%ebx) C wp[2]
! 182:
! 183: adcl $0, %esi C CF from the adcl above; the movl in between leaves flags alone
! 184: movl SAVE_EDI, %edi
! 185: movl %esi, 12(%ebx) C wp[3]
! 186:
! 187: movl SAVE_ESI, %esi
! 188: movl SAVE_EBX, %ebx
! 189: addl $FRAME, %esp C discard the three save slots
! 190:
! 191: ret
! 192:
! 193:
! 194: C -----------------------------------------------------------------------------
! 195: ALIGN(16)
! 196: L(xsize_more_than_two):
! 197:
! 198: C The first limb of yp is processed with a simple mpn_mul_1 style loop
! 199: C inline. Unrolling this doesn't seem worthwhile since it's only run once
! 200: C (whereas the addmul below is run ysize-1 many times). A call to the
! 201: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
! 202: C popping, and doesn't seem likely to be worthwhile on the typical 13-26
! 203: C limb operations the Karatsuba code calls here with.
! 204:
! 205: C eax yp[0]
! 206: C ebx
! 207: C ecx xsize
! 208: C edx xp
! 209: C esi
! 210: C edi
! 211: C ebp
! 212:
! 213: dnl FRAME doesn't carry on from previous, no pushes yet here
! 214: defframe(`SAVE_EBX',-4)
! 215: defframe(`SAVE_ESI',-8)
! 216: defframe(`SAVE_EDI',-12)
! 217: defframe(`SAVE_EBP',-16)
! 218: deflit(`FRAME',0)
! 219:
! 220: subl $16, %esp C room for the four register save slots defined above
! 221: deflit(`FRAME',16)
! 222:
! 223: movl %edi, SAVE_EDI
! 224: movl PARAM_WP, %edi
! 225:
! 226: movl %ebx, SAVE_EBX
! 227: movl %ebp, SAVE_EBP
! 228: movl %eax, %ebp C multiplier, yp[0]
! 229:
! 230: movl %esi, SAVE_ESI
! 231: xorl %ebx, %ebx C initial carry
! 232: leal (%edx,%ecx,4), %esi C xp end
! 233:
! 234: leal (%edi,%ecx,4), %edi C wp end of mul1
! 235: negl %ecx C -xsize, counted up to zero by the loop
! 236:
! 237:
! 238: L(mul1):
! 239: C eax scratch
! 240: C ebx carry
! 241: C ecx counter, negative
! 242: C edx scratch
! 243: C esi xp end
! 244: C edi wp end of mul1
! 245: C ebp multiplier
! 246:
! 247: movl (%esi,%ecx,4), %eax C next xp limb
! 248:
! 249: mull %ebp
! 250:
! 251: addl %ebx, %eax C low limb of product plus incoming carry
! 252: movl %eax, (%edi,%ecx,4)
! 253: movl $0, %ebx C movl rather than xorl, so CF from the addl survives for the adcl
! 254:
! 255: adcl %edx, %ebx C new carry = high limb of product + CF
! 256: incl %ecx
! 257: jnz L(mul1) C ZF from the incl
! 258:
! 259:
! 260: movl PARAM_YSIZE, %edx
! 261: movl PARAM_XSIZE, %ecx
! 262:
! 263: movl %ebx, (%edi) C final carry
! 264: decl %edx C ysize-1; ZF set when yp has only one limb
! 265:
! 266: jnz L(ysize_more_than_one)
! 267:
! 268: C ysize == 1: the mul1 pass above was the whole product
! 269: movl SAVE_EDI, %edi
! 270: movl SAVE_EBX, %ebx
! 271:
! 272: movl SAVE_EBP, %ebp
! 273: movl SAVE_ESI, %esi
! 274: addl $FRAME, %esp
! 275:
! 276: ret
! 277:
! 278:
! 279: L(ysize_more_than_one):
! 280: cmpl $UNROLL_THRESHOLD, %ecx
! 281: movl PARAM_YP, %eax C movl does not disturb the flags from the cmpl
! 282:
! 283: jae L(unroll) C unsigned xsize >= UNROLL_THRESHOLD
! 284:
! 285:
! 286: C -----------------------------------------------------------------------------
! 287: C simple addmul looping
! 288: C
! 289: C eax yp
! 290: C ebx
! 291: C ecx xsize
! 292: C edx ysize-1
! 293: C esi xp end
! 294: C edi wp end of mul1
! 295: C ebp
! 296:
! 297: leal 4(%eax,%edx,4), %ebp C yp end
! 298: negl %ecx C -xsize
! 299: negl %edx C -(ysize-1)
! 300:
! 301: movl (%esi,%ecx,4), %eax C xp low limb
! 302: movl %edx, PARAM_YSIZE C -(ysize-1)
! 303: incl %ecx
! 304:
! 305: xorl %ebx, %ebx C initial carry
! 306: movl %ecx, PARAM_XSIZE C -(xsize-1)
! 307: movl %ebp, PARAM_YP C argument slot now holds yp end, indexed by the negative counter
! 308:
! 309: movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
! 310: jmp L(simple_outer_entry)
! 311:
! 312:
! 313: C this is offset 0x121 so close enough to aligned
! 314: L(simple_outer_top):
! 315: C ebp ysize counter, negative
! 316:
! 317: movl PARAM_YP, %edx
! 318: movl PARAM_XSIZE, %ecx C -(xsize-1)
! 319: xorl %ebx, %ebx C carry
! 320:
! 321: movl %ebp, PARAM_YSIZE
! 322: addl $4, %edi C next position in wp
! 323:
! 324: movl (%edx,%ebp,4), %ebp C yp limb - multiplier
! 325: movl -4(%esi,%ecx,4), %eax C xp low limb
! 326:
! 327:
! 328: L(simple_outer_entry):
! 329:
! 330: L(simple_inner):
! 331: C eax xp limb
! 332: C ebx carry limb
! 333: C ecx loop counter (negative)
! 334: C edx scratch
! 335: C esi xp end
! 336: C edi wp end
! 337: C ebp multiplier
! 338:
! 339: mull %ebp
! 340:
! 341: addl %eax, %ebx C low limb of product plus carry limb
! 342: adcl $0, %edx
! 343:
! 344: addl %ebx, (%edi,%ecx,4) C accumulate into wp
! 345: movl (%esi,%ecx,4), %eax C fetch next xp limb; movl keeps CF for the adcl below
! 346: adcl $0, %edx
! 347:
! 348: incl %ecx
! 349: movl %edx, %ebx C new carry limb
! 350: jnz L(simple_inner) C ZF from the incl; the movl in between leaves flags alone
! 351:
! 352: C last xp limb of this row, fetched by the final loop iteration
! 353: mull %ebp
! 354:
! 355: movl PARAM_YSIZE, %ebp
! 356: addl %eax, %ebx
! 357:
! 358: adcl $0, %edx
! 359: addl %ebx, (%edi)
! 360:
! 361: adcl $0, %edx
! 362: incl %ebp C step the negative ysize counter towards zero
! 363:
! 364: movl %edx, 4(%edi) C top limb of this row is stored, not added
! 365: jnz L(simple_outer_top) C ZF from incl %ebp; the movl in between leaves flags alone
! 366:
! 367:
! 368: movl SAVE_EBX, %ebx
! 369: movl SAVE_ESI, %esi
! 370:
! 371: movl SAVE_EDI, %edi
! 372: movl SAVE_EBP, %ebp
! 373: addl $FRAME, %esp
! 374:
! 375: ret
! 376:
! 377:
! 378:
! 379: C -----------------------------------------------------------------------------
! 380: C
! 381: C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
! 382: C comments.
! 383: C
! 384: C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
! 385: C increment xp and wp. This is used to adjust back xp and wp, and rshifted
! 386: C to give an initial VAR_COUNTER at the top of the outer loop.
! 387: C
! 388: C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
! 389: C up to -1, inclusive.
! 390: C
! 391: C VAR_JMP is the computed jump into the unrolled loop.
! 392: C
! 393: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
! 394: C start of the unrolled loop.
! 395: C
! 396: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
! 397: C inclusive.
! 398: C
! 399: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
! 400: C added to give the location of the next limb of yp, which is the multiplier
! 401: C in the unrolled loop.
! 402: C
! 403: C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
! 404: C outer loop to take care of xp, wp and the inner loop counter.
! 405:
! 406: defframe(VAR_COUNTER, -20)
! 407: defframe(VAR_ADJUST, -24)
! 408: defframe(VAR_JMP, -28)
! 409: defframe(VAR_XP_LOW, -32)
! 410: deflit(VAR_EXTRA_SPACE, 16) C bytes for the four VAR_ slots above, below the 16 bytes of register saves
! 411:
! 412:
! 413: L(unroll):
! 414: C eax yp
! 415: C ebx
! 416: C ecx xsize
! 417: C edx ysize-1
! 418: C esi xp end
! 419: C edi wp end of mul1
! 420: C ebp
! 421:
! 422: movl PARAM_XP, %esi C back to the start of xp
! 423: movl 4(%eax), %ebp C multiplier (yp second limb)
! 424: leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
! 425:
! 426: movl PARAM_WP, %edi C back to the start of wp
! 427: movl %eax, PARAM_YP
! 428: negl %edx
! 429:
! 430: movl %edx, PARAM_YSIZE C -(ysize-1), the outer loop counter
! 431: leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
! 432: decl %ecx C xsize-1
! 433:
! 434: movl (%esi), %eax C xp low limb
! 435: andl $-UNROLL_MASK-1, %ebx C presumably rounds down to a multiple of UNROLL_COUNT (UNROLL_MASK is defined outside this file)
! 436: negl %ecx
! 437:
! 438: subl $VAR_EXTRA_SPACE, %esp
! 439: deflit(`FRAME',16+VAR_EXTRA_SPACE)
! 440: negl %ebx
! 441: andl $UNROLL_MASK, %ecx
! 442:
! 443: movl %ebx, VAR_ADJUST
! 444: movl %ecx, %edx
! 445: shll $4, %ecx C with the edx copy added below this makes 17 times the limb index
! 446:
! 447: sarl $UNROLL_LOG2, %ebx C stored as VAR_COUNTER at L(unroll_outer_entry)
! 448:
! 449: C 17 code bytes per limb
! 450: ifdef(`PIC',`
! 451: call L(pic_calc)
! 452: L(unroll_here):
! 453: ',`
! 454: leal L(unroll_entry) (%ecx,%edx,1), %ecx
! 455: ')
! 456: negl %edx
! 457:
! 458: movl %eax, VAR_XP_LOW
! 459: movl %ecx, VAR_JMP
! 460: leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
! 461: leal 4(%esi,%edx,4), %esi C and start at second limb
! 462: jmp L(unroll_outer_entry)
! 463:
! 464:
! 465: ifdef(`PIC',`
! 466: L(pic_calc):
! 467: C See README.family about old gas bugs
! 468: leal (%ecx,%edx,1), %ecx
! 469: addl $L(unroll_entry)-L(unroll_here), %ecx
! 470: addl (%esp), %ecx
! 471: ret
! 472: ')
! 473:
! 474:
! 475: C --------------------------------------------------------------------------
! 476: ALIGN(32)
! 477: L(unroll_outer_top):
! 478: C ebp ysize counter, negative
! 479:
! 480: movl VAR_ADJUST, %ebx
! 481: movl PARAM_YP, %edx
! 482:
! 483: movl VAR_XP_LOW, %eax
! 484: movl %ebp, PARAM_YSIZE C store incremented ysize counter
! 485:
! 486: leal 4(%edi,%ebx,4), %edi C wp adjusted back by VAR_ADJUST limbs, then on by one limb for the next row
! 487: leal (%esi,%ebx,4), %esi C xp adjusted back by VAR_ADJUST limbs
! 488: sarl $UNROLL_LOG2, %ebx C becomes VAR_COUNTER below
! 489:
! 490: movl (%edx,%ebp,4), %ebp C yp next multiplier
! 491: movl VAR_JMP, %ecx
! 492:
! 493: L(unroll_outer_entry):
! 494: mull %ebp C xp low limb times the multiplier
! 495:
! 496: testb $1, %cl C and clear carry bit
! 497: movl %ebx, VAR_COUNTER
! 498: movl $0, %ebx C movl rather than xorl, so the flags from testb reach the cmovs
! 499:
! 500: movl $0, %ecx
! 501: cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
! 502: cmovnz( %eax, %ebx)
! 503:
! 504: C Extra fetch of VAR_JMP is bad, but registers are tight
! 505: jmp *VAR_JMP
! 506:
! 507:
! 508: C -----------------------------------------------------------------------------
! 509: ALIGN(32)
! 510: L(unroll_top):
! 511: C eax xp limb
! 512: C ebx carry high
! 513: C ecx carry low
! 514: C edx scratch
! 515: C esi xp+8
! 516: C edi wp
! 517: C ebp yp multiplier limb
! 518: C
! 519: C VAR_COUNTER loop counter, negative
! 520: C
! 521: C 17 bytes each limb
! 522:
! 523: L(unroll_entry):
! 524:
! 525: deflit(CHUNK_COUNT,2)
! 526: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 527: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
! 528: deflit(`disp1', eval(disp0 + 4))
! 529:
! 530: Zdisp( movl, disp0,(%esi), %eax)
! 531: adcl %edx, %ebx
! 532:
! 533: mull %ebp
! 534:
! 535: Zdisp( addl, %ecx, disp0,(%edi))
! 536: movl $0, %ecx
! 537:
! 538: adcl %eax, %ebx
! 539:
! 540:
! 541: movl disp1(%esi), %eax
! 542: adcl %edx, %ecx
! 543:
! 544: mull %ebp
! 545:
! 546: addl %ebx, disp1(%edi)
! 547: movl $0, %ebx
! 548:
! 549: adcl %eax, %ecx
! 550: ')
! 551:
! 552:
! 553: incl VAR_COUNTER C incl leaves CF untouched, so the pending carry survives
! 554: leal UNROLL_BYTES(%esi), %esi C leal changes no flags
! 555: leal UNROLL_BYTES(%edi), %edi
! 556:
! 557: jnz L(unroll_top) C ZF from the incl VAR_COUNTER above
! 558:
! 559:
! 560: C eax
! 561: C ebx zero
! 562: C ecx low
! 563: C edx high
! 564: C esi
! 565: C edi wp, pointing at second last limb
! 566: C ebp
! 567: C
! 568: C carry flag to be added to high
! 569:
! 570: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
! 571: deflit(`disp1', eval(disp0-0 + 4))
! 572:
! 573: movl PARAM_YSIZE, %ebp C outer loop counter, negative
! 574: adcl $0, %edx C fold the pending carry flag into high
! 575: addl %ecx, disp0(%edi)
! 576:
! 577: adcl $0, %edx
! 578: incl %ebp
! 579:
! 580: movl %edx, disp1(%edi) C top limb of this row is stored, not added
! 581: jnz L(unroll_outer_top) C ZF from incl %ebp; the movl in between leaves flags alone
! 582:
! 583:
! 584: movl SAVE_ESI, %esi
! 585: movl SAVE_EBP, %ebp
! 586:
! 587: movl SAVE_EDI, %edi
! 588: movl SAVE_EBX, %ebx
! 589: addl $FRAME, %esp C releases both the register save slots and the VAR_ slots
! 590:
! 591: ret
! 592:
! 593: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>