Annotation of OpenXM_contrib/gmp/mpn/x86/k7/sqr_basecase.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
! 2: dnl
! 3: dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
! 4: dnl (measured on the speed difference between 25 and 50 limbs, which is
! 5: dnl roughly the Karatsuba recursing range).
! 6:
! 7:
! 8: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 9: dnl
! 10: dnl This file is part of the GNU MP Library.
! 11: dnl
! 12: dnl The GNU MP Library is free software; you can redistribute it and/or
! 13: dnl modify it under the terms of the GNU Lesser General Public License as
! 14: dnl published by the Free Software Foundation; either version 2.1 of the
! 15: dnl License, or (at your option) any later version.
! 16: dnl
! 17: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 18: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 19: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 20: dnl Lesser General Public License for more details.
! 21: dnl
! 22: dnl You should have received a copy of the GNU Lesser General Public
! 23: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 24: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 25: dnl Suite 330, Boston, MA 02111-1307, USA.
! 26:
! 27:
! 28: include(`../config.m4')
! 29:
! 30:
! 31: dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
! 32: dnl some comments.
! 33:
! 34: deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
! 35:
! 36: ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
! 37: `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
! 38:
! 39: m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
! 40: deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
! 41:
! 42:
! 43: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 44: C
! 45: C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
! 46: C which is quite a bit, but is considered good value since squares big
! 47: C enough to use most of the code will be spending quite a few cycles in it.
! 48:
! 49:
! 50: defframe(PARAM_SIZE,12)
! 51: defframe(PARAM_SRC, 8)
! 52: defframe(PARAM_DST, 4)
! 53:
! 54: .text
! 55: ALIGN(32)
! 56: PROLOGUE(mpn_sqr_basecase)
! 57: deflit(`FRAME',0)
! 58:
! 59: movl PARAM_SIZE, %ecx
! 60: movl PARAM_SRC, %eax
! 61: cmpl $2, %ecx
! 62:
! 63: movl PARAM_DST, %edx
! 64: je L(two_limbs)
! 65: ja L(three_or_more)
! 66:
! 67:
! 68: C------------------------------------------------------------------------------
! 69: C one limb only
! 70: C eax src
! 71: C ecx size
! 72: C edx dst
! 73:
! 74: movl (%eax), %eax
! 75: movl %edx, %ecx
! 76:
! 77: mull %eax
! 78:
! 79: movl %edx, 4(%ecx)
! 80: movl %eax, (%ecx)
! 81: ret
! 82:
! 83:
! 84: C------------------------------------------------------------------------------
! 85: C
! 86: C Using the read/modify/write "add"s seems to be faster than saving and
! 87: C restoring registers. Perhaps the loads for the first set hide under the
! 88: C mul latency and the second gets store to load forwarding.
! 89:
! 90: ALIGN(16)
! 91: L(two_limbs):
! 92: C eax src
! 93: C ebx
! 94: C ecx size
! 95: C edx dst
! 96: deflit(`FRAME',0)
! 97:
! 98: pushl %ebx FRAME_pushl()
! 99: movl %eax, %ebx C src
! 100: movl (%eax), %eax
! 101:
! 102: movl %edx, %ecx C dst
! 103:
! 104: mull %eax C src[0]^2
! 105:
! 106: movl %eax, (%ecx) C dst[0]
! 107: movl 4(%ebx), %eax
! 108:
! 109: movl %edx, 4(%ecx) C dst[1]
! 110:
! 111: mull %eax C src[1]^2
! 112:
! 113: movl %eax, 8(%ecx) C dst[2]
! 114: movl (%ebx), %eax
! 115:
! 116: movl %edx, 12(%ecx) C dst[3]
! 117:
! 118: mull 4(%ebx) C src[0]*src[1]
! 119:
! 120: popl %ebx
! 121:
! 122: addl %eax, 4(%ecx)
! 123: adcl %edx, 8(%ecx)
! 124: adcl $0, 12(%ecx)
! 125: ASSERT(nc)
! 126:
! 127: addl %eax, 4(%ecx)
! 128: adcl %edx, 8(%ecx)
! 129: adcl $0, 12(%ecx)
! 130: ASSERT(nc)
! 131:
! 132: ret
! 133:
! 134:
! 135: C------------------------------------------------------------------------------
! 136: defframe(SAVE_EBX, -4)
! 137: defframe(SAVE_ESI, -8)
! 138: defframe(SAVE_EDI, -12)
! 139: defframe(SAVE_EBP, -16)
! 140: deflit(STACK_SPACE, 16)
! 141:
! 142: L(three_or_more):
! 143: subl $STACK_SPACE, %esp
! 144: cmpl $4, %ecx
! 145: jae L(four_or_more)
! 146: deflit(`FRAME',STACK_SPACE)
! 147:
! 148:
! 149: C------------------------------------------------------------------------------
! 150: C Three limbs
! 151: C
! 152: C Writing out the loads and stores separately at the end of this code comes
! 153: C out about 10 cycles faster than using adcls to memory.
! 154:
! 155: C eax src
! 156: C ecx size
! 157: C edx dst
! 158:
! 159: movl %ebx, SAVE_EBX
! 160: movl %eax, %ebx C src
! 161: movl (%eax), %eax
! 162:
! 163: movl %edx, %ecx C dst
! 164: movl %esi, SAVE_ESI
! 165: movl %edi, SAVE_EDI
! 166:
! 167: mull %eax C src[0] ^ 2
! 168:
! 169: movl %eax, (%ecx)
! 170: movl 4(%ebx), %eax
! 171: movl %edx, 4(%ecx)
! 172:
! 173: mull %eax C src[1] ^ 2
! 174:
! 175: movl %eax, 8(%ecx)
! 176: movl 8(%ebx), %eax
! 177: movl %edx, 12(%ecx)
! 178:
! 179: mull %eax C src[2] ^ 2
! 180:
! 181: movl %eax, 16(%ecx)
! 182: movl (%ebx), %eax
! 183: movl %edx, 20(%ecx)
! 184:
! 185: mull 4(%ebx) C src[0] * src[1]
! 186:
! 187: movl %eax, %esi
! 188: movl (%ebx), %eax
! 189: movl %edx, %edi
! 190:
! 191: mull 8(%ebx) C src[0] * src[2]
! 192:
! 193: addl %eax, %edi
! 194: movl %ebp, SAVE_EBP
! 195: movl $0, %ebp
! 196:
! 197: movl 4(%ebx), %eax
! 198: adcl %edx, %ebp
! 199:
! 200: mull 8(%ebx) C src[1] * src[2]
! 201:
! 202: xorl %ebx, %ebx
! 203: addl %eax, %ebp
! 204:
! 205: adcl $0, %edx
! 206:
! 207: C eax
! 208: C ebx zero, will be dst[5]
! 209: C ecx dst
! 210: C edx dst[4]
! 211: C esi dst[1]
! 212: C edi dst[2]
! 213: C ebp dst[3]
! 214:
! 215: adcl $0, %edx
! 216: addl %esi, %esi
! 217:
! 218: adcl %edi, %edi
! 219: movl 4(%ecx), %eax
! 220:
! 221: adcl %ebp, %ebp
! 222:
! 223: adcl %edx, %edx
! 224:
! 225: adcl $0, %ebx
! 226: addl %eax, %esi
! 227: movl 8(%ecx), %eax
! 228:
! 229: adcl %eax, %edi
! 230: movl 12(%ecx), %eax
! 231: movl %esi, 4(%ecx)
! 232:
! 233: adcl %eax, %ebp
! 234: movl 16(%ecx), %eax
! 235: movl %edi, 8(%ecx)
! 236:
! 237: movl SAVE_ESI, %esi
! 238: movl SAVE_EDI, %edi
! 239:
! 240: adcl %eax, %edx
! 241: movl 20(%ecx), %eax
! 242: movl %ebp, 12(%ecx)
! 243:
! 244: adcl %ebx, %eax
! 245: ASSERT(nc)
! 246: movl SAVE_EBX, %ebx
! 247: movl SAVE_EBP, %ebp
! 248:
! 249: movl %edx, 16(%ecx)
! 250: movl %eax, 20(%ecx)
! 251: addl $FRAME, %esp
! 252:
! 253: ret
! 254:
! 255:
! 256: C------------------------------------------------------------------------------
! 257: L(four_or_more):
! 258:
! 259: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
! 260: C Further products are added in rather than stored.
! 261:
! 262: C eax src
! 263: C ebx
! 264: C ecx size
! 265: C edx dst
! 266: C esi
! 267: C edi
! 268: C ebp
! 269:
! 270: defframe(`VAR_COUNTER',-20)
! 271: defframe(`VAR_JMP', -24)
! 272: deflit(EXTRA_STACK_SPACE, 8)
! 273:
! 274: movl %ebx, SAVE_EBX
! 275: movl %edi, SAVE_EDI
! 276: leal (%edx,%ecx,4), %edi C &dst[size]
! 277:
! 278: movl %esi, SAVE_ESI
! 279: movl %ebp, SAVE_EBP
! 280: leal (%eax,%ecx,4), %esi C &src[size]
! 281:
! 282: movl (%eax), %ebp C multiplier
! 283: movl $0, %ebx
! 284: decl %ecx
! 285:
! 286: negl %ecx
! 287: subl $EXTRA_STACK_SPACE, %esp
! 288: FRAME_subl_esp(EXTRA_STACK_SPACE)
! 289:
! 290: L(mul_1):
! 291: C eax scratch
! 292: C ebx carry
! 293: C ecx counter
! 294: C edx scratch
! 295: C esi &src[size]
! 296: C edi &dst[size]
! 297: C ebp multiplier
! 298:
! 299: movl (%esi,%ecx,4), %eax
! 300:
! 301: mull %ebp
! 302:
! 303: addl %ebx, %eax
! 304: movl %eax, (%edi,%ecx,4)
! 305: movl $0, %ebx
! 306:
! 307: adcl %edx, %ebx
! 308: incl %ecx
! 309: jnz L(mul_1)
! 310:
! 311:
! 312: C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
! 313: C
! 314: C The last two products, which are the bottom right corner of the product
! 315: C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
! 316: C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
! 317: C cases that need to be done.
! 318: C
! 319: C The unrolled code is the same as in mpn_addmul_1, see that routine for
! 320: C some comments.
! 321: C
! 322: C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
! 323: C
! 324: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
! 325: C chunk each outer loop.
! 326: C
! 327: C K7 does branch prediction on indirect jumps, which is bad since it's a
! 328: C different target each time. There seems no way to avoid this.
! 329:
! 330: dnl This value also hard coded in some shifts and adds
! 331: deflit(CODE_BYTES_PER_LIMB, 17)
! 332:
! 333: dnl With the unmodified &src[size] and &dst[size] pointers, the
! 334: dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
! 335: dnl values up to 31, but above that an offset must be added to them.
! 336:
! 337: deflit(OFFSET,
! 338: ifelse(eval(UNROLL_COUNT>31),1,
! 339: eval((UNROLL_COUNT-31)*4),
! 340: 0))
! 341:
! 342: dnl Because the last chunk of code is generated differently, a label placed
! 343: dnl at the end doesn't work. Instead calculate the implied end using the
! 344: dnl start and how many chunks of code there are.
! 345:
! 346: deflit(UNROLL_INNER_END,
! 347: `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
! 348:
! 349: C eax
! 350: C ebx carry
! 351: C ecx
! 352: C edx
! 353: C esi &src[size]
! 354: C edi &dst[size]
! 355: C ebp
! 356:
! 357: movl PARAM_SIZE, %ecx
! 358: movl %ebx, (%edi)
! 359:
! 360: subl $4, %ecx
! 361: jz L(corner)
! 362:
! 363: negl %ecx
! 364: ifelse(OFFSET,0,,`subl $OFFSET, %edi')
! 365: ifelse(OFFSET,0,,`subl $OFFSET, %esi')
! 366:
! 367: movl %ecx, %edx
! 368: shll $4, %ecx
! 369:
! 370: ifdef(`PIC',`
! 371: call L(pic_calc)
! 372: L(here):
! 373: ',`
! 374: leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
! 375: ')
! 376:
! 377:
! 378: C The calculated jump mustn't come out to before the start of the
! 379: C code available. This is the limit UNROLL_COUNT puts on the src
! 380: C operand size, but checked here directly using the jump address.
! 381: ASSERT(ae,
! 382: `movl_text_address(L(unroll_inner_start), %eax)
! 383: cmpl %eax, %ecx')
! 384:
! 385:
! 386: C------------------------------------------------------------------------------
! 387: ALIGN(16)
! 388: L(unroll_outer_top):
! 389: C eax
! 390: C ebx high limb to store
! 391: C ecx VAR_JMP
! 392: C edx VAR_COUNTER, limbs, negative
! 393: C esi &src[size], constant
! 394: C edi dst ptr, high of last addmul
! 395: C ebp
! 396:
! 397: movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
! 398: movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
! 399:
! 400: movl %edx, VAR_COUNTER
! 401:
! 402: mull %ebp
! 403:
! 404: define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
! 405:
! 406: testb $1, %cl
! 407: movl %edx, %ebx C high carry
! 408: movl %ecx, %edx C jump
! 409:
! 410: movl %eax, %ecx C low carry
! 411: cmovX( %ebx, %ecx) C high carry reverse
! 412: cmovX( %eax, %ebx) C low carry reverse
! 413:
! 414: leal CODE_BYTES_PER_LIMB(%edx), %eax
! 415: xorl %edx, %edx
! 416: leal 4(%edi), %edi
! 417:
! 418: movl %eax, VAR_JMP
! 419:
! 420: jmp *%eax
! 421:
! 422:
! 423: ifdef(`PIC',`
! 424: L(pic_calc):
! 425: addl (%esp), %ecx
! 426: addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
! 427: addl %edx, %ecx
! 428: ret
! 429: ')
! 430:
! 431:
! 432: C Must be an even address to preserve the significance of the low
! 433: C bit of the jump address indicating which way around ecx/ebx should
! 434: C start.
! 435: ALIGN(2)
! 436:
! 437: L(unroll_inner_start):
! 438: C eax next limb
! 439: C ebx carry high
! 440: C ecx carry low
! 441: C edx scratch
! 442: C esi src
! 443: C edi dst
! 444: C ebp multiplier
! 445:
! 446: forloop(`i', UNROLL_COUNT, 1, `
! 447: deflit(`disp_src', eval(-i*4 + OFFSET))
! 448: deflit(`disp_dst', eval(disp_src - 4))
! 449:
! 450: m4_assert(`disp_src>=-128 && disp_src<128')
! 451: m4_assert(`disp_dst>=-128 && disp_dst<128')
! 452:
! 453: ifelse(eval(i%2),0,`
! 454: Zdisp( movl, disp_src,(%esi), %eax)
! 455: adcl %edx, %ebx
! 456:
! 457: mull %ebp
! 458:
! 459: Zdisp( addl, %ecx, disp_dst,(%edi))
! 460: movl $0, %ecx
! 461:
! 462: adcl %eax, %ebx
! 463:
! 464: ',`
! 465: dnl this bit comes out last
! 466: Zdisp( movl, disp_src,(%esi), %eax)
! 467: adcl %edx, %ecx
! 468:
! 469: mull %ebp
! 470:
! 471: dnl Zdisp( addl %ebx, disp_src,(%edi))
! 472: addl %ebx, disp_dst(%edi)
! 473: ifelse(forloop_last,0,
! 474: ` movl $0, %ebx')
! 475:
! 476: adcl %eax, %ecx
! 477: ')
! 478: ')
! 479:
! 480: C eax next limb
! 481: C ebx carry high
! 482: C ecx carry low
! 483: C edx scratch
! 484: C esi src
! 485: C edi dst
! 486: C ebp multiplier
! 487:
! 488: adcl $0, %edx
! 489: addl %ecx, -4+OFFSET(%edi)
! 490: movl VAR_JMP, %ecx
! 491:
! 492: adcl $0, %edx
! 493:
! 494: movl %edx, m4_empty_if_zero(OFFSET) (%edi)
! 495: movl VAR_COUNTER, %edx
! 496:
! 497: incl %edx
! 498: jnz L(unroll_outer_top)
! 499:
! 500:
! 501: ifelse(OFFSET,0,,`
! 502: addl $OFFSET, %esi
! 503: addl $OFFSET, %edi
! 504: ')
! 505:
! 506:
! 507: C------------------------------------------------------------------------------
! 508: L(corner):
! 509: C esi &src[size]
! 510: C edi &dst[2*size-5]
! 511:
! 512: movl -12(%esi), %ebp
! 513: movl -8(%esi), %eax
! 514: movl %eax, %ecx
! 515:
! 516: mull %ebp
! 517:
! 518: addl %eax, -4(%edi)
! 519: movl -4(%esi), %eax
! 520:
! 521: adcl $0, %edx
! 522: movl %edx, %ebx
! 523: movl %eax, %esi
! 524:
! 525: mull %ebp
! 526:
! 527: addl %ebx, %eax
! 528:
! 529: adcl $0, %edx
! 530: addl %eax, (%edi)
! 531: movl %esi, %eax
! 532:
! 533: adcl $0, %edx
! 534: movl %edx, %ebx
! 535:
! 536: mull %ecx
! 537:
! 538: addl %ebx, %eax
! 539: movl %eax, 4(%edi)
! 540:
! 541: adcl $0, %edx
! 542: movl %edx, 8(%edi)
! 543:
! 544:
! 545:
! 546: C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
! 547:
! 548: L(lshift_start):
! 549: movl PARAM_SIZE, %eax
! 550: movl PARAM_DST, %edi
! 551: xorl %ecx, %ecx C clear carry
! 552:
! 553: leal (%edi,%eax,8), %edi
! 554: notl %eax C -size-1, preserve carry
! 555:
! 556: leal 2(%eax), %eax C -(size-1)
! 557:
! 558: L(lshift):
! 559: C eax counter, negative
! 560: C ebx
! 561: C ecx
! 562: C edx
! 563: C esi
! 564: C edi dst, pointing just after last limb
! 565: C ebp
! 566:
! 567: rcll -4(%edi,%eax,8)
! 568: rcll (%edi,%eax,8)
! 569: incl %eax
! 570: jnz L(lshift)
! 571:
! 572: setc %al
! 573:
! 574: movl PARAM_SRC, %esi
! 575: movl %eax, -4(%edi) C dst most significant limb
! 576:
! 577: movl PARAM_SIZE, %ecx
! 578:
! 579:
! 580: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
! 581: C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
! 582: C low limb of src[0]^2.
! 583:
! 584: movl (%esi), %eax C src[0]
! 585:
! 586: mull %eax
! 587:
! 588: leal (%esi,%ecx,4), %esi C src point just after last limb
! 589: negl %ecx
! 590:
! 591: movl %eax, (%edi,%ecx,8) C dst[0]
! 592: incl %ecx
! 593:
! 594: L(diag):
! 595: C eax scratch
! 596: C ebx scratch
! 597: C ecx counter, negative
! 598: C edx carry
! 599: C esi src just after last limb
! 600: C edi dst just after last limb
! 601: C ebp
! 602:
! 603: movl (%esi,%ecx,4), %eax
! 604: movl %edx, %ebx
! 605:
! 606: mull %eax
! 607:
! 608: addl %ebx, -4(%edi,%ecx,8)
! 609: adcl %eax, (%edi,%ecx,8)
! 610: adcl $0, %edx
! 611:
! 612: incl %ecx
! 613: jnz L(diag)
! 614:
! 615:
! 616: movl SAVE_ESI, %esi
! 617: movl SAVE_EBX, %ebx
! 618:
! 619: addl %edx, -4(%edi) C dst most significant limb
! 620: movl SAVE_EDI, %edi
! 621:
! 622: movl SAVE_EBP, %ebp
! 623: addl $FRAME, %esp
! 624:
! 625: ret
! 626:
! 627: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>