Annotation of OpenXM_contrib/gmp/mpn/x86/p6/sqr_basecase.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
! 2: dnl
! 3: dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
! 4: dnl product (measured on the speed difference between 20 and 40 limbs,
! 5: dnl which is the Karatsuba recursing range).
! 6:
! 7:
! 8: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 9: dnl
! 10: dnl This file is part of the GNU MP Library.
! 11: dnl
! 12: dnl The GNU MP Library is free software; you can redistribute it and/or
! 13: dnl modify it under the terms of the GNU Lesser General Public License as
! 14: dnl published by the Free Software Foundation; either version 2.1 of the
! 15: dnl License, or (at your option) any later version.
! 16: dnl
! 17: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 18: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 19: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 20: dnl Lesser General Public License for more details.
! 21: dnl
! 22: dnl You should have received a copy of the GNU Lesser General Public
! 23: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 24: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 25: dnl Suite 330, Boston, MA 02111-1307, USA.
! 26:
! 27:
! 28: include(`../config.m4')
! 29:
! 30:
! 31: dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
! 32: dnl a description. The only difference here is that UNROLL_COUNT can go up
! 33: dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
! 34:
! 35: deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
! 36:
! 37: ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
! 38: `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
! 39:
! 40: m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
! 41: deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
! 42:
! 43:
! 44: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 45: C
! 46: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
! 47: C lot of function call overheads are avoided, especially when the given size
! 48: C is small.
! 49: C
! 50: C The code size might look a bit excessive, but not all of it is executed so
! 51: C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
! 52: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
! 53: C the unrolled addmul; and big sizes like 40x40 that do use the full
! 54: C unrolling will least be making good use of it, because 40x40 will take
! 55: C something like 7000 cycles.
! 56:
! 57: defframe(PARAM_SIZE,12)
! 58: defframe(PARAM_SRC, 8)
! 59: defframe(PARAM_DST, 4)
! 60:
! 61: .text
! 62: ALIGN(32)
! 63: PROLOGUE(mpn_sqr_basecase)
! 64: deflit(`FRAME',0)
! 65:
! 66: movl PARAM_SIZE, %edx
! 67:
! 68: movl PARAM_SRC, %eax
! 69:
! 70: cmpl $2, %edx
! 71: movl PARAM_DST, %ecx
! 72: je L(two_limbs)
! 73:
! 74: movl (%eax), %eax
! 75: ja L(three_or_more)
! 76:
! 77:
! 78: C -----------------------------------------------------------------------------
! 79: C one limb only
! 80: C eax src limb
! 81: C ebx
! 82: C ecx dst
! 83: C edx
! 84:
! 85: mull %eax
! 86:
! 87: movl %eax, (%ecx)
! 88: movl %edx, 4(%ecx)
! 89:
! 90: ret
! 91:
! 92:
! 93: C -----------------------------------------------------------------------------
! 94: L(two_limbs):
! 95: C eax src
! 96: C ebx
! 97: C ecx dst
! 98: C edx
! 99:
! 100: defframe(SAVE_ESI, -4)
! 101: defframe(SAVE_EBX, -8)
! 102: defframe(SAVE_EDI, -12)
! 103: defframe(SAVE_EBP, -16)
! 104: deflit(`STACK_SPACE',16)
! 105:
! 106: subl $STACK_SPACE, %esp
! 107: deflit(`FRAME',STACK_SPACE)
! 108:
! 109: movl %esi, SAVE_ESI
! 110: movl %eax, %esi
! 111: movl (%eax), %eax
! 112:
! 113: mull %eax C src[0]^2
! 114:
! 115: movl %eax, (%ecx) C dst[0]
! 116: movl 4(%esi), %eax
! 117:
! 118: movl %ebx, SAVE_EBX
! 119: movl %edx, %ebx C dst[1]
! 120:
! 121: mull %eax C src[1]^2
! 122:
! 123: movl %edi, SAVE_EDI
! 124: movl %eax, %edi C dst[2]
! 125: movl (%esi), %eax
! 126:
! 127: movl %ebp, SAVE_EBP
! 128: movl %edx, %ebp C dst[3]
! 129:
! 130: mull 4(%esi) C src[0]*src[1]
! 131:
! 132: addl %eax, %ebx
! 133: movl SAVE_ESI, %esi
! 134:
! 135: adcl %edx, %edi
! 136:
! 137: adcl $0, %ebp
! 138: addl %ebx, %eax
! 139: movl SAVE_EBX, %ebx
! 140:
! 141: adcl %edi, %edx
! 142: movl SAVE_EDI, %edi
! 143:
! 144: adcl $0, %ebp
! 145:
! 146: movl %eax, 4(%ecx)
! 147:
! 148: movl %ebp, 12(%ecx)
! 149: movl SAVE_EBP, %ebp
! 150:
! 151: movl %edx, 8(%ecx)
! 152: addl $FRAME, %esp
! 153:
! 154: ret
! 155:
! 156:
! 157: C -----------------------------------------------------------------------------
! 158: L(three_or_more):
! 159: C eax src low limb
! 160: C ebx
! 161: C ecx dst
! 162: C edx size
! 163: deflit(`FRAME',0)
! 164:
! 165: pushl %esi defframe_pushl(`SAVE_ESI')
! 166: cmpl $4, %edx
! 167:
! 168: movl PARAM_SRC, %esi
! 169: jae L(four_or_more)
! 170:
! 171:
! 172: C -----------------------------------------------------------------------------
! 173: C three limbs
! 174:
! 175: C eax src low limb
! 176: C ebx
! 177: C ecx dst
! 178: C edx
! 179: C esi src
! 180: C edi
! 181: C ebp
! 182:
! 183: pushl %ebp defframe_pushl(`SAVE_EBP')
! 184: pushl %edi defframe_pushl(`SAVE_EDI')
! 185:
! 186: mull %eax C src[0] ^ 2
! 187:
! 188: movl %eax, (%ecx)
! 189: movl %edx, 4(%ecx)
! 190:
! 191: movl 4(%esi), %eax
! 192: xorl %ebp, %ebp
! 193:
! 194: mull %eax C src[1] ^ 2
! 195:
! 196: movl %eax, 8(%ecx)
! 197: movl %edx, 12(%ecx)
! 198: movl 8(%esi), %eax
! 199:
! 200: pushl %ebx defframe_pushl(`SAVE_EBX')
! 201:
! 202: mull %eax C src[2] ^ 2
! 203:
! 204: movl %eax, 16(%ecx)
! 205: movl %edx, 20(%ecx)
! 206:
! 207: movl (%esi), %eax
! 208:
! 209: mull 4(%esi) C src[0] * src[1]
! 210:
! 211: movl %eax, %ebx
! 212: movl %edx, %edi
! 213:
! 214: movl (%esi), %eax
! 215:
! 216: mull 8(%esi) C src[0] * src[2]
! 217:
! 218: addl %eax, %edi
! 219: movl %edx, %ebp
! 220:
! 221: adcl $0, %ebp
! 222: movl 4(%esi), %eax
! 223:
! 224: mull 8(%esi) C src[1] * src[2]
! 225:
! 226: xorl %esi, %esi
! 227: addl %eax, %ebp
! 228:
! 229: C eax
! 230: C ebx dst[1]
! 231: C ecx dst
! 232: C edx dst[4]
! 233: C esi zero, will be dst[5]
! 234: C edi dst[2]
! 235: C ebp dst[3]
! 236:
! 237: adcl $0, %edx
! 238: addl %ebx, %ebx
! 239:
! 240: adcl %edi, %edi
! 241:
! 242: adcl %ebp, %ebp
! 243:
! 244: adcl %edx, %edx
! 245: movl 4(%ecx), %eax
! 246:
! 247: adcl $0, %esi
! 248: addl %ebx, %eax
! 249:
! 250: movl %eax, 4(%ecx)
! 251: movl 8(%ecx), %eax
! 252:
! 253: adcl %edi, %eax
! 254: movl 12(%ecx), %ebx
! 255:
! 256: adcl %ebp, %ebx
! 257: movl 16(%ecx), %edi
! 258:
! 259: movl %eax, 8(%ecx)
! 260: movl SAVE_EBP, %ebp
! 261:
! 262: movl %ebx, 12(%ecx)
! 263: movl SAVE_EBX, %ebx
! 264:
! 265: adcl %edx, %edi
! 266: movl 20(%ecx), %eax
! 267:
! 268: movl %edi, 16(%ecx)
! 269: movl SAVE_EDI, %edi
! 270:
! 271: adcl %esi, %eax C no carry out of this
! 272: movl SAVE_ESI, %esi
! 273:
! 274: movl %eax, 20(%ecx)
! 275: addl $FRAME, %esp
! 276:
! 277: ret
! 278:
! 279:
! 280:
! 281: C -----------------------------------------------------------------------------
! 282: defframe(VAR_COUNTER,-20)
! 283: defframe(VAR_JMP, -24)
! 284: deflit(`STACK_SPACE',24)
! 285:
! 286: L(four_or_more):
! 287: C eax src low limb
! 288: C ebx
! 289: C ecx
! 290: C edx size
! 291: C esi src
! 292: C edi
! 293: C ebp
! 294: deflit(`FRAME',4) dnl %esi already pushed
! 295:
! 296: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
! 297:
! 298: subl $STACK_SPACE-FRAME, %esp
! 299: deflit(`FRAME',STACK_SPACE)
! 300: movl $1, %ecx
! 301:
! 302: movl %edi, SAVE_EDI
! 303: movl PARAM_DST, %edi
! 304:
! 305: movl %ebx, SAVE_EBX
! 306: subl %edx, %ecx C -(size-1)
! 307:
! 308: movl %ebp, SAVE_EBP
! 309: movl $0, %ebx C initial carry
! 310:
! 311: leal (%esi,%edx,4), %esi C &src[size]
! 312: movl %eax, %ebp C multiplier
! 313:
! 314: leal -4(%edi,%edx,4), %edi C &dst[size-1]
! 315:
! 316:
! 317: C This loop runs at just over 6 c/l.
! 318:
! 319: L(mul_1):
! 320: C eax scratch
! 321: C ebx carry
! 322: C ecx counter, limbs, negative, -(size-1) to -1
! 323: C edx scratch
! 324: C esi &src[size]
! 325: C edi &dst[size-1]
! 326: C ebp multiplier
! 327:
! 328: movl %ebp, %eax
! 329:
! 330: mull (%esi,%ecx,4)
! 331:
! 332: addl %ebx, %eax
! 333: movl $0, %ebx
! 334:
! 335: adcl %edx, %ebx
! 336: movl %eax, 4(%edi,%ecx,4)
! 337:
! 338: incl %ecx
! 339: jnz L(mul_1)
! 340:
! 341:
! 342: movl %ebx, 4(%edi)
! 343:
! 344:
! 345: C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
! 346: C
! 347: C The last two addmuls, which are the bottom right corner of the product
! 348: C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
! 349: C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
! 350: C cases that need to be done.
! 351: C
! 352: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
! 353: C comments.
! 354: C
! 355: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
! 356: C
! 357: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
! 358: C chunk each outer loop.
! 359:
! 360: dnl This is also hard-coded in the address calculation below.
! 361: deflit(CODE_BYTES_PER_LIMB, 15)
! 362:
! 363: dnl With &src[size] and &dst[size-1] pointers, the displacements in the
! 364: dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
! 365: dnl that an offset must be added to them.
! 366: deflit(OFFSET,
! 367: ifelse(eval(UNROLL_COUNT>32),1,
! 368: eval((UNROLL_COUNT-32)*4),
! 369: 0))
! 370:
! 371: C eax
! 372: C ebx carry
! 373: C ecx
! 374: C edx
! 375: C esi &src[size]
! 376: C edi &dst[size-1]
! 377: C ebp
! 378:
! 379: movl PARAM_SIZE, %ecx
! 380:
! 381: subl $4, %ecx
! 382: jz L(corner)
! 383:
! 384: movl %ecx, %edx
! 385: negl %ecx
! 386:
! 387: shll $4, %ecx
! 388: ifelse(OFFSET,0,,`subl $OFFSET, %esi')
! 389:
! 390: ifdef(`PIC',`
! 391: call L(pic_calc)
! 392: L(here):
! 393: ',`
! 394: leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
! 395: ')
! 396: negl %edx
! 397:
! 398: ifelse(OFFSET,0,,`subl $OFFSET, %edi')
! 399:
! 400: C The calculated jump mustn't be before the start of the available
! 401: C code. This is the limit that UNROLL_COUNT puts on the src operand
! 402: C size, but checked here using the jump address directly.
! 403:
! 404: ASSERT(ae,
! 405: `movl_text_address( L(unroll_inner_start), %eax)
! 406: cmpl %eax, %ecx')
! 407:
! 408:
! 409: C -----------------------------------------------------------------------------
! 410: ALIGN(16)
! 411: L(unroll_outer_top):
! 412: C eax
! 413: C ebx high limb to store
! 414: C ecx VAR_JMP
! 415: C edx VAR_COUNTER, limbs, negative
! 416: C esi &src[size], constant
! 417: C edi dst ptr, second highest limb of last addmul
! 418: C ebp
! 419:
! 420: movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
! 421: movl %edx, VAR_COUNTER
! 422:
! 423: movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
! 424:
! 425: mull %ebp
! 426:
! 427: define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
! 428:
! 429: testb $1, %cl
! 430:
! 431: movl %edx, %ebx C high carry
! 432: leal 4(%edi), %edi
! 433:
! 434: movl %ecx, %edx C jump
! 435:
! 436: movl %eax, %ecx C low carry
! 437: leal CODE_BYTES_PER_LIMB(%edx), %edx
! 438:
! 439: cmovX( %ebx, %ecx) C high carry reverse
! 440: cmovX( %eax, %ebx) C low carry reverse
! 441: movl %edx, VAR_JMP
! 442: jmp *%edx
! 443:
! 444:
! 445: C Must be on an even address here so the low bit of the jump address
! 446: C will indicate which way around ecx/ebx should start.
! 447:
! 448: ALIGN(2)
! 449:
! 450: L(unroll_inner_start):
! 451: C eax scratch
! 452: C ebx carry high
! 453: C ecx carry low
! 454: C edx scratch
! 455: C esi src pointer
! 456: C edi dst pointer
! 457: C ebp multiplier
! 458: C
! 459: C 15 code bytes each limb
! 460: C ecx/ebx reversed on each chunk
! 461:
! 462: forloop(`i', UNROLL_COUNT, 1, `
! 463: deflit(`disp_src', eval(-i*4 + OFFSET))
! 464: deflit(`disp_dst', eval(disp_src))
! 465:
! 466: m4_assert(`disp_src>=-128 && disp_src<128')
! 467: m4_assert(`disp_dst>=-128 && disp_dst<128')
! 468:
! 469: ifelse(eval(i%2),0,`
! 470: Zdisp( movl, disp_src,(%esi), %eax)
! 471: mull %ebp
! 472: Zdisp( addl, %ebx, disp_dst,(%edi))
! 473: adcl %eax, %ecx
! 474: movl %edx, %ebx
! 475: adcl $0, %ebx
! 476: ',`
! 477: dnl this one comes out last
! 478: Zdisp( movl, disp_src,(%esi), %eax)
! 479: mull %ebp
! 480: Zdisp( addl, %ecx, disp_dst,(%edi))
! 481: adcl %eax, %ebx
! 482: movl %edx, %ecx
! 483: adcl $0, %ecx
! 484: ')
! 485: ')
! 486: L(unroll_inner_end):
! 487:
! 488: addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
! 489:
! 490: movl VAR_COUNTER, %edx
! 491: adcl $0, %ecx
! 492:
! 493: movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
! 494: movl VAR_JMP, %ecx
! 495:
! 496: incl %edx
! 497: jnz L(unroll_outer_top)
! 498:
! 499:
! 500: ifelse(OFFSET,0,,`
! 501: addl $OFFSET, %esi
! 502: addl $OFFSET, %edi
! 503: ')
! 504:
! 505:
! 506: C -----------------------------------------------------------------------------
! 507: ALIGN(16)
! 508: L(corner):
! 509: C eax
! 510: C ebx
! 511: C ecx
! 512: C edx
! 513: C esi &src[size]
! 514: C edi &dst[2*size-5]
! 515: C ebp
! 516:
! 517: movl -12(%esi), %eax
! 518:
! 519: mull -8(%esi)
! 520:
! 521: addl %eax, (%edi)
! 522: movl -12(%esi), %eax
! 523: movl $0, %ebx
! 524:
! 525: adcl %edx, %ebx
! 526:
! 527: mull -4(%esi)
! 528:
! 529: addl %eax, %ebx
! 530: movl -8(%esi), %eax
! 531:
! 532: adcl $0, %edx
! 533:
! 534: addl %ebx, 4(%edi)
! 535: movl $0, %ebx
! 536:
! 537: adcl %edx, %ebx
! 538:
! 539: mull -4(%esi)
! 540:
! 541: movl PARAM_SIZE, %ecx
! 542: addl %ebx, %eax
! 543:
! 544: adcl $0, %edx
! 545:
! 546: movl %eax, 8(%edi)
! 547:
! 548: movl %edx, 12(%edi)
! 549: movl PARAM_DST, %edi
! 550:
! 551:
! 552: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
! 553:
! 554: subl $1, %ecx C size-1
! 555: xorl %eax, %eax C ready for final adcl, and clear carry
! 556:
! 557: movl %ecx, %edx
! 558: movl PARAM_SRC, %esi
! 559:
! 560:
! 561: L(lshift):
! 562: C eax
! 563: C ebx
! 564: C ecx counter, size-1 to 1
! 565: C edx size-1 (for later use)
! 566: C esi src (for later use)
! 567: C edi dst, incrementing
! 568: C ebp
! 569:
! 570: rcll 4(%edi)
! 571: rcll 8(%edi)
! 572:
! 573: leal 8(%edi), %edi
! 574: decl %ecx
! 575: jnz L(lshift)
! 576:
! 577:
! 578: adcl %eax, %eax
! 579:
! 580: movl %eax, 4(%edi) C dst most significant limb
! 581: movl (%esi), %eax C src[0]
! 582:
! 583: leal 4(%esi,%edx,4), %esi C &src[size]
! 584: subl %edx, %ecx C -(size-1)
! 585:
! 586:
! 587: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
! 588: C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
! 589: C low limb of src[0]^2.
! 590:
! 591:
! 592: mull %eax
! 593:
! 594: movl %eax, (%edi,%ecx,8) C dst[0]
! 595:
! 596:
! 597: L(diag):
! 598: C eax scratch
! 599: C ebx scratch
! 600: C ecx counter, negative
! 601: C edx carry
! 602: C esi &src[size]
! 603: C edi dst[2*size-2]
! 604: C ebp
! 605:
! 606: movl (%esi,%ecx,4), %eax
! 607: movl %edx, %ebx
! 608:
! 609: mull %eax
! 610:
! 611: addl %ebx, 4(%edi,%ecx,8)
! 612: adcl %eax, 8(%edi,%ecx,8)
! 613: adcl $0, %edx
! 614:
! 615: incl %ecx
! 616: jnz L(diag)
! 617:
! 618:
! 619: movl SAVE_ESI, %esi
! 620: movl SAVE_EBX, %ebx
! 621:
! 622: addl %edx, 4(%edi) C dst most significant limb
! 623:
! 624: movl SAVE_EDI, %edi
! 625: movl SAVE_EBP, %ebp
! 626: addl $FRAME, %esp
! 627: ret
! 628:
! 629:
! 630:
! 631: C -----------------------------------------------------------------------------
! 632: ifdef(`PIC',`
! 633: L(pic_calc):
! 634: addl (%esp), %ecx
! 635: addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
! 636: addl %edx, %ecx
! 637: ret
! 638: ')
! 639:
! 640:
! 641: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>