Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/sqr_basecase.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
! 2: dnl
! 3: dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
! 4: dnl product at around 20x20 limbs.
! 5:
! 6:
! 7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 8: dnl
! 9: dnl This file is part of the GNU MP Library.
! 10: dnl
! 11: dnl The GNU MP Library is free software; you can redistribute it and/or
! 12: dnl modify it under the terms of the GNU Lesser General Public License as
! 13: dnl published by the Free Software Foundation; either version 2.1 of the
! 14: dnl License, or (at your option) any later version.
! 15: dnl
! 16: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 19: dnl Lesser General Public License for more details.
! 20: dnl
! 21: dnl You should have received a copy of the GNU Lesser General Public
! 22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 24: dnl Suite 330, Boston, MA 02111-1307, USA.
! 25:
! 26:
! 27: include(`../config.m4')
! 28:
! 29:
! 30: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 31: C
! 32: C Calculate src,size squared, storing the result in dst,2*size.
! 33: C
! 34: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
! 35: C lot of function call overheads are avoided, especially when the size is
! 36: C small. Sizes 1, 2 and 3 are each handled by their own straight-line code.
! 37:
! 38: defframe(PARAM_SIZE,12)
! 39: defframe(PARAM_SRC, 8)
! 40: defframe(PARAM_DST, 4)
! 41:
! 42: .text
! 43: ALIGN(8)
! 44: PROLOGUE(mpn_sqr_basecase)
! 45: deflit(`FRAME',0)
! 46:
! 47: movl PARAM_SIZE, %edx
! 48: movl PARAM_SRC, %eax
! 49:
! 50: cmpl $2, %edx C these flags feed both the je and the ja below
! 51: movl PARAM_DST, %ecx
! 52:
! 53: je L(two_limbs) C size == 2
! 54:
! 55: movl (%eax), %eax C src[0], movl leaves the flags from the cmpl intact
! 56: ja L(three_or_more) C size > 2 (unsigned compare)
! 57:
! 58: C -----------------------------------------------------------------------------
! 59: C one limb only
! 60: C eax src low limb
! 61: C ebx
! 62: C ecx dst
! 63: C edx
! 64:
! 65: mull %eax C src[0]^2, result in edx:eax
! 66:
! 67: movl %eax, (%ecx) C dst[0]
! 68: movl %edx, 4(%ecx) C dst[1]
! 69:
! 70: ret
! 71:
! 72: C -----------------------------------------------------------------------------
! 73: ALIGN(8)
! 74: L(two_limbs):
! 75: C eax src
! 76: C ebx
! 77: C ecx dst
! 78: C edx size
! 79:
! 80: pushl %ebp C ebp, edi, esi, ebx are saved here and popped again before the ret
! 81: pushl %edi
! 82:
! 83: pushl %esi
! 84: pushl %ebx
! 85:
! 86: movl %eax, %ebx C keep the src pointer, eax is needed by mull
! 87: movl (%eax), %eax C src[0]
! 88:
! 89: mull %eax C src[0]^2
! 90:
! 91: movl %eax, (%ecx) C dst[0]
! 92: movl %edx, %esi C dst[1]
! 93:
! 94: movl 4(%ebx), %eax C src[1]
! 95:
! 96: mull %eax C src[1]^2
! 97:
! 98: movl %eax, %edi C dst[2]
! 99: movl %edx, %ebp C dst[3]
! 100:
! 101: movl (%ebx), %eax C src[0]
! 102:
! 103: mull 4(%ebx) C src[0]*src[1]
! 104:
! 105: addl %eax, %esi C add the cross product in once ...
! 106: popl %ebx
! 107:
! 108: adcl %edx, %edi
! 109:
! 110: adcl $0, %ebp
! 111: addl %esi, %eax C ... and a second time, eax becomes dst[1]
! 112:
! 113: adcl %edi, %edx C edx becomes dst[2]
! 114: movl %eax, 4(%ecx)
! 115:
! 116: adcl $0, %ebp C ebp becomes dst[3]
! 117: popl %esi
! 118:
! 119: movl %edx, 8(%ecx)
! 120: movl %ebp, 12(%ecx)
! 121:
! 122: popl %edi
! 123: popl %ebp
! 124:
! 125: ret
! 126:
! 127:
! 128: C -----------------------------------------------------------------------------
! 129: ALIGN(8)
! 130: L(three_or_more):
! 131: C eax src low limb
! 132: C ebx
! 133: C ecx dst
! 134: C edx size
! 135:
! 136: cmpl $4, %edx
! 137: pushl %ebx
! 138: deflit(`FRAME',4)
! 139:
! 140: movl PARAM_SRC, %ebx
! 141: jae L(four_or_more) C size >= 4, flags still from the cmpl above
! 142:
! 143:
! 144: C -----------------------------------------------------------------------------
! 145: C three limbs
! 146: C eax src low limb
! 147: C ebx src
! 148: C ecx dst
! 149: C edx size
! 150:
! 151: pushl %ebp
! 152: pushl %edi
! 153:
! 154: mull %eax C src[0] ^ 2
! 155:
! 156: movl %eax, (%ecx) C the three squares are stored straight to dst[0..5]
! 157: movl %edx, 4(%ecx)
! 158:
! 159: movl 4(%ebx), %eax
! 160: xorl %ebp, %ebp C NOTE(review): ebp is overwritten from edx below before it is read
! 161:
! 162: mull %eax C src[1] ^ 2
! 163:
! 164: movl %eax, 8(%ecx)
! 165: movl %edx, 12(%ecx)
! 166:
! 167: movl 8(%ebx), %eax
! 168: pushl %esi C risk of cache bank clash
! 169:
! 170: mull %eax C src[2] ^ 2
! 171:
! 172: movl %eax, 16(%ecx)
! 173: movl %edx, 20(%ecx)
! 174:
! 175: movl (%ebx), %eax
! 176:
! 177: mull 4(%ebx) C src[0] * src[1]
! 178:
! 179: movl %eax, %esi
! 180: movl %edx, %edi
! 181:
! 182: movl (%ebx), %eax
! 183:
! 184: mull 8(%ebx) C src[0] * src[2]
! 185:
! 186: addl %eax, %edi
! 187: movl %edx, %ebp
! 188:
! 189: adcl $0, %ebp
! 190: movl 4(%ebx), %eax
! 191:
! 192: mull 8(%ebx) C src[1] * src[2]
! 193:
! 194: xorl %ebx, %ebx
! 195: addl %eax, %ebp
! 196:
! 197: C eax
! 198: C ebx zero, will be dst[5]
! 199: C ecx dst
! 200: C edx dst[4]
! 201: C esi dst[1]
! 202: C edi dst[2]
! 203: C ebp dst[3]
! 204:
! 205: adcl $0, %edx
! 206: addl %esi, %esi C double the cross products, carrying up through esi, edi, ebp, edx
! 207:
! 208: adcl %edi, %edi
! 209:
! 210: adcl %ebp, %ebp
! 211:
! 212: adcl %edx, %edx
! 213: movl 4(%ecx), %eax
! 214:
! 215: adcl $0, %ebx C bit doubled out of edx
! 216: addl %esi, %eax C now add the doubled cross products to the squares in dst[1..5]
! 217:
! 218: movl %eax, 4(%ecx)
! 219: movl 8(%ecx), %eax
! 220:
! 221: adcl %edi, %eax
! 222: movl 12(%ecx), %esi
! 223:
! 224: adcl %ebp, %esi
! 225: movl 16(%ecx), %edi
! 226:
! 227: movl %eax, 8(%ecx)
! 228: movl %esi, 12(%ecx)
! 229:
! 230: adcl %edx, %edi
! 231: popl %esi
! 232:
! 233: movl 20(%ecx), %eax
! 234: movl %edi, 16(%ecx)
! 235:
! 236: popl %edi
! 237: popl %ebp
! 238:
! 239: adcl %ebx, %eax C no carry out of this
! 240: popl %ebx
! 241:
! 242: movl %eax, 20(%ecx)
! 243:
! 244: ret
! 245:
! 246:
! 247: C -----------------------------------------------------------------------------
! 248: ALIGN(8)
! 249: L(four_or_more):
! 250: C eax src low limb
! 251: C ebx src
! 252: C ecx dst
! 253: C edx size
! 254: C esi
! 255: C edi
! 256: C ebp
! 257: C
! 258: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
! 259:
! 260: deflit(`FRAME',4)
! 261:
! 262: pushl %edi
! 263: FRAME_pushl()
! 264: pushl %esi
! 265: FRAME_pushl()
! 266:
! 267: pushl %ebp
! 268: FRAME_pushl()
! 269: leal (%ecx,%edx,4), %edi C dst end of this mul1
! 270:
! 271: leal (%ebx,%edx,4), %esi C src end
! 272: movl %ebx, %ebp C src
! 273:
! 274: negl %edx C -size
! 275: xorl %ebx, %ebx C clear carry limb and carry flag
! 276:
! 277: leal 1(%edx), %ecx C -(size-1), leal leaves the carry flag clear
! 278:
! 279: L(mul1):
! 280: C eax scratch
! 281: C ebx carry
! 282: C ecx counter, negative
! 283: C edx scratch
! 284: C esi &src[size]
! 285: C edi &dst[size]
! 286: C ebp src
! 287:
! 288: adcl $0, %ebx C add the carry flag left by the previous iteration's addl
! 289: movl (%esi,%ecx,4), %eax C src[size+ecx], running from src[1] to src[size-1]
! 290:
! 291: mull (%ebp) C times src[0]
! 292:
! 293: addl %eax, %ebx C low product plus carry limb
! 294:
! 295: movl %ebx, (%edi,%ecx,4) C dst[size+ecx]
! 296: incl %ecx C incl does not touch the carry flag
! 297:
! 298: movl %edx, %ebx C high product is the next carry limb
! 299: jnz L(mul1)
! 300:
! 301:
! 302: C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
! 303: C n=1..size-2.
! 304: C
! 305: C The last two products, which are the end corner of the product
! 306: C triangle, are handled separately to save looping overhead. These
! 307: C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
! 308: C If size is 4 then it's only these that need to be done.
! 309: C
! 310: C In the outer loop %esi is a constant, and %edi just advances by 1
! 311: C limb each time. The size of the operation decreases by 1 limb
! 312: C each time.
! 313:
! 314: C eax
! 315: C ebx carry (needing carry flag added)
! 316: C ecx
! 317: C edx
! 318: C esi &src[size]
! 319: C edi &dst[size]
! 320: C ebp
! 321:
! 322: adcl $0, %ebx C finish the carry limb of the mul1 loop
! 323: movl PARAM_SIZE, %edx
! 324:
! 325: movl %ebx, (%edi) C dst[size]
! 326: subl $4, %edx
! 327:
! 328: negl %edx C -(size-4), the number of outer loop iterations, negated
! 329: jz L(corner) C size == 4, only the corner products remain
! 330:
! 331:
! 332: L(outer):
! 333: C ebx previous carry limb to store
! 334: C edx outer loop counter (negative)
! 335: C esi &src[size]
! 336: C edi dst, pointing at stored carry limb of previous loop
! 337:
! 338: pushl %edx C new outer loop counter
! 339: leal -2(%edx), %ecx C inner loop counter, negative
! 340:
! 341: movl %ebx, (%edi) C carry limb from the previous pass
! 342: addl $4, %edi
! 343:
! 344: addl $4, %ebp C step to the next src limb used as multiplier
! 345: xorl %ebx, %ebx C initial carry limb, clear carry flag
! 346:
! 347: L(inner):
! 348: C eax scratch
! 349: C ebx carry (needing carry flag added)
! 350: C ecx counter, negative
! 351: C edx scratch
! 352: C esi &src[size]
! 353: C edi dst end of this addmul
! 354: C ebp &src[j]
! 355:
! 356: adcl $0, %ebx C add the carry flag left by the previous iteration's addl
! 357: movl (%esi,%ecx,4), %eax
! 358:
! 359: mull (%ebp)
! 360:
! 361: addl %ebx, %eax C low product plus carry limb
! 362: movl (%edi,%ecx,4), %ebx C existing dst limb
! 363:
! 364: adcl $0, %edx
! 365: addl %eax, %ebx C accumulate into the dst limb
! 366:
! 367: movl %ebx, (%edi,%ecx,4)
! 368: incl %ecx C incl does not touch the carry flag
! 369:
! 370: movl %edx, %ebx C high product is the next carry limb
! 371: jnz L(inner)
! 372:
! 373:
! 374: adcl $0, %ebx
! 375: popl %edx C outer loop counter
! 376:
! 377: incl %edx
! 378: jnz L(outer)
! 379:
! 380:
! 381: movl %ebx, (%edi) C carry limb of the final pass
! 382:
! 383: L(corner):
! 384: C esi &src[size]
! 385: C edi &dst[2*size-4]
! 386:
! 387: movl -8(%esi), %eax
! 388: movl -4(%edi), %ebx C risk of data cache bank clash here
! 389:
! 390: mull -12(%esi) C src[size-2]*src[size-3]
! 391:
! 392: addl %eax, %ebx
! 393: movl %edx, %ecx
! 394:
! 395: adcl $0, %ecx C carry limb into the next product
! 396: movl -4(%esi), %eax
! 397:
! 398: mull -12(%esi) C src[size-1]*src[size-3]
! 399:
! 400: addl %ecx, %eax
! 401: movl (%edi), %ecx
! 402:
! 403: adcl $0, %edx
! 404: movl %ebx, -4(%edi) C dst[2*size-5]
! 405:
! 406: addl %eax, %ecx
! 407: movl %edx, %ebx
! 408:
! 409: adcl $0, %ebx C carry limb into the last product
! 410: movl -4(%esi), %eax
! 411:
! 412: mull -8(%esi) C src[size-1]*src[size-2]
! 413:
! 414: movl %ecx, 0(%edi) C dst[2*size-4]
! 415: addl %eax, %ebx
! 416:
! 417: adcl $0, %edx
! 418: movl PARAM_SIZE, %eax
! 419:
! 420: negl %eax
! 421: movl %ebx, 4(%edi) C dst[2*size-3]
! 422:
! 423: addl $1, %eax C -(size-1) and clear carry
! 424: movl %edx, 8(%edi) C dst[2*size-2]
! 425:
! 426:
! 427: C -----------------------------------------------------------------------------
! 428: C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
! 429:
! 430: L(lshift):
! 431: C eax counter, negative
! 432: C ebx next limb
! 433: C ecx
! 434: C edx
! 435: C esi
! 436: C edi &dst[2*size-4]
! 437: C ebp
! 438:
! 439: movl 12(%edi,%eax,8), %ebx C two limbs per iteration, starting from dst[1]
! 440:
! 441: rcll %ebx C shift left one bit, carry flag in at the bottom, top bit out to carry
! 442: movl 16(%edi,%eax,8), %ecx
! 443:
! 444: rcll %ecx
! 445: movl %ebx, 12(%edi,%eax,8)
! 446:
! 447: movl %ecx, 16(%edi,%eax,8)
! 448: incl %eax C incl does not touch the carry flag
! 449:
! 450: jnz L(lshift)
! 451:
! 452:
! 453: adcl %eax, %eax C high bit out, eax is zero here so this is just the carry
! 454: movl PARAM_SRC, %esi
! 455:
! 456: movl PARAM_SIZE, %ecx C risk of cache bank clash
! 457: movl %eax, 12(%edi) C dst most significant limb
! 458:
! 459:
! 460: C -----------------------------------------------------------------------------
! 461: C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
! 462: C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
! 463: C low limb of src[0]^2.
! 464:
! 465: movl (%esi), %eax C src[0]
! 466: leal (%esi,%ecx,4), %esi C src end
! 467:
! 468: negl %ecx C -size
! 469:
! 470: mull %eax C src[0]^2
! 471:
! 472: movl %eax, 16(%edi,%ecx,8) C dst[0]
! 473: movl %edx, %ebx C high limb of src[0]^2 starts as the carry limb
! 474:
! 475: addl $1, %ecx C -(size-1) and clear carry
! 476:
! 477: L(diag):
! 478: C eax scratch (low product)
! 479: C ebx carry limb
! 480: C ecx counter, negative
! 481: C edx scratch (high product)
! 482: C esi &src[size]
! 483: C edi &dst[2*size-4]
! 484: C ebp scratch (fetched dst limbs)
! 485:
! 486: movl (%esi,%ecx,4), %eax
! 487: adcl $0, %ebx C add the carry flag left by the previous iteration's adcl
! 488:
! 489: mull %eax C square of the src limb just fetched
! 490:
! 491: movl 16-4(%edi,%ecx,8), %ebp C odd dst limb, takes the previous square's high limb
! 492:
! 493: addl %ebp, %ebx
! 494: movl 16(%edi,%ecx,8), %ebp C even dst limb, takes this square's low limb
! 495:
! 496: adcl %eax, %ebp
! 497: movl %ebx, 16-4(%edi,%ecx,8)
! 498:
! 499: movl %ebp, 16(%edi,%ecx,8)
! 500: incl %ecx C incl does not touch the carry flag
! 501:
! 502: movl %edx, %ebx
! 503: jnz L(diag)
! 504:
! 505:
! 506: adcl $0, %edx C high limb of the last square plus the final carry flag
! 507: movl 16-4(%edi), %eax C dst most significant limb
! 508:
! 509: addl %eax, %edx
! 510: popl %ebp
! 511:
! 512: movl %edx, 16-4(%edi)
! 513: popl %esi C risk of cache bank clash
! 514:
! 515: popl %edi
! 516: popl %ebx
! 517:
! 518: ret
! 519:
! 520: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>