Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 nails mpn_addmul_1.
! 2:
! 3: dnl Copyright 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24: dnl INPUT PARAMETERS
! 25: define(`rp',`r16') C pointer to the limbs that are loaded, added to and stored back
! 26: define(`up',`r17') C pointer to the limbs that are multiplied by vl0 (only loaded)
! 27: define(`n',`r18') C limb count
! 28: define(`vl0',`r19') C multiplier limb, shifted left by NAIL_BITS in the prologue
! 29:
! 30: define(`numb_mask',`r14') C all ones shifted right by NAIL_BITS, built in the prologue
! 31:
! 32: define(`m0a',`r0') C mNa receives the mulq result, mNb the umulh result, for limb N of a group
! 33: define(`m0b',`r1')
! 34: define(`m1a',`r2')
! 35: define(`m1b',`r3')
! 36: define(`m2a',`r20')
! 37: define(`m2b',`r21')
! 38: define(`m3a',`r12')
! 39: define(`m3b',`r13')
! 40:
! 41: define(`acc0',`r9') C acc0 and acc1 alternate as the per-limb sum before it is split into limb and carry
! 42: define(`acc1',`r27')
! 43:
! 44: define(`ul0',`r4') C ulN holds limb N of the current group loaded from up
! 45: define(`ul1',`r5')
! 46: define(`ul2',`r6')
! 47: define(`ul3',`r7')
! 48:
! 49: define(`rl0',`r22') C rlN holds limb N of the current group loaded from rp
! 50: define(`rl1',`r23')
! 51: define(`rl2',`r24')
! 52: define(`rl3',`r25')
! 53:
! 54: C unused scratch: none (every register on the scratch list below is used)
! 55: C unused saved r10 r11
! 56:
! 57: define(`NAIL_BITS',`GMP_NAIL_BITS') C short aliases for the GMP_* values
! 58: define(`NUMB_BITS',`GMP_NUMB_BITS')
! 59:
! 60: dnl This declaration is munged by configure
! 61: NAILS_SUPPORT(2-63)
! 62:
! 63: dnl Runs at 4.5 cycles/limb. Local scheduling should bring that down to 3.5
! 64: dnl cycles/limb. It would be possible to reach 3.25 cycles/limb with 8-way
! 65: dnl unrolling.
! 66:
! 67: dnl Register usage:
! 68: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
! 69: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
! 70: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
! 71: dnl return address: 26
! 72: dnl global pointer: 29
! 73: dnl stack pointer: 30
! 74: C mpn_addmul_1: for each of the n limbs, multiply the limb at up by vl0 and add it into the limb at rp; the carry-out limb is returned in r0
! 75: ASM_START()
! 76: PROLOGUE(mpn_addmul_1)
! 77: lda r30, -240(r30) C allocate a 240-byte stack frame
! 78: stq r9, 8(r30) C save the callee-saved registers used below (r9, r12-r15)
! 79: C stq r10, 16(r30)
! 80: C stq r11, 24(r30)
! 81: stq r12, 32(r30)
! 82: stq r13, 40(r30)
! 83: stq r14, 48(r30)
! 84: stq r15, 56(r30)
! 85:
! 86: sll vl0, NAIL_BITS, vl0 C pre-shift the multiplier; presumably makes mulq/umulh split the product at bit NUMB_BITS (assumes NUMB_BITS = 64 - NAIL_BITS, TODO confirm)
! 87: lda numb_mask, -1(r31) C numb_mask = all ones
! 88: srl numb_mask, NAIL_BITS, numb_mask C numb_mask = all ones >> NAIL_BITS
! 89:
! 90: bic r31, r31, r15 C r15 = 0: carry passed from limb to limb
! 91: bic r31, r31, m3b C m3b = 0: high product half pending from the previous limb
! 92:
! 93: and n, 3, r25 C r25 = n mod 4 = limbs handled one at a time by Loop0
! 94: beq r25, L4
! 95: Loop0:
! 96: ldq ul0, 0(up)
! 97: ldq rl0, 0(rp)
! 98: mulq vl0, ul0, m0a C U1
! 99: srl m0a,NAIL_BITS, r8 C r8 = low product half moved back down by NAIL_BITS
! 100: addq r8, m3b, acc0 C + high product half of the previous limb
! 101: addq rl0, acc0, acc0 C + limb loaded from rp
! 102: addq r15, acc0, acc0 C + carry from the previous limb
! 103: umulh vl0, ul0, m3b C U1
! 104: srl acc0,NUMB_BITS, r15 C carry out = bits of the sum from NUMB_BITS upwards
! 105: and acc0,numb_mask, r28 C result limb = sum masked with numb_mask
! 106: stq r28, 0(rp)
! 107: lda rp, 8(rp)
! 108: lda up, 8(up)
! 109: lda r25, -1(r25)
! 110: bne r25, Loop0
! 111: C n was not decremented by Loop0; n-4 >= 0 means at least one group of 4 limbs remains
! 112: L4:
! 113: lda n, -4(n)
! 114: bge n, L_4_or_more
! 115: L_0_to_3:
! 116: addq m3b, r15, r0 C return pending high half + carry (no numb_mask applied on this exit, unlike the others)
! 117: br r31, Lret
! 118: C load the first group of 4 limbs from up and the first limb from rp
! 119: L_4_or_more:
! 120: ldq ul0, 0(up)
! 121: ldq ul1, 8(up)
! 122: ldq ul2, 16(up)
! 123: ldq ul3, 24(up)
! 124: ldq rl0, 0(rp)
! 125: lda n, -4(n)
! 126: lda up, 32(up)
! 127: bge n, L_8_or_more
! 128: L_4_to_8: C exactly one group of 4 limbs is left: multiply, accumulate, store, return
! 129: mulq vl0, ul0, m0a C U1
! 130: umulh vl0, ul0, m0b C U1
! 131: ldq rl1, 8(rp)
! 132: mulq vl0, ul1, m1a C U1
! 133: umulh vl0, ul1, m1b C U1
! 134: ldq rl2, 16(rp)
! 135: mulq vl0, ul2, m2a C U1
! 136: umulh vl0, ul2, m2b C U1
! 137: srl m0a,NAIL_BITS, r8
! 138: ldq rl3, 24(rp)
! 139: mulq vl0, ul3, m3a C U1
! 140: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 141: umulh vl0, ul3, m3b C U1
! 142: addq rl0, acc0, acc0
! 143: srl m1a,NAIL_BITS, r8
! 144: addq r15, acc0, acc0
! 145: C no multiplies past this point on this path; accumulate and store the 4 limbs
! 146: addq r8, m0b, acc1
! 147: srl acc0,NUMB_BITS, r15
! 148: addq rl1, acc1, acc1
! 149: and acc0,numb_mask, r28
! 150: srl m2a,NAIL_BITS, r8
! 151: addq r15, acc1, acc1
! 152: bis r31, r31, r31 C nop
! 153: addq r8, m1b, acc0
! 154: srl acc1,NUMB_BITS, r15
! 155: stq r28, 0(rp)
! 156: addq rl2, acc0, acc0
! 157: and acc1,numb_mask, r28
! 158: srl m3a,NAIL_BITS, r8
! 159: addq r15, acc0, acc0
! 160: bis r31, r31, r31 C nop
! 161: addq r8, m2b, acc1
! 162: srl acc0,NUMB_BITS, r15
! 163: stq r28, 8(rp)
! 164: addq rl3, acc1, acc1
! 165: and acc0,numb_mask, r28
! 166: addq r15, acc1, acc1
! 167: bis r31, r31, r31 C nop
! 168: srl acc1,NUMB_BITS, r15
! 169: stq r28, 16(rp)
! 170: and acc1,numb_mask, r28
! 171: addq m3b, r15, acc0 C last high product half + last carry
! 172: stq r28, 24(rp)
! 173: and acc0,numb_mask, r0 C return value, masked with numb_mask
! 174:
! 175: br r31, Lret
! 176: C two or more groups: multiply the loaded group while loading the next one
! 177: L_8_or_more:
! 178: mulq vl0, ul0, m0a C U1
! 179: umulh vl0, ul0, m0b C U1
! 180: ldq ul0, 0(up) C ul0 is reloaded with the next group after both multiplies that read it
! 181: ldq rl1, 8(rp)
! 182: mulq vl0, ul1, m1a C U1
! 183: umulh vl0, ul1, m1b C U1
! 184: ldq ul1, 8(up)
! 185: ldq rl2, 16(rp)
! 186: mulq vl0, ul2, m2a C U1
! 187: umulh vl0, ul2, m2b C U1
! 188: ldq ul2, 16(up)
! 189: srl m0a,NAIL_BITS, r8
! 190: ldq rl3, 24(rp)
! 191: mulq vl0, ul3, m3a C U1
! 192: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 193: umulh vl0, ul3, m3b C U1
! 194: ldq ul3, 24(up)
! 195: addq rl0, acc0, acc0
! 196: srl m1a,NAIL_BITS, r8
! 197: ldq rl0, 32(rp) C first rp limb of the next group (rp is advanced only below)
! 198: addq r15, acc0, acc0
! 199: lda n, -4(n)
! 200: lda up, 32(up)
! 201: lda rp, 32(rp)
! 202: bge n, L_12_or_more C U0
! 203: L_8_to_11: C the last group is already loaded: no further loads from up on this path
! 204: mulq vl0, ul0, m0a C U1
! 205: addq r8, m0b, acc1
! 206: srl acc0,NUMB_BITS, r15
! 207: umulh vl0, ul0, m0b C U1
! 208: addq rl1, acc1, acc1
! 209: and acc0,numb_mask, r28
! 210: srl m2a,NAIL_BITS, r8
! 211: ldq rl1, 8(rp)
! 212: addq r15, acc1, acc1
! 213: bis r31, r31, r31 C nop
! 214: mulq vl0, ul1, m1a C U1
! 215: addq r8, m1b, acc0
! 216: srl acc1,NUMB_BITS, r15
! 217: stq r28, -32(rp) C negative offsets address the previous group, since rp was advanced by 32
! 218: umulh vl0, ul1, m1b C U1
! 219: addq rl2, acc0, acc0
! 220: and acc1,numb_mask, r28
! 221: srl m3a,NAIL_BITS, r8
! 222: ldq rl2, 16(rp)
! 223: addq r15, acc0, acc0
! 224: bis r31, r31, r31 C nop
! 225: mulq vl0, ul2, m2a C U1
! 226: addq r8, m2b, acc1
! 227: srl acc0,NUMB_BITS, r15
! 228: stq r28, -24(rp)
! 229: umulh vl0, ul2, m2b C U1
! 230: addq rl3, acc1, acc1
! 231: and acc0,numb_mask, r28
! 232: srl m0a,NAIL_BITS, r8
! 233: ldq rl3, 24(rp)
! 234: addq r15, acc1, acc1
! 235: bis r31, r31, r31 C nop
! 236: mulq vl0, ul3, m3a C U1
! 237: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 238: srl acc1,NUMB_BITS, r15
! 239: stq r28, -16(rp)
! 240: umulh vl0, ul3, m3b C U1
! 241: addq rl0, acc0, acc0
! 242: and acc1,numb_mask, r28
! 243: srl m1a,NAIL_BITS, r8
! 244: addq r15, acc0, acc0
! 245: C no multiplies past this point on this path; accumulate and store the last group
! 246: addq r8, m0b, acc1
! 247: srl acc0,NUMB_BITS, r15
! 248: stq r28, -8(rp)
! 249: addq rl1, acc1, acc1
! 250: and acc0,numb_mask, r28
! 251: srl m2a,NAIL_BITS, r8
! 252: addq r15, acc1, acc1
! 253: bis r31, r31, r31 C nop
! 254: addq r8, m1b, acc0
! 255: srl acc1,NUMB_BITS, r15
! 256: stq r28, 0(rp)
! 257: addq rl2, acc0, acc0
! 258: and acc1,numb_mask, r28
! 259: srl m3a,NAIL_BITS, r8
! 260: addq r15, acc0, acc0
! 261: bis r31, r31, r31 C nop
! 262: addq r8, m2b, acc1
! 263: srl acc0,NUMB_BITS, r15
! 264: stq r28, 8(rp)
! 265: addq rl3, acc1, acc1
! 266: and acc0,numb_mask, r28
! 267: addq r15, acc1, acc1
! 268: bis r31, r31, r31 C nop
! 269: srl acc1,NUMB_BITS, r15
! 270: stq r28, 16(rp)
! 271: and acc1,numb_mask, r28
! 272: addq m3b, r15, acc0 C last high product half + last carry
! 273: stq r28, 24(rp)
! 274: and acc0,numb_mask, r0 C return value, masked with numb_mask
! 275:
! 276: br r31, Lret
! 277: C three or more groups: same schedule as L_8_to_11 plus loads of the following group from up
! 278: L_12_or_more:
! 279: mulq vl0, ul0, m0a C U1
! 280: addq r8, m0b, acc1
! 281: srl acc0,NUMB_BITS, r15
! 282: umulh vl0, ul0, m0b C U1
! 283: ldq ul0, 0(up)
! 284: addq rl1, acc1, acc1
! 285: and acc0,numb_mask, r28
! 286: srl m2a,NAIL_BITS, r8
! 287: ldq rl1, 8(rp)
! 288: addq r15, acc1, acc1
! 289: bis r31, r31, r31 C nop
! 290: mulq vl0, ul1, m1a C U1
! 291: addq r8, m1b, acc0
! 292: srl acc1,NUMB_BITS, r15
! 293: stq r28, -32(rp)
! 294: umulh vl0, ul1, m1b C U1
! 295: ldq ul1, 8(up)
! 296: addq rl2, acc0, acc0
! 297: and acc1,numb_mask, r28
! 298: srl m3a,NAIL_BITS, r8
! 299: ldq rl2, 16(rp)
! 300: addq r15, acc0, acc0
! 301: bis r31, r31, r31 C nop
! 302: mulq vl0, ul2, m2a C U1
! 303: addq r8, m2b, acc1
! 304: srl acc0,NUMB_BITS, r15
! 305: stq r28, -24(rp)
! 306: umulh vl0, ul2, m2b C U1
! 307: ldq ul2, 16(up)
! 308: addq rl3, acc1, acc1
! 309: and acc0,numb_mask, r28
! 310: srl m0a,NAIL_BITS, r8
! 311: ldq rl3, 24(rp)
! 312: addq r15, acc1, acc1
! 313: bis r31, r31, r31 C nop
! 314: mulq vl0, ul3, m3a C U1
! 315: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 316: srl acc1,NUMB_BITS, r15
! 317: stq r28, -16(rp)
! 318: umulh vl0, ul3, m3b C U1
! 319: ldq ul3, 24(up)
! 320: addq rl0, acc0, acc0
! 321: and acc1,numb_mask, r28 C this result limb stays in r28 and is stored at -40(rp) after rp advances
! 322: srl m1a,NAIL_BITS, r8
! 323: ldq rl0, 32(rp)
! 324: addq r15, acc0, acc0
! 325: bis r31, r31, r31 C nop
! 326: bis r31, r31, r31 C nop
! 327: bis r31, r31, r31 C nop
! 328: bis r31, r31, r31 C nop
! 329: bis r31, r31, r31 C nop
! 330: lda n, -4(n)
! 331: lda up, 32(up)
! 332: lda rp, 32(rp)
! 333: blt n, L_end C U0
! 334:
! 335: Loop:
! 336: C per pass: 4 stores to rp, 4 mulq/umulh pairs, 4 loads from up, 4 loads from rp
! 337: mulq vl0, ul0, m0a C U1
! 338: addq r8, m0b, acc1
! 339: srl acc0,NUMB_BITS, r15
! 340: stq r28, -40(rp) C result limb left pending in r28 before rp was advanced by 32
! 341: C
! 342: umulh vl0, ul0, m0b C U1
! 343: ldq ul0, 0(up)
! 344: addq rl1, acc1, acc1
! 345: and acc0,numb_mask, r28
! 346: C
! 347: srl m2a,NAIL_BITS, r8
! 348: ldq rl1, 8(rp)
! 349: addq r15, acc1, acc1
! 350: bis r31, r31, r31 C nop
! 351: C
! 352: mulq vl0, ul1, m1a C U1
! 353: addq r8, m1b, acc0
! 354: srl acc1,NUMB_BITS, r15
! 355: stq r28, -32(rp)
! 356: C
! 357: umulh vl0, ul1, m1b C U1
! 358: ldq ul1, 8(up)
! 359: addq rl2, acc0, acc0
! 360: and acc1,numb_mask, r28
! 361: C
! 362: srl m3a,NAIL_BITS, r8
! 363: ldq rl2, 16(rp)
! 364: addq r15, acc0, acc0
! 365: bis r31, r31, r31 C nop
! 366: C
! 367: mulq vl0, ul2, m2a C U1
! 368: addq r8, m2b, acc1
! 369: srl acc0,NUMB_BITS, r15
! 370: stq r28, -24(rp)
! 371: C
! 372: umulh vl0, ul2, m2b C U1
! 373: ldq ul2, 16(up)
! 374: addq rl3, acc1, acc1
! 375: and acc0,numb_mask, r28
! 376: C
! 377: srl m0a,NAIL_BITS, r8
! 378: ldq rl3, 24(rp)
! 379: addq r15, acc1, acc1
! 380: bis r31, r31, r31 C nop
! 381: C
! 382: mulq vl0, ul3, m3a C U1
! 383: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 384: srl acc1,NUMB_BITS, r15
! 385: stq r28, -16(rp)
! 386: C
! 387: umulh vl0, ul3, m3b C U1
! 388: ldq ul3, 24(up)
! 389: addq rl0, acc0, acc0
! 390: and acc1,numb_mask, r28
! 391: C
! 392: srl m1a,NAIL_BITS, r8
! 393: ldq rl0, 32(rp)
! 394: addq r15, acc0, acc0
! 395: bis r31, r31, r31 C nop
! 396: C
! 397: bis r31, r31, r31 C nop
! 398: bis r31, r31, r31 C nop
! 399: bis r31, r31, r31 C nop
! 400: bis r31, r31, r31 C nop
! 401: C
! 402: lda n, -4(n)
! 403: lda up, 32(up)
! 404: lda rp, 32(rp)
! 405: bge n, Loop C U0
! 406:
! 407: L_end: C drain: multiply the group already loaded, with no further loads from up
! 408: mulq vl0, ul0, m0a C U1
! 409: addq r8, m0b, acc1
! 410: srl acc0,NUMB_BITS, r15
! 411: stq r28, -40(rp)
! 412: umulh vl0, ul0, m0b C U1
! 413: addq rl1, acc1, acc1
! 414: and acc0,numb_mask, r28
! 415: srl m2a,NAIL_BITS, r8
! 416: ldq rl1, 8(rp)
! 417: addq r15, acc1, acc1
! 418: bis r31, r31, r31 C nop
! 419: mulq vl0, ul1, m1a C U1
! 420: addq r8, m1b, acc0
! 421: srl acc1,NUMB_BITS, r15
! 422: stq r28, -32(rp)
! 423: umulh vl0, ul1, m1b C U1
! 424: addq rl2, acc0, acc0
! 425: and acc1,numb_mask, r28
! 426: srl m3a,NAIL_BITS, r8
! 427: ldq rl2, 16(rp)
! 428: addq r15, acc0, acc0
! 429: bis r31, r31, r31 C nop
! 430: mulq vl0, ul2, m2a C U1
! 431: addq r8, m2b, acc1
! 432: srl acc0,NUMB_BITS, r15
! 433: stq r28, -24(rp)
! 434: umulh vl0, ul2, m2b C U1
! 435: addq rl3, acc1, acc1
! 436: and acc0,numb_mask, r28
! 437: srl m0a,NAIL_BITS, r8
! 438: ldq rl3, 24(rp)
! 439: addq r15, acc1, acc1
! 440: bis r31, r31, r31 C nop
! 441: mulq vl0, ul3, m3a C U1
! 442: addq r8, m3b, acc0 C reads the old m3b before the umulh below overwrites it
! 443: srl acc1,NUMB_BITS, r15
! 444: stq r28, -16(rp)
! 445: umulh vl0, ul3, m3b C U1
! 446: addq rl0, acc0, acc0
! 447: and acc1,numb_mask, r28
! 448: srl m1a,NAIL_BITS, r8
! 449: addq r15, acc0, acc0
! 450: bis r31, r31, r31 C nop
! 451: bis r31, r31, r31 C nop
! 452: bis r31, r31, r31 C nop
! 453: bis r31, r31, r31 C nop
! 454: bis r31, r31, r31 C nop
! 455: lda rp, 32(rp)
! 456: C no multiplies past this point; accumulate and store the final group (offsets are relative to the advanced rp)
! 457: addq r8, m0b, acc1
! 458: srl acc0,NUMB_BITS, r15
! 459: stq r28, -40(rp)
! 460: addq rl1, acc1, acc1
! 461: and acc0,numb_mask, r28
! 462: srl m2a,NAIL_BITS, r8
! 463: addq r15, acc1, acc1
! 464: bis r31, r31, r31 C nop
! 465: addq r8, m1b, acc0
! 466: srl acc1,NUMB_BITS, r15
! 467: stq r28, -32(rp)
! 468: addq rl2, acc0, acc0
! 469: and acc1,numb_mask, r28
! 470: srl m3a,NAIL_BITS, r8
! 471: addq r15, acc0, acc0
! 472: bis r31, r31, r31 C nop
! 473: addq r8, m2b, acc1
! 474: srl acc0,NUMB_BITS, r15
! 475: stq r28, -24(rp)
! 476: addq rl3, acc1, acc1
! 477: and acc0,numb_mask, r28
! 478: addq r15, acc1, acc1
! 479: bis r31, r31, r31 C nop
! 480: srl acc1,NUMB_BITS, r15
! 481: stq r28, -16(rp)
! 482: and acc1,numb_mask, r28
! 483: addq m3b, r15, acc0 C last high product half + last carry
! 484: stq r28, -8(rp)
! 485: and acc0,numb_mask, r0 C return value, masked with numb_mask
! 486: Lret:
! 487: ldq r9, 8(r30) C restore the registers saved in the prologue
! 488: C ldq r10, 16(r30)
! 489: C ldq r11, 24(r30)
! 490: ldq r12, 32(r30)
! 491: ldq r13, 40(r30)
! 492: ldq r14, 48(r30)
! 493: ldq r15, 56(r30)
! 494: lda r30, 240(r30) C release the 240-byte stack frame
! 495: ret r31, (r26), 1
! 496: EPILOGUE(mpn_addmul_1)
! 497: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>