Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/mul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 nails mpn_mul_1.
! 2:
! 3: dnl Copyright 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24: dnl INPUT PARAMETERS: rp = where result limbs are stored, up = where source limbs are loaded from, n = limb count, vl0 = multiplier limb
! 25: define(`rp',`r16')
! 26: define(`up',`r17')
! 27: define(`n',`r18')
! 28: define(`vl0',`r19')
! 29: dnl Mask applied to every limb before it is stored; the prologue builds it as all ones shifted right by NAIL_BITS.
! 30: define(`numb_mask',`r24')
! 31: dnl mNa / mNb receive the mulq (low half) and umulh (high half) results for limb N of a group of four.
! 32: define(`m0a',`r0')
! 33: define(`m0b',`r1')
! 34: define(`m1a',`r2')
! 35: define(`m1b',`r3')
! 36: define(`m2a',`r20')
! 37: define(`m2b',`r21')
! 38: define(`m3a',`r22')
! 39: define(`m3b',`r23')
! 40: dnl Two accumulators used alternately for successive result limbs.
! 41: define(`acc0',`r27')
! 42: define(`acc1',`r25')
! 43: dnl The four source limbs of the group currently being multiplied.
! 44: define(`ul0',`r4')
! 45: define(`ul1',`r5')
! 46: define(`ul2',`r6')
! 47: define(`ul3',`r7')
! 48:
! 49: C unused scratch: none (r8 and r28 are used directly as temporaries, the rest through the aliases above)
! 50: C unused saved r9 r10 r11 r12 r13 r14 (only r15 is written by this routine)
! 51:
! 52: define(`NAIL_BITS',`GMP_NAIL_BITS')
! 53: define(`NUMB_BITS',`GMP_NUMB_BITS')
! 54:
! 55: dnl This declaration is munged by configure
! 56: NAILS_SUPPORT(1-63)
! 57: dnl mpn_mul_1: multiply the n limbs at up by vl0, store each result limb (masked with numb_mask) at rp, and leave the final carry limb in r0.
! 58: dnl Runs at 3.5 cycles/limb. Naively made from addmul_1.asm. A better
! 59: dnl implementation could bring speed to 2.75 cycles/limb.
! 60:
! 61: dnl Register usage:
! 62: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
! 63: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
! 64: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
! 65: dnl return address: 26
! 66: dnl global pointer: 29
! 67: dnl stack pointer: 30
! 68: dnl Of the callee-saves only r15 is written (it carries the overflow from one limb to the next), so it is the only one saved and restored.
! 69: ASM_START()
! 70: PROLOGUE(mpn_mul_1)
! 71: lda r30, -240(r30) C allocate a 240-byte stack frame
! 72: C stq r9, 8(r30)
! 73: C stq r10, 16(r30)
! 74: C stq r11, 24(r30)
! 75: C stq r12, 32(r30)
! 76: C stq r13, 40(r30)
! 77: C stq r14, 48(r30)
! 78: stq r15, 56(r30) C save r15; it is used below as the carry between limbs
! 79: C Pre-shift the multiplier: mulq >> NAIL_BITS then gives the low part of each product and umulh the part above it (assumes NUMB_BITS + NAIL_BITS = 64 -- TODO confirm).
! 80: sll vl0, NAIL_BITS, vl0 C vl0 <<= NAIL_BITS
! 81: lda numb_mask, -1(r31) C numb_mask = all ones (r31 reads as zero)
! 82: srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits of the mask
! 83:
! 84: bic r31, r31, r15 C r15 = 0: no carry yet
! 85: bic r31, r31, m3b C m3b = 0: no high product part pending
! 86:
! 87: and n, 3, r25 C r25 = n mod 4 = number of limbs handled one at a time
! 88: beq r25, L4 C none: go straight to the groups of four
! 89: Loop0: C one limb per pass, r25 passes
! 90: ldq ul0, 0(up)
! 91: mulq vl0, ul0, m0a C U1
! 92: srl m0a,NAIL_BITS, r8 C r8 = low part of this product
! 93: addq r8, m3b, acc0 C add the high part of the previous product
! 94: addq r15, acc0, acc0 C add the carry from the previous limb
! 95: umulh vl0, ul0, m3b C U1
! 96: srl acc0,NUMB_BITS, r15 C new carry = bits of the sum above NUMB_BITS
! 97: and acc0,numb_mask, r28 C result limb = sum masked with numb_mask
! 98: stq r28, 0(rp)
! 99: lda rp, 8(rp)
! 100: lda up, 8(up)
! 101: lda r25, -1(r25)
! 102: bne r25, Loop0
! 103:
! 104: L4:
! 105: lda n, -4(n) C n drops by 4 per group; its sign tells whether another full group of four remains
! 106: bge n, L_4_or_more
! 107: L_0_to_3: C no full group: everything was done in Loop0 (or n was 0)
! 108: addq m3b, r15, r0 C r0 = last high part + carry (not masked on this path)
! 109: br r31, Lret
! 110:
! 111: L_4_or_more: C load the first group of four source limbs
! 112: ldq ul0, 0(up)
! 113: ldq ul1, 8(up)
! 114: ldq ul2, 16(up)
! 115: ldq ul3, 24(up)
! 116: lda n, -4(n)
! 117: lda up, 32(up)
! 118: bge n, L_8_or_more
! 119: L_4_to_8: C reached with 4 to 7 limbs in total: exactly one group of four to multiply, accumulate and store
! 120: mulq vl0, ul0, m0a C U1
! 121: umulh vl0, ul0, m0b C U1
! 122: mulq vl0, ul1, m1a C U1
! 123: umulh vl0, ul1, m1b C U1
! 124: mulq vl0, ul2, m2a C U1
! 125: umulh vl0, ul2, m2b C U1
! 126: srl m0a,NAIL_BITS, r8
! 127: mulq vl0, ul3, m3a C U1
! 128: addq r8, m3b, acc0 C m3b still holds the high part left by Loop0 (or 0)
! 129: umulh vl0, ul3, m3b C U1
! 130: srl m1a,NAIL_BITS, r8
! 131: addq r15, acc0, acc0
! 132:
! 133: addq r8, m0b, acc1
! 134: srl acc0,NUMB_BITS, r15
! 135: and acc0,numb_mask, r28
! 136: srl m2a,NAIL_BITS, r8
! 137: addq r15, acc1, acc1
! 138: bis r31, r31, r31 C nop
! 139: addq r8, m1b, acc0
! 140: srl acc1,NUMB_BITS, r15
! 141: stq r28, 0(rp)
! 142: and acc1,numb_mask, r28
! 143: srl m3a,NAIL_BITS, r8
! 144: addq r15, acc0, acc0
! 145: bis r31, r31, r31 C nop
! 146: addq r8, m2b, acc1
! 147: srl acc0,NUMB_BITS, r15
! 148: stq r28, 8(rp)
! 149: and acc0,numb_mask, r28
! 150: addq r15, acc1, acc1
! 151: bis r31, r31, r31 C nop
! 152: srl acc1,NUMB_BITS, r15
! 153: stq r28, 16(rp)
! 154: and acc1,numb_mask, r28
! 155: addq m3b, r15, acc0 C final high part + carry
! 156: stq r28, 24(rp)
! 157: and acc0,numb_mask, r0 C r0 = final limb, masked
! 158:
! 159: br r31, Lret
! 160:
! 161: L_8_or_more: C start the first group's multiplies while loading the second group
! 162: mulq vl0, ul0, m0a C U1
! 163: umulh vl0, ul0, m0b C U1
! 164: ldq ul0, 0(up)
! 165: mulq vl0, ul1, m1a C U1
! 166: umulh vl0, ul1, m1b C U1
! 167: ldq ul1, 8(up)
! 168: mulq vl0, ul2, m2a C U1
! 169: umulh vl0, ul2, m2b C U1
! 170: ldq ul2, 16(up)
! 171: srl m0a,NAIL_BITS, r8
! 172: mulq vl0, ul3, m3a C U1
! 173: addq r8, m3b, acc0
! 174: umulh vl0, ul3, m3b C U1
! 175: ldq ul3, 24(up)
! 176: srl m1a,NAIL_BITS, r8
! 177: addq r15, acc0, acc0
! 178: lda n, -4(n)
! 179: lda up, 32(up)
! 180: lda rp, 32(rp) C rp now runs one group ahead, hence the negative store offsets below
! 181: bge n, L_12_or_more C U0
! 182: L_8_to_11: C exactly two groups: finish the first, then do the second with no further loads
! 183: mulq vl0, ul0, m0a C U1
! 184: addq r8, m0b, acc1
! 185: srl acc0,NUMB_BITS, r15
! 186: umulh vl0, ul0, m0b C U1
! 187: and acc0,numb_mask, r28
! 188: srl m2a,NAIL_BITS, r8
! 189: addq r15, acc1, acc1
! 190: bis r31, r31, r31 C nop
! 191: mulq vl0, ul1, m1a C U1
! 192: addq r8, m1b, acc0
! 193: srl acc1,NUMB_BITS, r15
! 194: stq r28, -32(rp)
! 195: umulh vl0, ul1, m1b C U1
! 196: and acc1,numb_mask, r28
! 197: srl m3a,NAIL_BITS, r8
! 198: addq r15, acc0, acc0
! 199: bis r31, r31, r31 C nop
! 200: mulq vl0, ul2, m2a C U1
! 201: addq r8, m2b, acc1
! 202: srl acc0,NUMB_BITS, r15
! 203: stq r28, -24(rp)
! 204: umulh vl0, ul2, m2b C U1
! 205: and acc0,numb_mask, r28
! 206: srl m0a,NAIL_BITS, r8
! 207: addq r15, acc1, acc1
! 208: bis r31, r31, r31 C nop
! 209: mulq vl0, ul3, m3a C U1
! 210: addq r8, m3b, acc0
! 211: srl acc1,NUMB_BITS, r15
! 212: stq r28, -16(rp)
! 213: umulh vl0, ul3, m3b C U1
! 214: and acc1,numb_mask, r28
! 215: srl m1a,NAIL_BITS, r8
! 216: addq r15, acc0, acc0
! 217: C Second (last) group: only accumulate and store from here on.
! 218: addq r8, m0b, acc1
! 219: srl acc0,NUMB_BITS, r15
! 220: stq r28, -8(rp)
! 221: and acc0,numb_mask, r28
! 222: srl m2a,NAIL_BITS, r8
! 223: addq r15, acc1, acc1
! 224: bis r31, r31, r31 C nop
! 225: addq r8, m1b, acc0
! 226: srl acc1,NUMB_BITS, r15
! 227: stq r28, 0(rp)
! 228: and acc1,numb_mask, r28
! 229: srl m3a,NAIL_BITS, r8
! 230: addq r15, acc0, acc0
! 231: bis r31, r31, r31 C nop
! 232: addq r8, m2b, acc1
! 233: srl acc0,NUMB_BITS, r15
! 234: stq r28, 8(rp)
! 235: and acc0,numb_mask, r28
! 236: addq r15, acc1, acc1
! 237: bis r31, r31, r31 C nop
! 238: srl acc1,NUMB_BITS, r15
! 239: stq r28, 16(rp)
! 240: and acc1,numb_mask, r28
! 241: addq m3b, r15, acc0 C final high part + carry
! 242: stq r28, 24(rp)
! 243: and acc0,numb_mask, r0 C r0 = final limb, masked
! 244:
! 245: br r31, Lret
! 246:
! 247: L_12_or_more: C finish the first group, multiply the second, load the third
! 248: mulq vl0, ul0, m0a C U1
! 249: addq r8, m0b, acc1
! 250: srl acc0,NUMB_BITS, r15
! 251: umulh vl0, ul0, m0b C U1
! 252: ldq ul0, 0(up)
! 253: and acc0,numb_mask, r28
! 254: srl m2a,NAIL_BITS, r8
! 255: addq r15, acc1, acc1
! 256: bis r31, r31, r31 C nop
! 257: mulq vl0, ul1, m1a C U1
! 258: addq r8, m1b, acc0
! 259: srl acc1,NUMB_BITS, r15
! 260: stq r28, -32(rp)
! 261: umulh vl0, ul1, m1b C U1
! 262: ldq ul1, 8(up)
! 263: and acc1,numb_mask, r28
! 264: srl m3a,NAIL_BITS, r8
! 265: addq r15, acc0, acc0
! 266: bis r31, r31, r31 C nop
! 267: mulq vl0, ul2, m2a C U1
! 268: addq r8, m2b, acc1
! 269: srl acc0,NUMB_BITS, r15
! 270: stq r28, -24(rp)
! 271: umulh vl0, ul2, m2b C U1
! 272: ldq ul2, 16(up)
! 273: and acc0,numb_mask, r28
! 274: srl m0a,NAIL_BITS, r8
! 275: addq r15, acc1, acc1
! 276: bis r31, r31, r31 C nop
! 277: mulq vl0, ul3, m3a C U1
! 278: addq r8, m3b, acc0
! 279: srl acc1,NUMB_BITS, r15
! 280: stq r28, -16(rp)
! 281: umulh vl0, ul3, m3b C U1
! 282: ldq ul3, 24(up)
! 283: and acc1,numb_mask, r28 C fourth limb of the group stays in r28; it is stored at -40(rp) after rp advances
! 284: srl m1a,NAIL_BITS, r8
! 285: addq r15, acc0, acc0
! 286: bis r31, r31, r31 C nop
! 287: bis r31, r31, r31 C nop
! 288: bis r31, r31, r31 C nop
! 289: bis r31, r31, r31 C nop
! 290: bis r31, r31, r31 C nop
! 291: lda n, -4(n)
! 292: lda up, 32(up)
! 293: lda rp, 32(rp)
! 294: blt n, L_end C U0
! 295: C Steady state: each pass multiplies the four limbs already loaded, loads the next four, and stores four result limbs.
! 296: Loop:
! 297: C The bare C lines below split the loop into groups of four instructions (presumably one ev6 issue group each -- TODO confirm).
! 298: mulq vl0, ul0, m0a C U1
! 299: addq r8, m0b, acc1
! 300: srl acc0,NUMB_BITS, r15
! 301: stq r28, -40(rp) C store the result limb left pending by the previous pass
! 302: C
! 303: umulh vl0, ul0, m0b C U1
! 304: ldq ul0, 0(up)
! 305: bis r31, r31, r31 C nop
! 306: and acc0,numb_mask, r28
! 307: C
! 308: srl m2a,NAIL_BITS, r8
! 309: bis r31, r31, r31 C nop
! 310: addq r15, acc1, acc1
! 311: bis r31, r31, r31 C nop
! 312: C
! 313: mulq vl0, ul1, m1a C U1
! 314: addq r8, m1b, acc0
! 315: srl acc1,NUMB_BITS, r15
! 316: stq r28, -32(rp)
! 317: C
! 318: umulh vl0, ul1, m1b C U1
! 319: ldq ul1, 8(up)
! 320: bis r31, r31, r31 C nop
! 321: and acc1,numb_mask, r28
! 322: C
! 323: srl m3a,NAIL_BITS, r8
! 324: bis r31, r31, r31 C nop
! 325: addq r15, acc0, acc0
! 326: bis r31, r31, r31 C nop
! 327: C
! 328: mulq vl0, ul2, m2a C U1
! 329: addq r8, m2b, acc1
! 330: srl acc0,NUMB_BITS, r15
! 331: stq r28, -24(rp)
! 332: C
! 333: umulh vl0, ul2, m2b C U1
! 334: ldq ul2, 16(up)
! 335: bis r31, r31, r31 C nop
! 336: and acc0,numb_mask, r28
! 337: C
! 338: srl m0a,NAIL_BITS, r8
! 339: bis r31, r31, r31 C nop
! 340: addq r15, acc1, acc1
! 341: bis r31, r31, r31 C nop
! 342: C
! 343: mulq vl0, ul3, m3a C U1
! 344: addq r8, m3b, acc0
! 345: srl acc1,NUMB_BITS, r15
! 346: stq r28, -16(rp)
! 347: C
! 348: umulh vl0, ul3, m3b C U1
! 349: ldq ul3, 24(up)
! 350: bis r31, r31, r31 C nop
! 351: and acc1,numb_mask, r28
! 352: C
! 353: srl m1a,NAIL_BITS, r8
! 354: bis r31, r31, r31 C nop
! 355: addq r15, acc0, acc0
! 356: bis r31, r31, r31 C nop
! 357: C
! 358: lda n, -4(n)
! 359: lda up, 32(up)
! 360: lda rp, 32(rp)
! 361: bge n, Loop C U0
! 362:
! 363: L_end: C no more loads: multiply the last loaded group, then drain the pipeline
! 364: mulq vl0, ul0, m0a C U1
! 365: addq r8, m0b, acc1
! 366: srl acc0,NUMB_BITS, r15
! 367: stq r28, -40(rp)
! 368: umulh vl0, ul0, m0b C U1
! 369: and acc0,numb_mask, r28
! 370: srl m2a,NAIL_BITS, r8
! 371: addq r15, acc1, acc1
! 372: bis r31, r31, r31 C nop
! 373: mulq vl0, ul1, m1a C U1
! 374: addq r8, m1b, acc0
! 375: srl acc1,NUMB_BITS, r15
! 376: stq r28, -32(rp)
! 377: umulh vl0, ul1, m1b C U1
! 378: and acc1,numb_mask, r28
! 379: srl m3a,NAIL_BITS, r8
! 380: addq r15, acc0, acc0
! 381: bis r31, r31, r31 C nop
! 382: mulq vl0, ul2, m2a C U1
! 383: addq r8, m2b, acc1
! 384: srl acc0,NUMB_BITS, r15
! 385: stq r28, -24(rp)
! 386: umulh vl0, ul2, m2b C U1
! 387: and acc0,numb_mask, r28
! 388: srl m0a,NAIL_BITS, r8
! 389: addq r15, acc1, acc1
! 390: bis r31, r31, r31 C nop
! 391: mulq vl0, ul3, m3a C U1
! 392: addq r8, m3b, acc0
! 393: srl acc1,NUMB_BITS, r15
! 394: stq r28, -16(rp)
! 395: umulh vl0, ul3, m3b C U1
! 396: and acc1,numb_mask, r28
! 397: srl m1a,NAIL_BITS, r8
! 398: addq r15, acc0, acc0
! 399: bis r31, r31, r31 C nop
! 400: bis r31, r31, r31 C nop
! 401: bis r31, r31, r31 C nop
! 402: bis r31, r31, r31 C nop
! 403: bis r31, r31, r31 C nop
! 404: lda rp, 32(rp)
! 405: C Drain: accumulate and store the last four products; no multiplies remain.
! 406: addq r8, m0b, acc1
! 407: srl acc0,NUMB_BITS, r15
! 408: stq r28, -40(rp)
! 409: and acc0,numb_mask, r28
! 410: srl m2a,NAIL_BITS, r8
! 411: addq r15, acc1, acc1
! 412: bis r31, r31, r31 C nop
! 413: addq r8, m1b, acc0
! 414: srl acc1,NUMB_BITS, r15
! 415: stq r28, -32(rp)
! 416: and acc1,numb_mask, r28
! 417: srl m3a,NAIL_BITS, r8
! 418: addq r15, acc0, acc0
! 419: bis r31, r31, r31 C nop
! 420: addq r8, m2b, acc1
! 421: srl acc0,NUMB_BITS, r15
! 422: stq r28, -24(rp)
! 423: and acc0,numb_mask, r28
! 424: addq r15, acc1, acc1
! 425: bis r31, r31, r31 C nop
! 426: srl acc1,NUMB_BITS, r15
! 427: stq r28, -16(rp)
! 428: and acc1,numb_mask, r28
! 429: addq m3b, r15, acc0 C final high part + carry
! 430: stq r28, -8(rp)
! 431: and acc0,numb_mask, r0 C r0 = final limb, masked
! 432: Lret: C common exit for every path
! 433: C ldq r9, 8(r30)
! 434: C ldq r10, 16(r30)
! 435: C ldq r11, 24(r30)
! 436: C ldq r12, 32(r30)
! 437: C ldq r13, 40(r30)
! 438: C ldq r14, 48(r30)
! 439: ldq r15, 56(r30) C restore the r15 value saved in the prologue
! 440: lda r30, 240(r30) C release the 240-byte stack frame
! 441: ret r31, (r26), 1
! 442: EPILOGUE(mpn_mul_1)
! 443: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>