Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_4.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 nails mpn_addmul_4.
! 2:
! 3: dnl Copyright 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24: dnl INPUT PARAMETERS (rp: limbs read and updated in place; up: limbs read, one per loop pass; n: loop count; vp: four limbs read once at entry)
! 25: define(`rp',`r16')
! 26: define(`up',`r17')
! 27: define(`n',`r18')
! 28: define(`vp',`r19')
! 29:
! 30: dnl Useful register aliases (numb_mask: all-ones shifted right by NAIL_BITS; ulimb: limb loaded from up; rlimb: limb loaded from rp)
! 31: define(`numb_mask',`r24')
! 32: define(`ulimb',`r25')
! 33: define(`rlimb',`r27')
! 34: dnl m<i>a / m<i>b receive the mulq / umulh results of v<i> times ulimb
! 35: define(`m0a',`r0')
! 36: define(`m0b',`r1')
! 37: define(`m1a',`r2')
! 38: define(`m1b',`r3')
! 39: define(`m2a',`r20')
! 40: define(`m2b',`r21')
! 41: define(`m3a',`r12')
! 42: define(`m3b',`r13')
! 43: dnl acc0-acc3 hold the partial sums carried from one limb position to the next
! 44: define(`acc0',`r4')
! 45: define(`acc1',`r5')
! 46: define(`acc2',`r22')
! 47: define(`acc3',`r14')
! 48: dnl v0-v3 hold the four limbs loaded from vp, shifted left by NAIL_BITS at entry
! 49: define(`v0',`r6')
! 50: define(`v1',`r7')
! 51: define(`v2',`r23')
! 52: define(`v3',`r15')
! 53: dnl Note: m3a, m3b, acc3 and v3 live in r12-r15, which are callee-saved and are spilled/restored by the routine
! 54: dnl Used for temps: r8 r19 r28 (r19 is also vp on entry; it is zeroed and reused as the nail/carry temp once v0-v3 have been loaded)
! 55:
! 56: define(`NAIL_BITS',`GMP_NAIL_BITS')
! 57: define(`NUMB_BITS',`GMP_NUMB_BITS')
! 58:
! 59: dnl This declaration is munged by configure
! 60: NAILS_SUPPORT(4-63)
! 61:
! 62: dnl Runs at 2.5 cycles/limb. With unrolling, the ulimb load and the 3
! 63: dnl bookkeeping increments and the `bis' that copies from r23 to r7 could be
! 64: dnl removed and the instruction count reduced from 31 to 26. We could
! 65: dnl thereby surely reach 2 cycles/limb, the IMUL bandwidth.
! 66:
! 67: dnl If this is going to be a Karatsuba basecase building block, we need some
! 68: dnl of the combinations below. That way, we won't ever hit the
! 69: dnl slower mpn_addmul_1 for any huge multiplication.
! 70: dnl
! 71: dnl Alt 3 Alt 4 Alt 5 Alt 6
! 72: dnl addmul_2 addmul_2 addmul_3 addmul_3
! 73: dnl addmul_3 addmul_3 addmul_4 addmul_4
! 74: dnl addmul_4 addmul_5 addmul_5
! 75: dnl addmul_6
! 76:
! 77: dnl Register usage:
! 78: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
! 79: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
! 80: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
! 81: dnl return address: 26
! 82: dnl global pointer: 29
! 83: dnl stack pointer: 30
! 84:
! 85: ASM_START()
! 86: PROLOGUE(mpn_addmul_4) C adds up[0..n-1] times the four limbs at vp into rp[0..n-1], writes three more limbs at rp[n..n+2], leaves the top limb in r0
! 87: lda r30, -240(r30) C allocate a 240-byte stack frame
! 88: stq r12, 32(r30) C save callee-saved r12 (m3a)
! 89: stq r13, 40(r30) C save callee-saved r13 (m3b)
! 90: stq r14, 48(r30) C save callee-saved r14 (acc3)
! 91: stq r15, 56(r30) C save callee-saved r15 (v3)
! 92:
! 93: lda numb_mask,-1(r31) C numb_mask = all ones
! 94: srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits, leaving the numb mask
! 95:
! 96: ldq v0, 0(vp) C v0 = vp[0]
! 97: ldq v1, 8(vp) C v1 = vp[1]
! 98: ldq v2, 16(vp) C v2 = vp[2]
! 99: ldq v3, 24(vp) C v3 = vp[3], last use of vp
! 100:
! 101: bis r31, r31, acc0 C zero acc0
! 102: sll v0,NAIL_BITS, v0 C pre-shift v0 so that mulq>>NAIL_BITS is the low numb part of the product and umulh the part above it (assumes NAIL_BITS+NUMB_BITS = 64)
! 103: bis r31, r31, acc1 C zero acc1
! 104: sll v1,NAIL_BITS, v1 C same pre-shift for v1
! 105: bis r31, r31, acc2 C zero acc2
! 106: sll v2,NAIL_BITS, v2 C same pre-shift for v2
! 107: bis r31, r31, acc3 C zero acc3
! 108: sll v3,NAIL_BITS, v3 C same pre-shift for v3
! 109: bis r31, r31, r19 C zero r19 (was vp); from here r19 is the nail/carry temp
! 110:
! 111: C MAIN LOOP
! 112: ldq ulimb, 0(up) C load the first up limb
! 113: lda up, 8(up) C advance up
! 114: mulq v0, ulimb, m0a C U1
! 115: umulh v0, ulimb, m0b C U1
! 116: mulq v1, ulimb, m1a C U1
! 117: umulh v1, ulimb, m1b C U1
! 118: lda n, -1(n) C n = n - 1
! 119: mulq v2, ulimb, m2a C U1
! 120: umulh v2, ulimb, m2b C U1
! 121: mulq v3, ulimb, m3a C U1
! 122: umulh v3, ulimb, m3b C U1
! 123: beq n, Lend C U0
! 124: ALIGN(16)
! 125: Loop: C each pass consumes the products of the previous up limb while issuing those of the next one
! 126: bis r31, r31, r31 C nop
! 127: ldq rlimb, 0(rp) C rlimb = rp limb to be updated
! 128: ldq ulimb, 0(up) C next up limb, used by the multiplies below
! 129: addq r19, acc0, acc0 C propagate nail
! 130:
! 131: lda rp, 8(rp) C advance rp; the limb is stored through -8(rp) below
! 132: srl m0a,NAIL_BITS, r8 C U0
! 133: lda up, 8(up) C advance up
! 134: mulq v0, ulimb, m0a C U1
! 135:
! 136: addq r8, acc0, r19 C r19 = low part of previous v0 product + acc0
! 137: addq m0b, acc1, acc0 C new acc0 = high part of previous v0 product + old acc1
! 138: umulh v0, ulimb, m0b C U1
! 139: bis r31, r31, r31 C nop
! 140:
! 141: addq rlimb, r19, r19 C add in the rp limb
! 142: srl m1a,NAIL_BITS, r8 C U0
! 143: bis r31, r31, r31 C nop
! 144: mulq v1, ulimb, m1a C U1
! 145:
! 146: addq r8, acc0, acc0 C acc0 += low part of previous v1 product
! 147: addq m1b, acc2, acc1 C new acc1 = high part of previous v1 product + old acc2
! 148: umulh v1, ulimb, m1b C U1
! 149: and r19,numb_mask, r28 C extract numb part
! 150:
! 151: bis r31, r31, r31 C nop
! 152: srl m2a,NAIL_BITS, r8 C U0
! 153: lda n, -1(n) C n = n - 1
! 154: mulq v2, ulimb, m2a C U1
! 155:
! 156: addq r8, acc1, acc1 C acc1 += low part of previous v2 product
! 157: addq m2b, acc3, acc2 C new acc2 = high part of previous v2 product + old acc3
! 158: umulh v2, ulimb, m2b C U1
! 159: srl r19,NUMB_BITS, r19 C extract nail part
! 160:
! 161: bis r31, r31, r31 C nop
! 162: srl m3a,NAIL_BITS, r8 C U0
! 163: stq r28, -8(rp) C store the finished limb (rp was already advanced)
! 164: mulq v3, ulimb, m3a C U1
! 165:
! 166: addq r8, acc2, acc2 C acc2 += low part of previous v3 product
! 167: bis r31, m3b, acc3 C new acc3 = high part of previous v3 product
! 168: umulh v3, ulimb, m3b C U1
! 169: bne n, Loop C U0
! 170: C END LOOP
! 171: Lend: C wind-down: same accumulation as the loop body for the last set of products, without loads from up or new multiplies
! 172: ldq rlimb, 0(rp) C rlimb = last rp limb to be updated
! 173: addq r19, acc0, acc0 C propagate nail
! 174: lda rp, 8(rp) C advance rp; the limb is stored through -8(rp) below
! 175: srl m0a,NAIL_BITS, r8 C U0
! 176: addq r8, acc0, r19 C r19 = low part of v0 product + acc0
! 177: addq m0b, acc1, acc0 C new acc0 = high part of v0 product + old acc1
! 178: addq rlimb, r19, r19 C add in the rp limb
! 179: srl m1a,NAIL_BITS, r8 C U0
! 180: addq r8, acc0, acc0 C acc0 += low part of v1 product
! 181: addq m1b, acc2, acc1 C new acc1 = high part of v1 product + old acc2
! 182: and r19,numb_mask, r28 C extract limb
! 183: srl m2a,NAIL_BITS, r8 C U0
! 184: addq r8, acc1, acc1 C acc1 += low part of v2 product
! 185: addq m2b, acc3, acc2 C new acc2 = high part of v2 product + old acc3
! 186: srl r19,NUMB_BITS, r19 C extract nail
! 187: srl m3a,NAIL_BITS, r8 C U0
! 188: stq r28, -8(rp) C store the last updated limb
! 189: addq r8, acc2, acc2 C acc2 += low part of v3 product
! 190: bis r31, m3b, acc3 C new acc3 = high part of v3 product
! 191:
! 192: addq r19, acc0, acc0 C propagate nail
! 193: and acc0,numb_mask, r28 C numb part of acc0
! 194: stq r28, 0(rp) C written without reading the old contents of this limb
! 195: srl acc0,NUMB_BITS, r19 C carry out of acc0
! 196: addq r19, acc1, acc1 C carry into acc1
! 197:
! 198: and acc1,numb_mask, r28 C numb part of acc1
! 199: stq r28, 8(rp) C second limb above the updated range
! 200: srl acc1,NUMB_BITS, r19 C carry out of acc1
! 201: addq r19, acc2, acc2 C carry into acc2
! 202:
! 203: and acc2,numb_mask, r28 C numb part of acc2
! 204: stq r28, 16(rp) C third limb above the updated range
! 205: srl acc2,NUMB_BITS, r19 C carry out of acc2
! 206: addq r19, acc3, r0 C r0 = acc3 + carry, not masked (presumably the return value; confirm against the calling convention)
! 207:
! 208: ldq r12, 32(r30) C restore callee-saved r12
! 209: ldq r13, 40(r30) C restore callee-saved r13
! 210: ldq r14, 48(r30) C restore callee-saved r14
! 211: ldq r15, 56(r30) C restore callee-saved r15
! 212: lda r30, 240(r30) C release the stack frame
! 213: ret r31, (r26), 1 C return through r26 (return address)
! 214: EPILOGUE(mpn_addmul_4)
! 215: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>