Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/add_n.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
! 2: dnl store sum in a third limb vector.
! 3:
! 4: dnl Copyright 2000 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23: include(`../config.m4')
! 24:
! 25: dnl INPUT PARAMETERS
! 26: dnl res_ptr r16
! 27: dnl s1_ptr r17
! 28: dnl s2_ptr r18
! 29: dnl size r19
! 30:
! 31: dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
! 32:
! 33: dnl This code was written in close cooperation with ev6 pipeline expert
! 34: dnl Steve Root. Any errors are tege's fault, though.
! 35:
! 36: dnl work triplet 0-2
! 37: dnl work triplet 3-5
! 38: dnl work triplet 6-8
! 39: dnl work triplet 9-11
! 40: dnl carries 20-23
! 41:
! 42: dnl sustains 8 adds in 17 cycles !
! 43: dnl (from the d_cache)
! 44:
! 45: dnl pair loads and stores where possible
! 46: dnl store pairs oct-aligned where possible
! 47: dnl (didn't need it here)
! 48: dnl stores are delayed every third cycle
! 49: dnl loads and stores are delayed by fills
! 50: dnl U stays still, put code there where possible
! 51: dnl (note alternation of U1 and U0)
! 52: dnl L moves because of loads and stores
! 53: dnl note dampers in L to limit damage
! 54: dnl note, load ahead of time where possible
! 55:
! 56: dnl this odd-looking optimization expects
! 57: dnl that we're having random bits in our data, so
! 58: dnl that a pure zero result is unlikely. so we
! 59: dnl penalize the unlikely case to help the
! 60: dnl common case.
! 61:
! 62: ASM_START()
! 63: PROLOGUE(mpn_add_n)
! 64: lda r30, -240(r30) C reserve a 240-byte frame below r30
! 65: stq r9, 8(r30) C save r9, r10, r11; they are reloaded at $Lret
! 66: stq r10, 16(r30)
! 67: stq r11, 24(r30)
! 68:
! 69: lda r19, -8(r19) C L1 move counter, r19 = size - 8
! 70:
! 71: bis r31, r31, r23 C r23 = 0, no carry into the first limb
! 72: blt r19, $Lsmall C fewer than 8 limbs, use the one-limb loop
! 73: C Feed-in: load limb pairs 0-7, start their adds, store limb 0.
! 74: ldq r0, 0(r17) C L0 get next ones
! 75: ldq r1, 0(r18) C L1
! 76: ldq r3, 8(r17) C L0 get next ones
! 77: ldq r4, 8(r18) C L1
! 78: ldq r6, 16(r17) C L0 get next ones
! 79: ldq r7, 16(r18) C L1
! 80:
! 81: ldq r9, 24(r17) C L0 get next ones
! 82: ldq r10, 24(r18) C L1
! 83:
! 84: addq r0, r1, r2 C U1 add two data
! 85:
! 86: cmpult r2, r1, r20 C U1 did it carry
! 87:
! 88: ldq r0, 32(r17) C L0 get next ones
! 89: ldq r1, 32(r18) C L1
! 90:
! 91: addq r3, r4, r5 C U0 add two data
! 92:
! 93: cmpult r5, r4, r21 C U0 did it carry
! 94: ldq r3, 40(r17) C L0 get next ones
! 95: ldq r4, 40(r18) C L1
! 96:
! 97: addq r6, r7, r8 C U1 add two data
! 98: addq r5, r20, r5 C U0 carry from last
! 99: stq r2, 0(r16) C L1 limb 0 has no carry in, store it as is
! 100:
! 101: cmpult r8, r7, r22 C U1 did it carry
! 102: beq r5, $fix5w C U0 fix exact zero
! 103: $ret5w: ldq r6, 48(r17) C L0 get next ones
! 104: ldq r7, 48(r18) C L1
! 105:
! 106: bis r31, r31, r31 C L damp out
! 107: addq r8, r21, r8 C U1 carry from last
! 108: bis r31, r31, r31 C L moves in L !
! 109: addq r9, r10, r11 C U0 add two data
! 110:
! 111: beq r8, $fix6w C U1 fix exact zero
! 112: $ret6w: cmpult r11, r10, r23 C U0 did it carry
! 113: ldq r9, 56(r17) C L0 get next ones
! 114: ldq r10, 56(r18) C L1
! 115:
! 116: lda r17, 64(r17) C L0 move pointer
! 117: bis r31, r31, r31 C U
! 118: lda r18, 64(r18) C L1 move pointer
! 119:
! 120: lda r19, -8(r19) C L1 move counter
! 121: blt r19, $Lend C fewer than 8 more limbs, skip the main loop
! 122: C Each pass stores 8 result limbs and loads the next 8 limb pairs.
! 123: C Main loop. 8-way unrolled.
! 124: ALIGN(8)
! 125: $Loop:
! 126: addq r0, r1, r2 C U1 add two data
! 127: addq r11, r22, r11 C U0 add in carry
! 128: stq r5, 8(r16) C L0 put an answer
! 129: stq r8, 16(r16) C L1 pair
! 130:
! 131: cmpult r2, r1, r20 C U1 did it carry
! 132: beq r11, $fix7 C U0 fix exact 0
! 133: $ret7: ldq r0, 0(r17) C L0 get next ones
! 134: ldq r1, 0(r18) C L1
! 135:
! 136: bis r31, r31, r31 C L damp out
! 137: addq r2, r23, r2 C U1 carry from last
! 138: bis r31, r31, r31 C L moves in L !
! 139: addq r3, r4, r5 C U0 add two data
! 140:
! 141: beq r2, $fix0 C U1 fix exact zero
! 142: $ret0: cmpult r5, r4, r21 C U0 did it carry
! 143: ldq r3, 8(r17) C L0 get next ones
! 144: ldq r4, 8(r18) C L1
! 145:
! 146: addq r6, r7, r8 C U1 add two data
! 147: addq r5, r20, r5 C U0 carry from last
! 148: stq r11, 24(r16) C L0 store pair
! 149: stq r2, 32(r16) C L1
! 150:
! 151: cmpult r8, r7, r22 C U1 did it carry
! 152: beq r5, $fix1 C U0 fix exact zero
! 153: $ret1: ldq r6, 16(r17) C L0 get next ones
! 154: ldq r7, 16(r18) C L1
! 155:
! 156: lda r16, 64(r16) C L0 move pointer
! 157: addq r8, r21, r8 C U1 carry from last
! 158: lda r19, -8(r19) C L1 move counter
! 159: addq r9, r10, r11 C U0 add two data
! 160:
! 161: beq r8, $fix2 C U1 fix exact zero
! 162: $ret2: cmpult r11, r10, r23 C U0 did it carry
! 163: ldq r9, 24(r17) C L0 get next ones
! 164: ldq r10, 24(r18) C L1
! 165:
! 166: addq r0, r1, r2 C U1 add two data
! 167: addq r11, r22, r11 C U0 add in carry
! 168: stq r5, -24(r16) C L0 put an answer
! 169: stq r8, -16(r16) C L1 pair
! 170:
! 171: cmpult r2, r1, r20 C U1 did it carry
! 172: beq r11, $fix3 C U0 fix exact 0
! 173: $ret3: ldq r0, 32(r17) C L0 get next ones
! 174: ldq r1, 32(r18) C L1
! 175:
! 176: bis r31, r31, r31 C L damp out
! 177: addq r2, r23, r2 C U1 carry from last
! 178: bis r31, r31, r31 C L moves in L !
! 179: addq r3, r4, r5 C U0 add two data
! 180:
! 181: beq r2, $fix4 C U1 fix exact zero
! 182: $ret4: cmpult r5, r4, r21 C U0 did it carry
! 183: ldq r3, 40(r17) C L0 get next ones
! 184: ldq r4, 40(r18) C L1
! 185:
! 186: addq r6, r7, r8 C U1 add two data
! 187: addq r5, r20, r5 C U0 carry from last
! 188: stq r11, -8(r16) C L0 store pair
! 189: stq r2, 0(r16) C L1
! 190:
! 191: cmpult r8, r7, r22 C U1 did it carry
! 192: beq r5, $fix5 C U0 fix exact zero
! 193: $ret5: ldq r6, 48(r17) C L0 get next ones
! 194: ldq r7, 48(r18) C L1
! 195:
! 196: bis r31, r31, r31 C L damp out
! 197: addq r8, r21, r8 C U1 carry from last
! 198: bis r31, r31, r31 C L moves in L !
! 199: addq r9, r10, r11 C U0 add two data
! 200:
! 201: beq r8, $fix6 C U1 fix exact zero
! 202: $ret6: cmpult r11, r10, r23 C U0 did it carry
! 203: ldq r9, 56(r17) C L0 get next ones
! 204: ldq r10, 56(r18) C L1
! 205:
! 206: lda r17, 64(r17) C L0 move pointer
! 207: bis r31, r31, r31 C U
! 208: lda r18, 64(r18) C L1 move pointer
! 209: bge r19, $Loop C U1 loop control
! 210: C ==== main loop end
! 211: C Wind down: finish the limbs already loaded, with no further loads.
! 212: $Lend:
! 213: addq r0, r1, r2 C U1 add two data
! 214: addq r11, r22, r11 C U0 add in carry
! 215: stq r5, 8(r16) C L0 put an answer
! 216: stq r8, 16(r16) C L1 pair
! 217:
! 218: cmpult r2, r1, r20 C U1 did it carry
! 219: beq r11, $fix7c C U0 fix exact 0
! 220: $ret7c:
! 221: addq r2, r23, r2 C U1 carry from last
! 222: addq r3, r4, r5 C U0 add two data
! 223:
! 224: beq r2, $fix0c C U1 fix exact zero
! 225: $ret0c: cmpult r5, r4, r21 C U0 did it carry
! 226:
! 227: addq r6, r7, r8 C U1 add two data
! 228: addq r5, r20, r5 C U0 carry from last
! 229: stq r11, 24(r16) C L0 store pair
! 230: stq r2, 32(r16) C L1
! 231:
! 232: cmpult r8, r7, r22 C U1 did it carry
! 233: beq r5, $fix1c C U0 fix exact zero
! 234: $ret1c:
! 235: lda r16, 64(r16) C L0 move pointer
! 236: addq r8, r21, r8 C U1 carry from last
! 237: addq r9, r10, r11 C U0 add two data
! 238:
! 239: beq r8, $fix2c C U1 fix exact zero
! 240: $ret2c: cmpult r11, r10, r23 C U0 did it carry
! 241:
! 242: addq r11, r22, r11 C U0 add in carry
! 243: stq r5, -24(r16) C L0 put an answer
! 244: stq r8, -16(r16) C L1 pair
! 245:
! 246: beq r11, $fix3c C U0 fix exact 0
! 247: $ret3c:
! 248: stq r11, -8(r16) C L0 store pair
! 249: C Fall through: r19 is in -8..-1 here, so 0-7 limbs remain.
! 250:
! 251: $Lsmall:
! 252: lda r19, 8(r19) C undo the earlier -8, r19 = limbs still to do
! 253: beq r19, $Lret C nothing left, return the carry held in r23
! 254:
! 255: ldq r0, 0(r17) C first limb of s1
! 256: ldq r1, 0(r18) C first limb of s2
! 257: lda r19, -1(r19) C the last limb is handled at $Lend0
! 258: beq r19, $Lend0 C exactly one limb left, skip the loop
! 259:
! 260: ALIGN(8)
! 261: $Loop0: addq r0, r1, r2 C main add
! 262: ldq r0, 8(r17) C load next limb of s1 ahead
! 263: cmpult r2, r1, r8 C compute cy from last add
! 264: ldq r1, 8(r18) C load next limb of s2 ahead
! 265: addq r2, r23, r20 C carry add
! 266: lda r17, 8(r17) C advance s1_ptr
! 267: lda r18, 8(r18) C advance s2_ptr
! 268: stq r20, 0(r16) C store result limb
! 269: cmpult r20, r2, r23 C compute cy from last add
! 270: lda r19, -1(r19) C decr loop cnt
! 271: bis r8, r23, r23 C combine cy from the two adds
! 272: lda r16, 8(r16) C advance res_ptr
! 273: bne r19, $Loop0 C repeat until only the last limb remains
! 274: $Lend0: addq r0, r1, r2 C main add
! 275: addq r2, r23, r20 C carry add
! 276: cmpult r2, r1, r8 C compute cy from last add
! 277: cmpult r20, r2, r23 C compute cy from last add
! 278: stq r20, 0(r16) C store the final result limb
! 279: bis r8, r23, r23 C combine cy from the two adds
! 280:
! 281: $Lret:
! 282: lda r0, 0(r23) C copy carry into return register
! 283:
! 284: ldq r9, 8(r30) C reload r9, r10, r11 saved at entry
! 285: ldq r10, 16(r30)
! 286: ldq r11, 24(r30)
! 287: lda r30, 240(r30) C release the 240-byte frame
! 288: ret r31,(r26),1 C return through r26
! 289: C Out-of-line fixups, reached when a sum plus its carry-in is exactly zero.
! 290: C Each ORs the carry-in into that limb's own carry-out, then resumes.
! 291: $fix5w: bis r21, r20, r21 C bring forward carry
! 292: br r31, $ret5w
! 293: $fix6w: bis r22, r21, r22 C bring forward carry
! 294: br r31, $ret6w
! 295: $fix0: bis r20, r23, r20 C bring forward carry
! 296: br r31, $ret0
! 297: $fix1: bis r21, r20, r21 C bring forward carry
! 298: br r31, $ret1
! 299: $fix2: bis r22, r21, r22 C bring forward carry
! 300: br r31, $ret2
! 301: $fix3: bis r23, r22, r23 C bring forward carry
! 302: br r31, $ret3
! 303: $fix4: bis r20, r23, r20 C bring forward carry
! 304: br r31, $ret4
! 305: $fix5: bis r20, r21, r21 C bring forward carry
! 306: br r31, $ret5
! 307: $fix6: bis r22, r21, r22 C bring forward carry
! 308: br r31, $ret6
! 309: $fix7: bis r23, r22, r23 C bring forward carry
! 310: br r31, $ret7
! 311: $fix0c: bis r20, r23, r20 C bring forward carry
! 312: br r31, $ret0c
! 313: $fix1c: bis r21, r20, r21 C bring forward carry
! 314: br r31, $ret1c
! 315: $fix2c: bis r22, r21, r22 C bring forward carry
! 316: br r31, $ret2c
! 317: $fix3c: bis r23, r22, r23 C bring forward carry
! 318: br r31, $ret3c
! 319: $fix7c: bis r23, r22, r23 C bring forward carry
! 320: br r31, $ret7c
! 321:
! 322: EPILOGUE(mpn_add_n)
! 323: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>