Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/submul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
! 2: dnl the result from a second limb vector.
! 3:
! 4: dnl Copyright 2000 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23: include(`../config.m4')
! 24:
! 25: dnl INPUT PARAMETERS
! 26: dnl res_ptr r16
! 27: dnl s1_ptr r17
! 28: dnl size r18
! 29: dnl s2_limb r19
! 30:
! 31: dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
! 32: dnl exactly 3.5 cycles/limb on EV6...
! 33:
! 34: dnl This code was written in close cooperation with ev6 pipeline expert
! 35: dnl Steve Root. Any errors are tege's fault, though.
! 36: dnl
! 37: dnl Register usages for unrolled loop:
! 38: dnl 0-3 mul's
! 39: dnl 4-7 acc's
! 40: dnl 8-15 mul results
! 41: dnl 20,21 carry's
! 42: dnl 22,23 save for stores
! 43:
! 44: dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
! 45:
! 46: dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
! 47: dnl them, so that further disturbance to the schedule is damped.
! 48:
! 49: dnl We couldn't pair the loads, because the entangled schedule of the
! 50: dnl carry's has to happen on one side {0} of the machine. Note, the total
! 51: dnl use of U0, and the total use of L0 (after attending to the stores).
! 52: dnl which is part of the reason why....
! 53:
! 54: dnl This is a great schedule for the d_cache, a poor schedule for the
! 55: dnl b_cache. The lockup on U0 means that any stall can't be recovered
! 56: dnl from. Consider a ldq in L1. say that load gets stalled because it
! 57: dnl collides with a fill from the b_Cache. On the next cycle, this load
! 58: dnl gets priority. It first looks at L0, and goes there. The instruction
! 59: dnl we intended for L0 gets to look at L1, which is NOT where we want
! 60: dnl it. It either stalls 1, because it can't go in L0, or goes there, and
! 61: dnl causes a further instruction to stall.
! 62:
! 63: dnl So for b_cache, we're likely going to want to put one or more cycles
! 64: dnl back into the code! And, of course, put in prefetches. For the
! 65: dnl accumulator, lds, intent to modify. For the multiplier, you might
! 66: dnl want ldq, evict next, if you're not wanting to use it again soon. Use
! 67: dnl 256 ahead of present pointer value. At a place where we have an mt
! 68: dnl followed by a bookkeeping, put the bookkeeping in upper, and the
! 69: dnl prefetch into lower.
! 70:
! 71: dnl Note, the usage of physical registers per cycle is smoothed off, as
! 72: dnl much as possible.
! 73:
! 74: dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
! 75: dnl like not to have a ldq or stq to precede a conditional branch in a
! 76: dnl quadpack. The conditional branch moves the retire pointer one cycle
! 77: dnl later.
! 78:
! 79: dnl Optimization notes:
! 80: dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
! 81: dnl Reserved regs: r29 r30 r31
! 82: dnl Free caller-saves regs in unrolled code: r24 r25 (r28 is in use for stores)
! 83: dnl We should swap some of the callee-saves regs for some of the free
! 84: dnl caller-saves regs, saving some overhead cycles.
! 85: dnl Most importantly, we should write fast code for the 0-7 case.
! 86: dnl The code we use there is for the 21164, and runs at 7 cycles/limb
! 87: dnl on the 21264. Should not be hard, if we write specialized code for
! 88: dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
! 89: dnl need a jump table indexed by the low 3 bits of the count argument.
! 90:
! 91:
! 92: ASM_START()
! 93: PROLOGUE(mpn_submul_1)
! 94: cmpult r18, 8, r1 C r1 = (size < 8); such sizes use the simple loop below
! 95: beq r1, $Large C size >= 8: use the unrolled code at $Large
! 96:
! 97: ldq r2, 0(r17) C r2 = s1_limb
! 98: addq r17, 8, r17 C s1_ptr++
! 99: subq r18, 1, r18 C size--
! 100: mulq r2, r19, r3 C r3 = prod_low
! 101: ldq r5, 0(r16) C r5 = *res_ptr
! 102: umulh r2, r19, r0 C r0 = prod_high
! 103: beq r18, $Lend0b C jump if size was == 1
! 104: ldq r2, 0(r17) C r2 = s1_limb
! 105: addq r17, 8, r17 C s1_ptr++
! 106: subq r18, 1, r18 C size--
! 107: subq r5, r3, r3 C r3 = *res_ptr - prod_low
! 108: cmpult r5, r3, r4 C r4 = borrow from the subtract
! 109: stq r3, 0(r16) C *res_ptr = r3
! 110: addq r16, 8, r16 C res_ptr++
! 111: beq r18, $Lend0a C jump if size was == 2
! 112:
! 113: ALIGN(8)
! 114: $Loop0: mulq r2, r19, r3 C r3 = prod_low
! 115: ldq r5, 0(r16) C r5 = *res_ptr
! 116: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
! 117: subq r18, 1, r18 C size--
! 118: umulh r2, r19, r4 C r4 = cy_limb
! 119: ldq r2, 0(r17) C r2 = s1_limb
! 120: addq r17, 8, r17 C s1_ptr++
! 121: addq r3, r0, r3 C r3 = cy_limb + prod_low
! 122: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
! 123: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
! 124: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 125: stq r3, 0(r16) C *res_ptr = r3
! 126: addq r16, 8, r16 C res_ptr++
! 127: addq r5, r0, r0 C combine carries
! 128: bne r18, $Loop0 C loop while size != 0
! 129: $Lend0a:
! 130: mulq r2, r19, r3 C r3 = prod_low
! 131: ldq r5, 0(r16) C r5 = *res_ptr
! 132: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
! 133: umulh r2, r19, r4 C r4 = cy_limb
! 134: addq r3, r0, r3 C r3 = cy_limb + prod_low
! 135: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
! 136: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
! 137: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 138: stq r3, 0(r16) C *res_ptr = r3
! 139: addq r5, r0, r0 C combine carries
! 140: addq r4, r0, r0 C cy_limb = prod_high + cy
! 141: ret r31, (r26), 1 C return with cy_limb in r0
! 142: $Lend0b:
! 143: subq r5, r3, r3 C r3 = *res_ptr - prod_low
! 144: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 145: stq r3, 0(r16) C *res_ptr = r3
! 146: addq r0, r5, r0 C cy_limb = prod_high + borrow
! 147: ret r31, (r26), 1 C return with cy_limb in r0
! 148:
! 149: $Large: C size >= 8: save registers, do size mod 8 limbs, then the unrolled code
! 150: lda $30, -240($30) C allocate a 240-byte stack frame
! 151: stq $9, 8($30) C save callee-saves r9
! 152: stq $10, 16($30) C save callee-saves r10
! 153: stq $11, 24($30) C save callee-saves r11
! 154: stq $12, 32($30) C save callee-saves r12
! 155: stq $13, 40($30) C save callee-saves r13
! 156: stq $14, 48($30) C save callee-saves r14
! 157: stq $15, 56($30) C save callee-saves r15
! 158:
! 159: and r18, 7, r20 C count for the first loop, 0-7
! 160: srl r18, 3, r18 C count for unrolled loop
! 161: bis r31, r31, r0 C r0 = 0: no carry-in if the first loop is skipped
! 162: beq r20, $Lunroll C size is a multiple of 8: skip the first loop
! 163: ldq r2, 0(r17) C r2 = s1_limb
! 164: addq r17, 8, r17 C s1_ptr++
! 165: subq r20, 1, r20 C size--
! 166: mulq r2, r19, r3 C r3 = prod_low
! 167: ldq r5, 0(r16) C r5 = *res_ptr
! 168: umulh r2, r19, r0 C r0 = prod_high
! 169: beq r20, $Lend1b C jump if size was == 1
! 170: ldq r2, 0(r17) C r2 = s1_limb
! 171: addq r17, 8, r17 C s1_ptr++
! 172: subq r20, 1, r20 C size--
! 173: subq r5, r3, r3 C r3 = *res_ptr - prod_low
! 174: cmpult r5, r3, r4 C r4 = borrow from the subtract
! 175: stq r3, 0(r16) C *res_ptr = r3
! 176: addq r16, 8, r16 C res_ptr++
! 177: beq r20, $Lend1a C jump if size was == 2
! 178:
! 179: ALIGN(8)
! 180: $Loop1: mulq r2, r19, r3 C r3 = prod_low
! 181: ldq r5, 0(r16) C r5 = *res_ptr
! 182: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
! 183: subq r20, 1, r20 C size--
! 184: umulh r2, r19, r4 C r4 = cy_limb
! 185: ldq r2, 0(r17) C r2 = s1_limb
! 186: addq r17, 8, r17 C s1_ptr++
! 187: addq r3, r0, r3 C r3 = cy_limb + prod_low
! 188: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
! 189: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
! 190: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 191: stq r3, 0(r16) C *res_ptr = r3
! 192: addq r16, 8, r16 C res_ptr++
! 193: addq r5, r0, r0 C combine carries
! 194: bne r20, $Loop1 C loop while the first-loop count != 0
! 195:
! 196: $Lend1a:
! 197: mulq r2, r19, r3 C r3 = prod_low
! 198: ldq r5, 0(r16) C r5 = *res_ptr
! 199: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
! 200: umulh r2, r19, r4 C r4 = cy_limb
! 201: addq r3, r0, r3 C r3 = cy_limb + prod_low
! 202: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
! 203: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
! 204: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 205: stq r3, 0(r16) C *res_ptr = r3
! 206: addq r16, 8, r16 C res_ptr++
! 207: addq r5, r0, r0 C combine carries
! 208: addq r4, r0, r0 C cy_limb = prod_high + cy
! 209: br r31, $Lunroll C skip over $Lend1b
! 210: $Lend1b:
! 211: subq r5, r3, r3 C r3 = *res_ptr - prod_low
! 212: cmpult r5, r3, r5 C r5 = borrow from the subtract
! 213: stq r3, 0(r16) C *res_ptr = r3
! 214: addq r16, 8, r16 C res_ptr++
! 215: addq r0, r5, r0 C cy_limb = prod_high + borrow
! 216:
! 217: $Lunroll: C unrolled code: r18 = count of 8-limb groups, r0 = carry-in limb
! 218: lda r17, -16(r17) C L1 bookkeeping
! 219: lda r16, -16(r16) C L1 bookkeeping
! 220: bis r0, r31, r12 C r12 = carry-in limb from the code above
! 221:
! 222: C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
! 223:
! 224: ldq r2, 16(r17) C L1
! 225: ldq r3, 24(r17) C L1
! 226: lda r18, -1(r18) C L1 bookkeeping
! 227: ldq r6, 16(r16) C L1
! 228: ldq r7, 24(r16) C L1
! 229: ldq r0, 32(r17) C L1
! 230: mulq r19, r2, r13 C U1
! 231: ldq r1, 40(r17) C L1
! 232: umulh r19, r2, r14 C U1
! 233: mulq r19, r3, r15 C U1
! 234: lda r17, 64(r17) C L1 bookkeeping
! 235: ldq r4, 32(r16) C L1
! 236: ldq r5, 40(r16) C L1
! 237: umulh r19, r3, r8 C U1
! 238: ldq r2, -16(r17) C L1
! 239: mulq r19, r0, r9 C U1
! 240: ldq r3, -8(r17) C L1
! 241: umulh r19, r0, r10 C U1
! 242: subq r6, r13, r13 C L0 acc - lo
! 243: mulq r19, r1, r11 C U1
! 244: cmpult r6, r13, r20 C L0 lo sub => borrow
! 245: lda r16, 64(r16) C L1 bookkeeping
! 246: subq r13, r12, r22 C U0 hi sub => answer
! 247: cmpult r13, r12, r21 C L0 hi sub => borrow
! 248: addq r14, r20, r14 C U0 hi mul + borrow
! 249: ldq r6, -16(r16) C L1
! 250: subq r7, r15, r28 C L0 acc - lo
! 251: addq r14, r21, r14 C U0 hi mul + borrow
! 252: cmpult r7, r15, r20 C L0 lo sub => borrow
! 253: ldq r7, -8(r16) C L1
! 254: umulh r19, r1, r12 C U1
! 255: subq r28, r14, r23 C U0 hi sub => answer
! 256: ldq r0, 0(r17) C L1
! 257: mulq r19, r2, r13 C U1
! 258: cmpult r28, r14, r21 C L0 hi sub => borrow
! 259: addq r8, r20, r8 C U0 hi mul + borrow
! 260: ldq r1, 8(r17) C L1
! 261: umulh r19, r2, r14 C U1
! 262: subq r4, r9, r9 C L0 acc - lo
! 263: stq r22, -48(r16) C L0
! 264: stq r23, -40(r16) C L1
! 265: mulq r19, r3, r15 C U1
! 266: addq r8, r21, r8 C U0 hi mul + borrow
! 267: cmpult r4, r9, r20 C L0 lo sub => borrow
! 268: subq r9, r8, r22 C U0 hi sub => answer
! 269: ble r18, $Lend C U1 bookkeeping
! 270:
! 271: C ____ MAIN UNROLLED LOOP ____
! 272: ALIGN(16)
! 273: $Loop: C each pass stores 8 result limbs; r18 counts the remaining passes
! 274: bis r31, r31, r31 C U1 mt
! 275: cmpult r9, r8, r21 C L0 hi sub => borrow
! 276: addq r10, r20, r10 C U0 hi mul + borrow
! 277: ldq r4, 0(r16) C L1
! 278:
! 279: bis r31, r31, r31 C U1 mt
! 280: subq r5, r11, r23 C L0 acc - lo
! 281: addq r10, r21, r10 C L0 hi mul + borrow
! 282: ldq r2, 16(r17) C L1
! 283:
! 284: umulh r19, r3, r8 C U1
! 285: cmpult r5, r11, r20 C L0 lo sub => borrow
! 286: subq r23, r10, r28 C U0 hi sub => answer
! 287: ldq r5, 8(r16) C L1
! 288:
! 289: mulq r19, r0, r9 C U1
! 290: cmpult r23, r10, r21 C L0 hi sub => borrow
! 291: addq r12, r20, r12 C U0 hi mul + borrow
! 292: ldq r3, 24(r17) C L1
! 293:
! 294: umulh r19, r0, r10 C U1
! 295: subq r6, r13, r13 C L0 acc - lo
! 296: stq r22, -32(r16) C L0
! 297: stq r28, -24(r16) C L1
! 298:
! 299: bis r31, r31, r31 C L0 st slosh
! 300: mulq r19, r1, r11 C U1
! 301: bis r31, r31, r31 C L1 st slosh
! 302: addq r12, r21, r12 C U0 hi mul + borrow
! 303:
! 304: cmpult r6, r13, r20 C L0 lo sub => borrow
! 305: bis r31, r31, r31 C U1 mt
! 306: lda r18, -1(r18) C L1 bookkeeping
! 307: subq r13, r12, r22 C U0 hi sub => answer
! 308:
! 309: bis r31, r31, r31 C U1 mt
! 310: cmpult r13, r12, r21 C L0 hi sub => borrow
! 311: addq r14, r20, r14 C U0 hi mul + borrow
! 312: ldq r6, 16(r16) C L1
! 313:
! 314: bis r31, r31, r31 C U1 mt
! 315: subq r7, r15, r23 C L0 acc - lo
! 316: addq r14, r21, r14 C U0 hi mul + borrow
! 317: ldq r0, 32(r17) C L1
! 318:
! 319: umulh r19, r1, r12 C U1
! 320: cmpult r7, r15, r20 C L0 lo sub => borrow
! 321: subq r23, r14, r28 C U0 hi sub => answer
! 322: ldq r7, 24(r16) C L1
! 323:
! 324: mulq r19, r2, r13 C U1
! 325: cmpult r23, r14, r21 C L0 hi sub => borrow
! 326: addq r8, r20, r8 C U0 hi mul + borrow
! 327: ldq r1, 40(r17) C L1
! 328:
! 329: umulh r19, r2, r14 C U1
! 330: subq r4, r9, r9 C U0 acc - lo
! 331: stq r22, -16(r16) C L0
! 332: stq r28, -8(r16) C L1
! 333:
! 334: bis r31, r31, r31 C L0 st slosh
! 335: mulq r19, r3, r15 C U1
! 336: bis r31, r31, r31 C L1 st slosh
! 337: addq r8, r21, r8 C L0 hi mul + borrow
! 338:
! 339: cmpult r4, r9, r20 C L0 lo sub => borrow
! 340: bis r31, r31, r31 C U1 mt
! 341: lda r17, 64(r17) C L1 bookkeeping
! 342: subq r9, r8, r22 C U0 hi sub => answer
! 343:
! 344: bis r31, r31, r31 C U1 mt
! 345: cmpult r9, r8, r21 C L0 hi sub => borrow
! 346: addq r10, r20, r10 C U0 hi mul + borrow
! 347: ldq r4, 32(r16) C L1
! 348:
! 349: bis r31, r31, r31 C U1 mt
! 350: subq r5, r11, r23 C L0 acc - lo
! 351: addq r10, r21, r10 C L0 hi mul + borrow
! 352: ldq r2, -16(r17) C L1
! 353:
! 354: umulh r19, r3, r8 C U1
! 355: cmpult r5, r11, r20 C L0 lo sub => borrow
! 356: subq r23, r10, r28 C U0 hi sub => answer
! 357: ldq r5, 40(r16) C L1
! 358:
! 359: mulq r19, r0, r9 C U1
! 360: cmpult r23, r10, r21 C L0 hi sub => borrow
! 361: addq r12, r20, r12 C U0 hi mul + borrow
! 362: ldq r3, -8(r17) C L1
! 363:
! 364: umulh r19, r0, r10 C U1
! 365: subq r6, r13, r13 C L0 acc - lo
! 366: stq r22, 0(r16) C L0
! 367: stq r28, 8(r16) C L1
! 368:
! 369: bis r31, r31, r31 C L0 st slosh
! 370: mulq r19, r1, r11 C U1
! 371: bis r31, r31, r31 C L1 st slosh
! 372: addq r12, r21, r12 C U0 hi mul + borrow
! 373:
! 374: cmpult r6, r13, r20 C L0 lo sub => borrow
! 375: bis r31, r31, r31 C U1 mt
! 376: lda r16, 64(r16) C L1 bookkeeping
! 377: subq r13, r12, r22 C U0 hi sub => answer
! 378:
! 379: bis r31, r31, r31 C U1 mt
! 380: cmpult r13, r12, r21 C L0 hi sub => borrow
! 381: addq r14, r20, r14 C U0 hi mul + borrow
! 382: ldq r6, -16(r16) C L1
! 383:
! 384: bis r31, r31, r31 C U1 mt
! 385: subq r7, r15, r23 C L0 acc - lo
! 386: addq r14, r21, r14 C U0 hi mul + borrow
! 387: ldq r0, 0(r17) C L1
! 388:
! 389: umulh r19, r1, r12 C U1
! 390: cmpult r7, r15, r20 C L0 lo sub => borrow
! 391: subq r23, r14, r28 C U0 hi sub => answer
! 392: ldq r7, -8(r16) C L1
! 393:
! 394: mulq r19, r2, r13 C U1
! 395: cmpult r23, r14, r21 C L0 hi sub => borrow
! 396: addq r8, r20, r8 C U0 hi mul + borrow
! 397: ldq r1, 8(r17) C L1
! 398:
! 399: umulh r19, r2, r14 C U1
! 400: subq r4, r9, r9 C L0 acc - lo
! 401: stq r22, -48(r16) C L0
! 402: stq r28, -40(r16) C L1
! 403:
! 404: bis r31, r31, r31 C L0 st slosh
! 405: mulq r19, r3, r15 C U1
! 406: bis r31, r31, r31 C L1 st slosh
! 407: addq r8, r21, r8 C U0 hi mul + borrow
! 408:
! 409: cmpult r4, r9, r20 C L0 lo sub => borrow
! 410: subq r9, r8, r22 C U0 hi sub => answer
! 411: bis r31, r31, r31 C L1 mt
! 412: bgt r18, $Loop C U1 bookkeeping
! 413:
! 414: C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
! 415: $Lend: C drain the software pipeline; no further s1 limbs are loaded
! 416: cmpult r9, r8, r21 C L0 hi sub => borrow
! 417: addq r10, r20, r10 C U0 hi mul + borrow
! 418: ldq r4, 0(r16) C L1
! 419: subq r5, r11, r23 C L0 acc - lo
! 420: addq r10, r21, r10 C L0 hi mul + borrow
! 421: umulh r19, r3, r8 C U1
! 422: cmpult r5, r11, r20 C L0 lo sub => borrow
! 423: subq r23, r10, r28 C U0 hi sub => answer
! 424: ldq r5, 8(r16) C L1
! 425: mulq r19, r0, r9 C U1
! 426: cmpult r23, r10, r21 C L0 hi sub => borrow
! 427: addq r12, r20, r12 C U0 hi mul + borrow
! 428: umulh r19, r0, r10 C U1
! 429: subq r6, r13, r13 C L0 acc - lo
! 430: stq r22, -32(r16) C L0
! 431: stq r28, -24(r16) C L1
! 432: mulq r19, r1, r11 C U1
! 433: addq r12, r21, r12 C U0 hi mul + borrow
! 434: cmpult r6, r13, r20 C L0 lo sub => borrow
! 435: subq r13, r12, r22 C U0 hi sub => answer
! 436: cmpult r13, r12, r21 C L0 hi sub => borrow
! 437: addq r14, r20, r14 C U0 hi mul + borrow
! 438: subq r7, r15, r23 C L0 acc - lo
! 439: addq r14, r21, r14 C U0 hi mul + borrow
! 440: umulh r19, r1, r12 C U1
! 441: cmpult r7, r15, r20 C L0 lo sub => borrow
! 442: subq r23, r14, r28 C U0 hi sub => answer
! 443: cmpult r23, r14, r21 C L0 hi sub => borrow
! 444: addq r8, r20, r8 C U0 hi mul + borrow
! 445: subq r4, r9, r9 C U0 acc - lo
! 446: stq r22, -16(r16) C L0
! 447: stq r28, -8(r16) C L1
! 448: addq r8, r21, r8 C L0 hi mul + borrow
! 449: cmpult r4, r9, r20 C L0 lo sub => borrow
! 450: subq r9, r8, r22 C U0 hi sub => answer
! 451: cmpult r9, r8, r21 C L0 hi sub => borrow
! 452: addq r10, r20, r10 C U0 hi mul + borrow
! 453: subq r5, r11, r23 C L0 acc - lo
! 454: addq r10, r21, r10 C L0 hi mul + borrow
! 455: cmpult r5, r11, r20 C L0 lo sub => borrow
! 456: subq r23, r10, r28 C U0 hi sub => answer
! 457: cmpult r23, r10, r21 C L0 hi sub => borrow
! 458: addq r12, r20, r12 C U0 hi mul + borrow
! 459: stq r22, 0(r16) C L0
! 460: stq r28, 8(r16) C L1
! 461: addq r12, r21, r0 C U0 hi mul + borrow => final cy_limb in r0
! 462:
! 463: ldq $9, 8($30) C restore r9
! 464: ldq $10, 16($30) C restore r10
! 465: ldq $11, 24($30) C restore r11
! 466: ldq $12, 32($30) C restore r12
! 467: ldq $13, 40($30) C restore r13
! 468: ldq $14, 48($30) C restore r14
! 469: ldq $15, 56($30) C restore r15
! 470: lda $30, 240($30) C release the 240-byte stack frame
! 471: ret r31, (r26), 1 C return with cy_limb in r0
! 472: EPILOGUE(mpn_submul_1)
! 473: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>