Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/mul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
! 2: dnl result in a second limb vector.
! 3:
! 4: dnl Copyright 2000, 2001 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23: include(`../config.m4')
! 24:
! 25: C INPUT PARAMETERS
! 26: C res_ptr r16
! 27: C s1_ptr r17
! 28: C size r18
! 29: C s2_limb r19
! 30:
! 31: C This code runs at 2.25 cycles/limb on EV6.
! 32:
! 33: C This code was written in close cooperation with ev6 pipeline expert
! 34: C Steve Root. Any errors are tege's fault, though.
! 35:
! 36: C Code structure:
! 37:
! 38: C   code for n < 8
! 39: C   code for n >= 8:   code for (n mod 8)
! 40: C                      code for (n div 8):   feed-in code
! 41: C                                            8-way unrolled loop
! 42: C                                            wind-down code
! 43:
! 44: C Some notes about unrolled loop:
! 45: C
! 46: C r1-r8 multiplies and workup
! 47: C r21-r28 multiplies and workup
! 48: C r9-r12 loads
! 49: C r0 -1
! 50: C r20,r29,r13-r15 scramble
! 51: C
! 52: C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
! 53: C put-the-carry-into-hi. The idea is that these branches are very rarely
! 54: C taken, and since a non-taken branch consumes no resources, that is better
! 55: C than an addq.
! 56: C
! 57: C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
! 58: C add NEXT cycle #09 which feeds a store in NEXT cycle #02
! 59:
! 60: C The code could use some further work:
! 61: C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
! 62: C faster than this for size < 3.
! 63: C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
! 64: C that is too costly.
! 65: C 3. Consider using 4-way unrolling, even if that runs slower.
! 66: C 4. Reduce register usage. In particular, try to avoid using r29.
! 67:
! 68:
! 69: ASM_START()
! 70: PROLOGUE(mpn_mul_1)
! 71: cmpult r18, 8, r1
! 72: beq r1, $Large
! 73: $Lsmall:
! 74: ldq r2,0(r17) C r2 = s1_limb
! 75: lda r18,-1(r18) C size--
! 76: mulq r2,r19,r3 C r3 = prod_low
! 77: bic r31,r31,r4 C clear cy_limb
! 78: umulh r2,r19,r0 C r0 = prod_high
! 79: beq r18,$Le1a C jump if size was == 1
! 80: ldq r2,8(r17) C r2 = s1_limb
! 81: lda r18,-1(r18) C size--
! 82: stq r3,0(r16)
! 83: beq r18,$Le2a C jump if size was == 2
! 84: ALIGN(8)
! 85: $Lopa: mulq r2,r19,r3 C r3 = prod_low
! 86: addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
! 87: lda r18,-1(r18) C size--
! 88: umulh r2,r19,r4 C r4 = cy_limb
! 89: ldq r2,16(r17) C r2 = s1_limb
! 90: lda r17,8(r17) C s1_ptr++
! 91: addq r3,r0,r3 C r3 = cy_limb + prod_low
! 92: stq r3,8(r16)
! 93: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
! 94: lda r16,8(r16) C res_ptr++
! 95: bne r18,$Lopa
! 96:
! 97: $Le2a: mulq r2,r19,r3 C r3 = prod_low
! 98: addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
! 99: umulh r2,r19,r4 C r4 = cy_limb
! 100: addq r3,r0,r3 C r3 = cy_limb + prod_low
! 101: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
! 102: stq r3,8(r16)
! 103: addq r4,r0,r0 C cy_limb = prod_high + cy
! 104: ret r31,(r26),1
! 105: $Le1a: stq r3,0(r16)
! 106: ret r31,(r26),1
! 107: C size >= 8: save registers, then do the low (size mod 8) limbs one at a time.
! 108: $Large:
! 109: lda r30, -224(r30) C r30 -= 224: make room for the register save area
! 110: stq r26, 0(r30) C save r26; it is reused as a product register below
! 111: stq r9, 8(r30) C save r9-r15 and r29, all used as temporaries below
! 112: stq r10, 16(r30)
! 113: stq r11, 24(r30)
! 114: stq r12, 32(r30)
! 115: stq r13, 40(r30)
! 116: stq r14, 48(r30)
! 117: stq r15, 56(r30)
! 118: stq r29, 64(r30) C NOTE(review): only offsets 0-64 of the 224 bytes are written
! 119:
! 120: and r18, 7, r20 C count for the first loop, 0-7
! 121: srl r18, 3, r18 C count for unrolled loop
! 122: bis r31, r31, r21 C r21 = 0: carry-in limb when the first loop is skipped
! 123: beq r20, $L_8_or_more C skip first loop
! 124: C Reached when (size mod 8) != 0; r20 counts the limbs handled here.
! 125: $L_9_or_more:
! 126: ldq r2,0(r17) C r2 = s1_limb
! 127: lda r17,8(r17) C s1_ptr++
! 128: lda r20,-1(r20) C r20-- (first-loop count)
! 129: mulq r2,r19,r3 C r3 = prod_low
! 130: umulh r2,r19,r21 C r21 = prod_high
! 131: beq r20,$Le1b C jump if count was == 1
! 132: bis r31, r31, r0 C r0 = 0 ('cy'); FIXME: shouldn't need this
! 133: ldq r2,0(r17) C r2 = s1_limb
! 134: lda r17,8(r17) C s1_ptr++
! 135: lda r20,-1(r20) C r20--
! 136: stq r3,0(r16) C store first result limb
! 137: lda r16,8(r16) C res_ptr++
! 138: beq r20,$Le2b C jump if count was == 2
! 139: ALIGN(8)
! 140: $Lopb: mulq r2,r19,r3 C r3 = prod_low
! 141: addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
! 142: lda r20,-1(r20) C r20--
! 143: umulh r2,r19,r21 C r21 = prod_high
! 144: ldq r2,0(r17) C r2 = s1_limb
! 145: lda r17,8(r17) C s1_ptr++
! 146: addq r3,r0,r3 C r3 = cy_limb + prod_low
! 147: stq r3,0(r16) C store result limb
! 148: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
! 149: lda r16,8(r16) C res_ptr++
! 150: bne r20,$Lopb C loop until r20 reaches 0
! 151: C Last limb of the first loop; r2 already holds it.
! 152: $Le2b: mulq r2,r19,r3 C r3 = prod_low
! 153: addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
! 154: umulh r2,r19,r21 C r21 = prod_high
! 155: addq r3,r0,r3 C r3 = cy_limb + prod_low
! 156: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
! 157: stq r3,0(r16) C store result limb
! 158: lda r16,8(r16) C res_ptr++
! 159: addq r21,r0,r21 C cy_limb = prod_high + cy
! 160: br r31, $L_8_or_more C go on to the unrolled code, carry-in in r21
! 161: $Le1b: stq r3,0(r16) C count was 1: store the limb; r21 = prod_high is the carry-in
! 162: lda r16,8(r16) C res_ptr++
! 163: C Feed-in for the first group of 8 limbs; r21 holds the carry-in limb.
! 164: $L_8_or_more:
! 165: lda r0, -1(r31) C put -1 in r0, for tricky loop control
! 166: lda r17, -32(r17) C L1 bookkeeping
! 167: lda r18, -1(r18) C decrement count
! 168: C NOTE(review): L0/L1/U0/U1 and #nn below presumably name ev6 pipes and multiplier slots - confirm.
! 169: ldq r9, 32(r17) C L1 limb 0 of this group
! 170: ldq r10, 40(r17) C L1 limb 1
! 171: mulq r9, r19, r22 C U1 #07
! 172: ldq r11, 48(r17) C L1 limb 2
! 173: umulh r9, r19, r23 C U1 #08
! 174: ldq r12, 56(r17) C L1 limb 3
! 175: mulq r10, r19, r24 C U1 #09
! 176: ldq r9, 64(r17) C L1 limb 4
! 177:
! 178: lda r17, 64(r17) C L1 bookkeeping
! 179:
! 180: umulh r10, r19, r25 C U1 #11
! 181: mulq r11, r19, r26 C U1 #12
! 182: umulh r11, r19, r27 C U1 #13
! 183: mulq r12, r19, r28 C U1 #14
! 184: ldq r10, 8(r17) C L1 limb 5
! 185: umulh r12, r19, r1 C U1 #15
! 186: ldq r11, 16(r17) C L1 limb 6
! 187: mulq r9, r19, r2 C U1 #16
! 188: ldq r12, 24(r17) C L1 limb 7
! 189: umulh r9, r19, r3 C U1 #17
! 190: addq r21, r22, r13 C L1 carry-in + low product of limb 0
! 191: mulq r10, r19, r4 C U1 #18
! 192: addq r23, r24, r22 C L0 sum 2 mul's
! 193: cmpult r13, r21, r14 C L1 carry from sum
! 194: bgt r18, $L_16_or_more C more groups of 8 follow
! 195: C Exactly one group of 8: finish its multiplies, then join the wind-down.
! 196: cmpult r22, r24, r24 C U0 carry from sum
! 197: umulh r10, r19, r5 C U1 #02
! 198: addq r25, r26, r23 C U0 sum 2 mul's
! 199: mulq r11, r19, r6 C U1 #03
! 200: cmpult r23, r26, r25 C U0 carry from sum
! 201: umulh r11, r19, r7 C U1 #04
! 202: addq r27, r28, r28 C U0 sum 2 mul's
! 203: mulq r12, r19, r8 C U1 #05
! 204: cmpult r28, r27, r15 C L0 carry from sum
! 205: lda r16, 32(r16) C L1 bookkeeping
! 206: addq r13, r31, r13 C U0 start carry cascade
! 207: umulh r12, r19, r21 C U1 #06
! 208: br r31, ret0c C enter the wind-down carry cascade
! 209: C Feed-in when further groups of 8 follow: a first pass with no earlier results to store.
! 210: $L_16_or_more:
! 211: C ---------------------------------------------------------------
! 212: subq r18,1,r18 C r18--: one group fewer left for $Loop
! 213: cmpult r22, r24, r24 C U0 carry from sum
! 214: ldq r9, 32(r17) C L1
! 215:
! 216: umulh r10, r19, r5 C U1 #02
! 217: addq r25, r26, r23 C U0 sum 2 mul's
! 218: mulq r11, r19, r6 C U1 #03
! 219: cmpult r23, r26, r25 C U0 carry from sum
! 220: umulh r11, r19, r7 C U1 #04
! 221: addq r27, r28, r28 C U0 sum 2 mul's
! 222: mulq r12, r19, r8 C U1 #05
! 223: cmpult r28, r27, r15 C L0 carry from sum
! 224: lda r16, 32(r16) C L1 bookkeeping
! 225: addq r13, r31, r13 C U0 start carry cascade (adds r31, i.e. zero)
! 226:
! 227: umulh r12, r19, r21 C U1 #06
! 228: C beq r13, fix0w C U0
! 229: ret0w: addq r22, r14, r26 C L0
! 230: ldq r10, 40(r17) C L1
! 231:
! 232: mulq r9, r19, r22 C U1 #07
! 233: beq r26, fix1w C U0 zero sum: carry may need passing on
! 234: ret1w: addq r23, r24, r27 C L0
! 235: ldq r11, 48(r17) C L1
! 236:
! 237: umulh r9, r19, r23 C U1 #08
! 238: beq r27, fix2w C U0
! 239: ret2w: addq r28, r25, r28 C L0
! 240: ldq r12, 56(r17) C L1
! 241:
! 242: mulq r10, r19, r24 C U1 #09
! 243: beq r28, fix3w C U0
! 244: ret3w: addq r1, r2, r20 C L0 sum 2 mul's
! 245: ldq r9, 64(r17) C L1
! 246:
! 247: addq r3, r4, r2 C L0 #10 2 mul's
! 248: lda r17, 64(r17) C L1 bookkeeping
! 249: cmpult r20, r1, r29 C U0 carry from sum
! 250:
! 251: umulh r10, r19, r25 C U1 #11
! 252: cmpult r2, r4, r4 C U0 carry from sum
! 253: stq r13, -32(r16) C L0 store results 0-3 of this group
! 254: stq r26, -24(r16) C L1
! 255:
! 256: mulq r11, r19, r26 C U1 #12
! 257: addq r5, r6, r14 C U0 sum 2 mul's
! 258: stq r27, -16(r16) C L0
! 259: stq r28, -8(r16) C L1
! 260:
! 261: umulh r11, r19, r27 C U1 #13
! 262: cmpult r14, r6, r3 C U0 carry from sum
! 263: C could do cross-jumping here:
! 264: C bra $L_middle_of_unrolled_loop
! 265: mulq r12, r19, r28 C U1 #14
! 266: addq r7, r3, r5 C L0 eat carry
! 267: addq r20, r15, r20 C U0 carry cascade
! 268: ldq r10, 8(r17) C L1
! 269:
! 270: umulh r12, r19, r1 C U1 #15
! 271: beq r20, fix4 C U0 fix4 resumes at ret4 inside $Loop, not at ret4w
! 272: ret4w: addq r2, r29, r6 C L0
! 273: ldq r11, 16(r17) C L1
! 274:
! 275: mulq r9, r19, r2 C U1 #16
! 276: beq r6, fix5 C U0 fix5 resumes at ret5 inside $Loop
! 277: ret5w: addq r14, r4, r7 C L0
! 278: ldq r12, 24(r17) C L1
! 279:
! 280: umulh r9, r19, r3 C U1 #17
! 281: beq r7, fix6 C U0 fix6 resumes at ret6 inside $Loop
! 282: ret6w: addq r5, r8, r8 C L0 sum 2
! 283: addq r21, r22, r13 C L1 sum 2 mul's
! 284:
! 285: mulq r10, r19, r4 C U1 #18
! 286: addq r23, r24, r22 C L0 sum 2 mul's
! 287: cmpult r13, r21, r14 C L1 carry from sum
! 288: ble r18, $Lend C U0 no further group to load: wind down
! 289: C ---- Main loop: each pass loads 8 limbs and stores 8 result limbs ----
! 290: ALIGN(16)
! 291: $Loop:
! 292: umulh r0, r18, r18 C U1 #01 decrement r18! (r0 = -1, so for r18 > 0 the high half of r0*r18 is r18-1)
! 293: cmpult r8, r5, r29 C L0 carry from last bunch
! 294: cmpult r22, r24, r24 C U0 carry from sum
! 295: ldq r9, 32(r17) C L1
! 296:
! 297: umulh r10, r19, r5 C U1 #02
! 298: addq r25, r26, r23 C U0 sum 2 mul's
! 299: stq r20, 0(r16) C L0 store the 4 results left over from the previous pass
! 300: stq r6, 8(r16) C L1
! 301:
! 302: mulq r11, r19, r6 C U1 #03
! 303: cmpult r23, r26, r25 C U0 carry from sum
! 304: stq r7, 16(r16) C L0
! 305: stq r8, 24(r16) C L1
! 306:
! 307: umulh r11, r19, r7 C U1 #04
! 308: bis r31, r31, r31 C L0 st slosh (no-op: result goes to r31)
! 309: bis r31, r31, r31 C L1 st slosh
! 310: addq r27, r28, r28 C U0 sum 2 mul's
! 311:
! 312: mulq r12, r19, r8 C U1 #05
! 313: cmpult r28, r27, r15 C L0 carry from sum
! 314: lda r16, 64(r16) C L1 bookkeeping
! 315: addq r13, r29, r13 C U0 start carry cascade
! 316:
! 317: umulh r12, r19, r21 C U1 #06
! 318: beq r13, fix0 C U0 zero sum: carry may need passing on
! 319: ret0: addq r22, r14, r26 C L0
! 320: ldq r10, 40(r17) C L1
! 321:
! 322: mulq r9, r19, r22 C U1 #07
! 323: beq r26, fix1 C U0
! 324: ret1: addq r23, r24, r27 C L0
! 325: ldq r11, 48(r17) C L1
! 326:
! 327: umulh r9, r19, r23 C U1 #08
! 328: beq r27, fix2 C U0
! 329: ret2: addq r28, r25, r28 C L0
! 330: ldq r12, 56(r17) C L1
! 331:
! 332: mulq r10, r19, r24 C U1 #09
! 333: beq r28, fix3 C U0
! 334: ret3: addq r1, r2, r20 C L0 sum 2 mul's
! 335: ldq r9, 64(r17) C L1
! 336:
! 337: addq r3, r4, r2 C L0 #10 2 mul's
! 338: bis r31, r31, r31 C U1 mul hole
! 339: lda r17, 64(r17) C L1 bookkeeping
! 340: cmpult r20, r1, r29 C U0 carry from sum
! 341:
! 342: umulh r10, r19, r25 C U1 #11
! 343: cmpult r2, r4, r4 C U0 carry from sum
! 344: stq r13, -32(r16) C L0 store results 0-3 of this pass
! 345: stq r26, -24(r16) C L1
! 346:
! 347: mulq r11, r19, r26 C U1 #12
! 348: addq r5, r6, r14 C U0 sum 2 mul's
! 349: stq r27, -16(r16) C L0
! 350: stq r28, -8(r16) C L1
! 351:
! 352: umulh r11, r19, r27 C U1 #13
! 353: bis r31, r31, r31 C L0 st slosh
! 354: bis r31, r31, r31 C L1 st slosh
! 355: cmpult r14, r6, r3 C U0 carry from sum
! 356: $L_middle_of_unrolled_loop:
! 357: mulq r12, r19, r28 C U1 #14
! 358: addq r7, r3, r5 C L0 eat carry
! 359: addq r20, r15, r20 C U0 carry cascade
! 360: ldq r10, 8(r17) C L1
! 361:
! 362: umulh r12, r19, r1 C U1 #15
! 363: beq r20, fix4 C U0
! 364: ret4: addq r2, r29, r6 C L0
! 365: ldq r11, 16(r17) C L1
! 366:
! 367: mulq r9, r19, r2 C U1 #16
! 368: beq r6, fix5 C U0
! 369: ret5: addq r14, r4, r7 C L0
! 370: ldq r12, 24(r17) C L1
! 371:
! 372: umulh r9, r19, r3 C U1 #17
! 373: beq r7, fix6 C U0
! 374: ret6: addq r5, r8, r8 C L0 sum 2
! 375: addq r21, r22, r13 C L1 sum 2 mul's
! 376:
! 377: mulq r10, r19, r4 C U1 #18
! 378: addq r23, r24, r22 C L0 sum 2 mul's
! 379: cmpult r13, r21, r14 C L1 carry from sum
! 380: bgt r18, $Loop C U0 another pass while r18 > 0
! 381: C ---- Wind-down: finish the last group without loading, then restore and return ----
! 382: $Lend:
! 383: cmpult r8, r5, r29 C L0 carry from last bunch
! 384: cmpult r22, r24, r24 C U0 carry from sum
! 385:
! 386: umulh r10, r19, r5 C U1 #02
! 387: addq r25, r26, r23 C U0 sum 2 mul's
! 388: stq r20, 0(r16) C L0 store the 4 results left over from the previous pass
! 389: stq r6, 8(r16) C L1
! 390:
! 391: mulq r11, r19, r6 C U1 #03
! 392: cmpult r23, r26, r25 C U0 carry from sum
! 393: stq r7, 16(r16) C L0
! 394: stq r8, 24(r16) C L1
! 395:
! 396: umulh r11, r19, r7 C U1 #04
! 397: addq r27, r28, r28 C U0 sum 2 mul's
! 398:
! 399: mulq r12, r19, r8 C U1 #05
! 400: cmpult r28, r27, r15 C L0 carry from sum
! 401: lda r16, 64(r16) C L1 bookkeeping
! 402: addq r13, r29, r13 C U0 start carry cascade
! 403:
! 404: umulh r12, r19, r21 C U1 #06
! 405: beq r13, fix0c C U0 zero sum: carry may need passing on
! 406: ret0c: addq r22, r14, r26 C L0
! 407: beq r26, fix1c C U0
! 408: ret1c: addq r23, r24, r27 C L0
! 409: beq r27, fix2c C U0
! 410: ret2c: addq r28, r25, r28 C L0
! 411: beq r28, fix3c C U0
! 412: ret3c: addq r1, r2, r20 C L0 sum 2 mul's
! 413: addq r3, r4, r2 C L0 #10 2 mul's
! 414: lda r17, 64(r17) C L1 bookkeeping
! 415: cmpult r20, r1, r29 C U0 carry from sum
! 416: cmpult r2, r4, r4 C U0 carry from sum
! 417: stq r13, -32(r16) C L0 store results 0-3 of the last group
! 418: stq r26, -24(r16) C L1
! 419: addq r5, r6, r14 C U0 sum 2 mul's
! 420: stq r27, -16(r16) C L0
! 421: stq r28, -8(r16) C L1
! 422: cmpult r14, r6, r3 C U0 carry from sum
! 423: addq r7, r3, r5 C L0 eat carry
! 424: addq r20, r15, r20 C U0 carry cascade
! 425: beq r20, fix4c C U0
! 426: ret4c: addq r2, r29, r6 C L0
! 427: beq r6, fix5c C U0
! 428: ret5c: addq r14, r4, r7 C L0
! 429: beq r7, fix6c C U0
! 430: ret6c: addq r5, r8, r8 C L0 sum 2
! 431: cmpult r8, r5, r29 C L0 carry from last bunch
! 432: stq r20, 0(r16) C L0 store results 4-7 of the last group
! 433: stq r6, 8(r16) C L1
! 434: stq r7, 16(r16) C L0
! 435: stq r8, 24(r16) C L1
! 436: addq r29, r21, r0 C r0 = carry from the last sum + last prod_high
! 437: C Reload the registers saved at $Large and return.
! 438: ldq r26, 0(r30)
! 439: ldq r9, 8(r30)
! 440: ldq r10, 16(r30)
! 441: ldq r11, 24(r30)
! 442: ldq r12, 32(r30)
! 443: ldq r13, 40(r30)
! 444: ldq r14, 48(r30)
! 445: ldq r15, 56(r30)
! 446: ldq r29, 64(r30)
! 447: lda r30, 224(r30) C undo the r30 adjustment made at $Large
! 448: ret r31, (r26), 1 C return with the carry limb in r0
! 449: C Out-of-line carry fixups: each is entered when a sum in the cascade came out as zero.
! 450: C fix0w: bis r14, r29, r14 C join carries
! 451: C br r31, ret0w
! 452: fix1w: bis r24, r14, r24 C join carries
! 453: br r31, ret1w
! 454: fix2w: bis r25, r24, r25 C join carries
! 455: br r31, ret2w
! 456: fix3w: bis r15, r25, r15 C join carries
! 457: br r31, ret3w
! 458: fix0: bis r14, r29, r14 C join carries
! 459: br r31, ret0
! 460: fix1: bis r24, r14, r24 C join carries
! 461: br r31, ret1
! 462: fix2: bis r25, r24, r25 C join carries
! 463: br r31, ret2
! 464: fix3: bis r15, r25, r15 C join carries
! 465: br r31, ret3
! 466: fix4: bis r29, r15, r29 C join carries
! 467: br r31, ret4
! 468: fix5: bis r4, r29, r4 C join carries
! 469: br r31, ret5
! 470: fix6: addq r5, r4, r5 C can't carry twice!
! 471: br r31, ret6
! 472: fix0c: bis r14, r29, r14 C join carries
! 473: br r31, ret0c
! 474: fix1c: bis r24, r14, r24 C join carries
! 475: br r31, ret1c
! 476: fix2c: bis r25, r24, r25 C join carries
! 477: br r31, ret2c
! 478: fix3c: bis r15, r25, r15 C join carries
! 479: br r31, ret3c
! 480: fix4c: bis r29, r15, r29 C join carries
! 481: br r31, ret4c
! 482: fix5c: bis r4, r29, r4 C join carries
! 483: br r31, ret5c
! 484: fix6c: addq r5, r4, r5 C can't carry twice!
! 485: br r31, ret6c
! 486:
! 487: EPILOGUE(mpn_mul_1)
! 488: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>