Annotation of OpenXM_contrib/gmp/mpn/sparc32/v9/submul_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2: dnl subtract the result from a second limb vector.
3:
1.1.1.2 ! ohara 4: dnl Copyright 1998, 2000, 2001 Free Software Foundation, Inc.
1.1 maekawa 5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
1.1.1.2 ! ohara 25: C Algorithm: We use two floating-point multiplies per limb product, with the
! 26: C invariant v operand split into two 16-bit pieces, and the u operand split
! 27: C into 32-bit pieces. We convert the two 48-bit products and transfer them to
! 28: C the integer unit.
! 29:
! 30: C Speed: 7 cycles/limb on UltraSPARC-1/2.
! 31:
! 32: C Possible optimizations:
! 33: C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
! 34: C memory bandwidth limited, this could save 1.5 cycles/limb.
! 35: C 2. Unroll the inner loop. Since we already use alternating temporary areas,
! 36: C it is straightforward to unroll, using an exit branch in the middle of the
! 37: C loop. Unrolling would allow deeper scheduling, which could improve speed
! 38: C for the L2 cache case.
! 39: C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
! 40: C aren't sufficiently apart-scheduled with just two temp areas.
! 41: C 4. Do some cross-jumping to save about 1/2 the code size.
! 42: C 5. Specialize for particular v values. If its upper 16 bits are zero, we
! 43: C could save many operations.
1.1 maekawa 44:
1.1.1.2 ! ohara 45: C INPUT PARAMETERS
! 46: C rp o0
! 47: C up o1
! 48: C n o2
! 49: C v o3
1.1 maekawa 50:
1.1.1.2 ! ohara 51: define(`FSIZE',224) C size in bytes of the stack area reserved below
1.1 maekawa 52: C mpn_submul_1: rp[i] = rp[i] - up[i]*v - borrow for each limb; -(upper word of the last difference) is returned in o0.
1.1.1.2 ! ohara 53: ASM_START()
1.1 maekawa 54: PROLOGUE(mpn_submul_1)
1.1.1.2 ! ohara 55: add %sp, -FSIZE, %sp C reserve FSIZE bytes of stack for scratch
! 56: sethi %hi(0xffff), %g1
! 57: srl %o3, 16, %g2 C g2 = v >> 16 (upper 16 bits of v)
! 58: or %g1, %lo(0xffff), %g1 C g1 = 0xffff
! 59: and %o3, %g1, %g1 C g1 = v & 0xffff (lower 16 bits of v)
! 60: stx %g1, [%sp+104] C spill both halves of v as 64-bit integers
! 61: stx %g2, [%sp+112]
! 62: ldd [%sp+104], %f6 C reload them into the FP register file
! 63: ldd [%sp+112], %f8
! 64: fxtod %f6, %f6 C f6 = (double) (v & 0xffff)
! 65: fxtod %f8, %f8 C f8 = (double) (v >> 16)
! 66: ld [%sp+104], %f10 C zero f10 (upper word of the 64-bit value stored above)
! 67:
! 68: mov 0, %g3 C cy = 0
! 69:
! 70: define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
! 71:
! 72: add %sp, 160, %o5 C point in scratch area
! 73: and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
! 74:
! 75: subcc %o2, 1, %o2 C n--, sets the flags tested by the branch below
! 76: ld [%o1], %f11 C read up[i]
! 77: add %o1, 4, %o1 C up++
! 78: bne,pt %icc, .L_two_or_more C taken unless n was 1
! 79: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i], f10:f11 = 0:up[i]
! 80: .L_1: fmuld %f2, %f8, %f16 C n == 1: f16 = up[i] * (v >> 16)
! 81: fmuld %f2, %f6, %f4 C f4 = up[i] * (v & 0xffff)
! 82: fdtox %f16, %f14 C convert both products back to 64-bit integers
! 83: fdtox %f4, %f12
! 84: std %f14, [%o5+16] C hand them to the integer unit through the scratch area
! 85: std %f12, [%o5+24]
! 86: ldx [%o5+16], %g2 C p16
! 87: ldx [%o5+24], %g1 C p0
! 88: lduw [%o0], %g5 C read rp[i]
! 89: sllx %g2, 16, %g4 C (p16 << 16)
! 90: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 91: sub %g5, %g4, %g4 C p = rp[i] - p
! 92: stw %g4, [%o0] C store the low 32 bits as the new rp[i]
! 93: srlx %g4, 32, %g3 C new cy
! 94: sub %g0, %g3, %o0 C return value = -cy
! 95: retl
! 96: sub %sp, -FSIZE, %sp C (delay slot) release the stack area
! 97:
! 98: .align 16
! 99: .L_two_or_more: C entered when n != 1
! 100: subcc %o2, 1, %o2
! 101: ld [%o1], %f11 C read up[i]
! 102: fmuld %f2, %f8, %f16
! 103: fmuld %f2, %f6, %f4
! 104: add %o1, 4, %o1 C up++
! 105: bne,pt %icc, .L_three_or_more C taken unless n was 2
! 106: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
! 107: .L_2: fdtox %f16, %f14 C n == 2: finish both limbs with straight-line code
! 108: fdtox %f4, %f12
! 109: std %f14, [%o5+16]
! 110: fmuld %f2, %f8, %f16
! 111: std %f12, [%o5+24]
! 112: fmuld %f2, %f6, %f4
! 113: fdtox %f16, %f14
! 114: fdtox %f4, %f12
! 115: std %f14, [%o5+0]
! 116: std %f12, [%o5+8]
! 117: lduw [%o0], %g5 C read rp[i]
! 118: ldx [%o5+16], %g2 C p16
! 119: ldx [%o5+24], %g1 C p0
! 120: sllx %g2, 16, %g4 C (p16 << 16) * crossjmp pt
! 121: ldx [%o5+0], %g2 C p16
! 122: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 123: ldx [%o5+8], %g1 C p0
! 124: sub %g5, %g4, %g4 C p = rp[i] - p
! 125: stw %g4, [%o0+0]
! 126: srlx %g4, 32, %g3 C new cy
! 127: lduw [%o0+4], %g5 C read rp[i]
! 128: sub %g0, %g3, %g3 C negate cy
! 129: sllx %g2, 16, %g4 C (p16 << 16)
! 130: srl %g3, 0, %g3 C zero most significant 32 bits
! 131: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 132: add %g3, %g4, %g4 C p += cy
! 133: sub %g5, %g4, %g4 C p = rp[i] - p
! 134: stw %g4, [%o0+4]
! 135: srlx %g4, 32, %g3 C new cy
! 136: sub %g0, %g3, %o0 C return value = -cy
! 137: retl
! 138: sub %sp, -FSIZE, %sp C (delay slot) release the stack area
! 139:
! 140: .align 16
! 141: .L_three_or_more: C entered when n is neither 1 nor 2
! 142: subcc %o2, 1, %o2
! 143: ld [%o1], %f11 C read up[i]
! 144: fdtox %f16, %f14
! 145: fdtox %f4, %f12
! 146: std %f14, [%o5+16]
! 147: fmuld %f2, %f8, %f16
! 148: std %f12, [%o5+24]
! 149: fmuld %f2, %f6, %f4
! 150: add %o1, 4, %o1 C up++
! 151: bne,pt %icc, .L_four_or_more C taken unless n was 3
! 152: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
! 153: .L_3: fdtox %f16, %f14 C n == 3: finish the three limbs with straight-line code
! 154: fdtox %f4, %f12
! 155: std %f14, [%o5+0]
! 156: fmuld %f2, %f8, %f16
! 157: std %f12, [%o5+8]
! 158: fmuld %f2, %f6, %f4
! 159: fdtox %f16, %f14
! 160: ldx [%o5+16], %g2 C p16
! 161: fdtox %f4, %f12
! 162: ldx [%o5+24], %g1 C p0
! 163: std %f14, [%o5+16]
! 164: std %f12, [%o5+24]
! 165: lduw [%o0], %g5 C read rp[i]
! 166: sllx %g2, 16, %g4 C (p16 << 16)
! 167: ldx [%o5+0], %g2 C p16
! 168: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 169: ldx [%o5+8], %g1 C p0
! 170: sub %g5, %g4, %g4 C p = rp[i] - p
! 171: stw %g4, [%o0+0]
! 172: srlx %g4, 32, %g3 C new cy
! 173: lduw [%o0+4], %g5 C read rp[i]
! 174: sub %g0, %g3, %g3 C negate cy
! 175: sllx %g2, 16, %g4 C (p16 << 16)
! 176: ldx [%o5+16], %g2 C p16
! 177: srl %g3, 0, %g3 C zero most significant 32 bits
! 178: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 179: ldx [%o5+24], %g1 C p0
! 180: add %g3, %g4, %g4 C p += cy
! 181: sub %g5, %g4, %g4 C p = rp[i] - p
! 182: stw %g4, [%o0+4]
! 183: srlx %g4, 32, %g3 C new cy
! 184: lduw [%o0+8], %g5 C read rp[i]
! 185: sub %g0, %g3, %g3 C negate cy
! 186: sllx %g2, 16, %g4 C (p16 << 16)
! 187: srl %g3, 0, %g3 C zero most significant 32 bits
! 188: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 189: add %g3, %g4, %g4 C p += cy
! 190: sub %g5, %g4, %g4 C p = rp[i] - p
! 191: stw %g4, [%o0+8]
! 192: srlx %g4, 32, %g3 C new cy
! 193: sub %g0, %g3, %o0 C return value = -cy
! 194: retl
! 195: sub %sp, -FSIZE, %sp C (delay slot) release the stack area
! 196:
! 197: .align 16
! 198: .L_four_or_more: C entered when n is not 1, 2 or 3
! 199: subcc %o2, 1, %o2
! 200: ld [%o1], %f11 C read up[i]
! 201: fdtox %f16, %f14
! 202: fdtox %f4, %f12
! 203: std %f14, [%o5+0]
! 204: fmuld %f2, %f8, %f16
! 205: std %f12, [%o5+8]
! 206: fmuld %f2, %f6, %f4
! 207: add %o1, 4, %o1 C up++
! 208: bne,pt %icc, .L_five_or_more C taken unless n was 4
! 209: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
! 210: .L_4: fdtox %f16, %f14 C n == 4: finish the four limbs with straight-line code
! 211: ldx [%o5+16], %g2 C p16
! 212: fdtox %f4, %f12
! 213: ldx [%o5+24], %g1 C p0
! 214: std %f14, [%o5+16]
! 215: fmuld %f2, %f8, %f16
! 216: std %f12, [%o5+24]
! 217: fmuld %f2, %f6, %f4
! 218: add %o1, 4, %o1 C up++
! 219: lduw [%o0], %g5 C read rp[i]
! 220: fdtox %f16, %f14
! 221: sllx %g2, 16, %g4 C (p16 << 16)
! 222: ldx [%o5+0], %g2 C p16
! 223: fdtox %f4, %f12
! 224: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 225: ldx [%o5+8], %g1 C p0
! 226: std %f14, [%o5+0]
! 227: sub %g5, %g4, %g4 C p = rp[i] - p
! 228: std %f12, [%o5+8]
! 229: stw %g4, [%o0+0]
! 230: srlx %g4, 32, %g3 C new cy
! 231: lduw [%o0+4], %g5 C read rp[i]
! 232: sub %g0, %g3, %g3 C negate cy
! 233: sllx %g2, 16, %g4 C (p16 << 16)
! 234: ldx [%o5+16], %g2 C p16
! 235: srl %g3, 0, %g3 C zero most significant 32 bits
! 236: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 237: ldx [%o5+24], %g1 C p0
! 238: add %g3, %g4, %g4 C p += cy
! 239: sub %g5, %g4, %g4 C p = rp[i] - p
! 240: stw %g4, [%o0+4]
! 241: srlx %g4, 32, %g3 C new cy
! 242: lduw [%o0+8], %g5 C read rp[i]
! 243: sub %g0, %g3, %g3 C negate cy
! 244: sllx %g2, 16, %g4 C (p16 << 16)
! 245: ldx [%o5+0], %g2 C p16
! 246: srl %g3, 0, %g3 C zero most significant 32 bits
! 247: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 248: ldx [%o5+8], %g1 C p0
! 249: add %g3, %g4, %g4 C p += cy
! 250: sub %g5, %g4, %g4 C p = rp[i] - p
! 251: stw %g4, [%o0+8]
! 252: srlx %g4, 32, %g3 C new cy
! 253: lduw [%o0+12], %g5 C read rp[i]
! 254: sub %g0, %g3, %g3 C negate cy
! 255: sllx %g2, 16, %g4 C (p16 << 16)
! 256: srl %g3, 0, %g3 C zero most significant 32 bits
! 257: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 258: add %g3, %g4, %g4 C p += cy
! 259: sub %g5, %g4, %g4 C p = rp[i] - p
! 260: stw %g4, [%o0+12]
! 261: srlx %g4, 32, %g3 C new cy
! 262: sub %g0, %g3, %o0 C return value = -cy
! 263: retl
! 264: sub %sp, -FSIZE, %sp C (delay slot) release the stack area
! 265:
! 266: .align 16
! 267: .L_five_or_more: C entered when n is not 1, 2, 3 or 4
! 268: subcc %o2, 1, %o2
! 269: ld [%o1], %f11 C read up[i]
! 270: fdtox %f16, %f14
! 271: ldx [%o5+16], %g2 C p16
! 272: fdtox %f4, %f12
! 273: ldx [%o5+24], %g1 C p0
! 274: std %f14, [%o5+16]
! 275: fmuld %f2, %f8, %f16
! 276: std %f12, [%o5+24]
! 277: fmuld %f2, %f6, %f4
! 278: add %o1, 4, %o1 C up++
! 279: lduw [%o0], %g5 C read rp[i]
! 280: bne,pt %icc, .Loop C taken unless n was 5
! 281: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
! 282: b,a .L_out_5 C n == 5: skip the loop and go straight to the wind-down code
1.1 maekawa 283:
1.1.1.2 ! ohara 284: C BEGIN MAIN LOOP
1.1 maekawa 285: .align 16
1.1.1.2 ! ohara 286: C -- 0
! 287: .Loop: sub %g0, %g3, %g3 C negate cy
! 288: subcc %o2, 1, %o2 C n--, flags are consumed by the bne at the loop bottom
! 289: ld [%o1], %f11 C read up[i]
! 290: fdtox %f16, %f14
! 291: C -- 1
! 292: sllx %g2, 16, %g4 C (p16 << 16)
! 293: add %o0, 4, %o0 C rp++
! 294: ldx [%o5+0], %g2 C p16
! 295: fdtox %f4, %f12
! 296: C -- 2
! 297: srl %g3, 0, %g3 C zero most significant 32 bits
! 298: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 299: ldx [%o5+8], %g1 C p0
! 300: fanop
! 301: C -- 3
! 302: nop
! 303: add %g3, %g4, %g4 C p += cy
! 304: std %f14, [%o5+0]
! 305: fmuld %f2, %f8, %f16
! 306: C -- 4
! 307: nop
! 308: sub %g5, %g4, %g4 C p = rp[i] - p
! 309: std %f12, [%o5+8]
! 310: fmuld %f2, %f6, %f4
! 311: C -- 5
! 312: xor %o5, 16, %o5 C alternate scratch variables
! 313: add %o1, 4, %o1 C up++
! 314: stw %g4, [%o0-4] C store result limb (rp was already advanced above)
! 315: fanop
! 316: C -- 6
! 317: srlx %g4, 32, %g3 C new cy
! 318: lduw [%o0], %g5 C read rp[i]
! 319: bne,pt %icc, .Loop C loop until the limb counter reaches zero
! 320: fxtod %f10, %f2 C (delay slot) f2 = (double) up[i]
! 321: C END MAIN LOOP
! 322:
! 323: .L_out_5: C wind-down: finish the last five limbs, no more up[] loads
! 324: sub %g0, %g3, %g3 C negate cy
! 325: fdtox %f16, %f14
! 326: sllx %g2, 16, %g4 C (p16 << 16)
! 327: ldx [%o5+0], %g2 C p16
! 328: fdtox %f4, %f12
! 329: srl %g3, 0, %g3 C zero most significant 32 bits
! 330: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 331: ldx [%o5+8], %g1 C p0
! 332: add %g4, %g3, %g4 C p += cy
! 333: std %f14, [%o5+0]
! 334: fmuld %f2, %f8, %f16
! 335: sub %g5, %g4, %g4 C p = rp[i] - p
! 336: std %f12, [%o5+8]
! 337: fmuld %f2, %f6, %f4
! 338: xor %o5, 16, %o5 C alternate scratch variables
! 339: stw %g4, [%o0+0]
! 340: srlx %g4, 32, %g3 C new cy
! 341: lduw [%o0+4], %g5 C read rp[i]
! 342:
! 343: sub %g0, %g3, %g3 C negate cy
! 344: fdtox %f16, %f14
! 345: sllx %g2, 16, %g4 C (p16 << 16)
! 346: ldx [%o5+0], %g2 C p16
! 347: fdtox %f4, %f12
! 348: srl %g3, 0, %g3 C zero most significant 32 bits
! 349: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 350: ldx [%o5+8], %g1 C p0
! 351: add %g3, %g4, %g4 C p += cy
! 352: std %f14, [%o5+0]
! 353: sub %g5, %g4, %g4 C p = rp[i] - p
! 354: std %f12, [%o5+8]
! 355: xor %o5, 16, %o5 C alternate scratch variables
! 356: stw %g4, [%o0+4]
! 357: srlx %g4, 32, %g3 C new cy
! 358: lduw [%o0+8], %g5 C read rp[i]
! 359:
! 360: sub %g0, %g3, %g3 C negate cy
! 361: sllx %g2, 16, %g4 C (p16 << 16)
! 362: ldx [%o5+0], %g2 C p16
! 363: srl %g3, 0, %g3 C zero most significant 32 bits
! 364: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 365: ldx [%o5+8], %g1 C p0
! 366: add %g3, %g4, %g4 C p += cy
! 367: sub %g5, %g4, %g4 C p = rp[i] - p
! 368: xor %o5, 16, %o5 C alternate scratch variables
! 369: stw %g4, [%o0+8]
! 370: srlx %g4, 32, %g3 C new cy
! 371: lduw [%o0+12], %g5 C read rp[i]
! 372:
! 373: sub %g0, %g3, %g3 C negate cy
! 374: sllx %g2, 16, %g4 C (p16 << 16)
! 375: ldx [%o5+0], %g2 C p16
! 376: srl %g3, 0, %g3 C zero most significant 32 bits
! 377: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 378: ldx [%o5+8], %g1 C p0
! 379: add %g3, %g4, %g4 C p += cy
! 380: sub %g5, %g4, %g4 C p = rp[i] - p
! 381: stw %g4, [%o0+12]
! 382: srlx %g4, 32, %g3 C new cy
! 383: lduw [%o0+16], %g5 C read rp[i]
! 384:
! 385: sub %g0, %g3, %g3 C negate cy
! 386: sllx %g2, 16, %g4 C (p16 << 16)
! 387: srl %g3, 0, %g3 C zero most significant 32 bits
! 388: add %g1, %g4, %g4 C p = p0 + (p16 << 16)
! 389: add %g3, %g4, %g4 C p += cy
! 390: sub %g5, %g4, %g4 C p = rp[i] - p
! 391: stw %g4, [%o0+16]
! 392: srlx %g4, 32, %g3 C new cy
! 393:
! 394: sub %g0, %g3, %o0 C return value = -cy
! 395: retl
! 396: sub %sp, -FSIZE, %sp C (delay slot) release the stack area
1.1 maekawa 397: EPILOGUE(mpn_submul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>