Annotation of OpenXM_contrib/gmp/mpn/sparc64/sqr_diagonal.asm, Revision 1.1
1.1 ! ohara 1: dnl SPARC v9 64-bit mpn_sqr_diagonal.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4:
! 5: dnl This file is part of the GNU MP Library.
! 6:
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 8: dnl it under the terms of the GNU Lesser General Public License as published
! 9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 10: dnl your option) any later version.
! 11:
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 15: dnl License for more details.
! 16:
! 17: dnl You should have received a copy of the GNU Lesser General Public License
! 18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 20: dnl MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24: C This was generated by the Sun C compiler. It runs at 22 cycles/limb on the
! 25: C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal
! 26: C code using the same algorithm. For 1-3 limbs, a special loop was generated,
! 27: C which causes performance problems in particular for 2 and 3 limbs.
! 28: C Ultimately, this should be replaced by hand-written code in the same software
! 29: C pipeline style as e.g., addmul_1.asm.
! 30:
! C Assembler preamble.  ASM_START and REGISTER are m4 macros defined outside
! C this file (presumably by the included config.m4 -- TODO confirm what they
! C expand to on each target).  The two REGISTER lines declare %g2 and %g3 as
! C scratch registers; mpn_sqr_diagonal uses %g2 as its limb counter and %g3
! C as a temporary.
! 31: ASM_START()
! 32: REGISTER(%g2,#scratch)
! 33: REGISTER(%g3,#scratch)
! C mpn_sqr_diagonal: for each of the %i2 64-bit limbs read from the source
! C pointer, compute the full 128-bit square and store it as two consecutive
! C 64-bit words (low word first) at the destination pointer.
! C
! C Incoming registers (as seen after the save):
! C   %i0  destination pointer (copied to %o0, advanced 16 bytes per limb)
! C   %i1  source pointer (copied to %g1, advanced 8 bytes per limb)
! C   %i2  limb count.  Both code paths process at least one limb, so a
! C        count >= 1 is assumed -- TODO confirm against callers.
! C
! C Method: each limb u is split into three pieces
! C   lo = u & (2^21-1),  mid = (u >> 21) & (2^21-1),  hi = u >> 42
! C which are passed through the stack to the FPU, converted to double
! C (fxtod), multiplied (fmuld) and converted back to 64-bit integers
! C (fdtox).  Five integer terms return through the stack:
! C   [%sp+2239] lo*lo      [%sp+2231] mid*lo      [%sp+2247] hi*mid
! C   [%sp+2255] hi*hi      [%sp+2223] M = mid*mid + 2*hi*lo
! C and are recombined with shifts and adds:
! C   low word  = lo*lo + (mid*lo << 22) + (M << 42)         (mod 2^64)
! C   high word = hi*mid + (hi*hi << 20) + (M >> 22) + carry
! C where carry is 1 when (low word >> 42) is smaller than the low 22 bits
! C of M, i.e. when adding M << 42 wrapped around.
! C [%sp+2263], [%sp+2271] and [%sp+2279] carry the lo, mid and hi pieces on
! C their way to the FPU.  The offsets presumably include the 2047-byte stack
! C bias of the 64-bit SPARC ABI, which would place all eight slots in bytes
! C 176..239 of the 240-byte frame -- TODO confirm.
! 34: PROLOGUE(mpn_sqr_diagonal)
! C New register window with a 240-byte frame for the FPU transfer slots.
! 35: save %sp, -240, %sp
! 36:
! C Build the masks: %o7 = 0x1fffff (21 bits), %o4 = 0x3fffff (22 bits).
! C %o0 and %o1 are only temporaries for the sethi results here.
! 37: sethi %hi(0x1ffc00), %o0
! 38: sethi %hi(0x3ffc00), %o1
! 39: add %o0, 1023, %o7
! C Count compared with 4; the condition codes stay untouched until the bl.
! 40: cmp %i2, 4
! 41: add %o1, 1023, %o4
! C Working copies of the pointers: %g1 = source, %o0 = destination.
! 42: or %g0, %i1, %g1
! 43: or %g0, %i0, %o0
! C Count < 4 (signed): take the simple one-limb-per-iteration loop.  The
! C delay slot clears the limb counter %g2 on both paths.
! 44: bl,pn %xcc, .Lsmall
! 45: or %g0, 0, %g2
! 46:
! C ---- Pipelined path (count >= 4): prologue --------------------------------
! C Loads limbs 0, 1 and 2.  On reaching .Loop:
! C   - the five product terms of limb 0 are in their stack slots,
! C   - the pieces of limb 1 sit as raw integers in %f0 (hi), %f2 (mid) and
! C     %f4 (lo),
! C   - limb 2 is in %o2 with its hi piece already in %o1,
! C   - %g1 points at limb 3 and %g2 = 3.
! 47: ldx [%i1], %o1
! 48: add %i1, 24, %g1
! 49: or %g0, 3, %g2
! C Split limb 0: hi -> [%sp+2279], lo -> [%sp+2263], mid -> [%sp+2271].
! 50: srlx %o1, 42, %g3
! 51: stx %g3, [%sp+2279]
! 52: and %o1, %o7, %o2
! 53: stx %o2, [%sp+2263]
! 54: srlx %o1, 21, %o1
! 55: ldd [%sp+2279], %f0
! 56: and %o1, %o7, %o1
! 57: stx %o1, [%sp+2271]
! C Limb 1 is loaded and split while the products of limb 0 are computed.
! 58: ldx [%i1+8], %o2
! 59: fxtod %f0, %f12
! 60: srlx %o2, 21, %o1
! 61: and %o2, %o7, %g3
! 62: ldd [%sp+2263], %f2
! 63: fmuld %f12, %f12, %f10
! 64: srlx %o2, 42, %o2
! 65: ldd [%sp+2271], %f0
! 66: and %o1, %o7, %o1
! 67: fxtod %f2, %f8
! C The piece slots are reused for limb 1 once the limb 0 pieces have been
! C loaded into FP registers.
! 68: stx %o2, [%sp+2279]
! 69: stx %o1, [%sp+2271]
! 70: fxtod %f0, %f0
! 71: stx %g3, [%sp+2263]
! 72: fdtox %f10, %f14
! 73: fmuld %f12, %f8, %f6
! C Limb 2.
! 74: ldx [%i1+16], %o2
! 75: std %f14, [%sp+2255]
! 76: fmuld %f0, %f0, %f2
! 77: fmuld %f8, %f8, %f10
! 78: srlx %o2, 42, %o1
! 79: faddd %f6, %f6, %f6
! 80: fmuld %f12, %f0, %f12
! 81: fmuld %f0, %f8, %f8
! C Pick up the raw pieces of limb 1 for the first loop iteration.
! 82: ldd [%sp+2279], %f0
! 83: ldd [%sp+2263], %f4
! 84: fdtox %f10, %f10
! 85: std %f10, [%sp+2239]
! 86: faddd %f2, %f6, %f6
! 87: ldd [%sp+2271], %f2
! 88: fdtox %f12, %f12
! 89: std %f12, [%sp+2247]
! 90: fdtox %f8, %f8
! 91: std %f8, [%sp+2231]
! 92: fdtox %f6, %f6
! 93: std %f6, [%sp+2223]
! 94:
! C ---- Pipelined loop --------------------------------------------------------
! C Each iteration stores one result pair and loads one new source limb.
! C Three limbs are in flight: the oldest has its five terms in the stack
! C slots and is recombined with integer instructions, the next one is being
! C multiplied in the FP registers, and the newest (in %o2) is being split
! C into pieces.  Every term slot is read with ldx before the std that
! C refills it for the following limb.
! C %i0 and %i1 are used as plain temporaries in this loop; the pointers they
! C held were copied to %o0 and %g1 on entry.
! 95: .Loop: srlx %o2, 21, %g3
! 96: stx %o1, [%sp+2279]
! 97: add %g2, 1, %g2
! 98: and %g3, %o7, %o1
! 99: ldx [%sp+2255], %g4
! C Loop test; no instruction between here and the bl at the bottom of the
! C loop modifies the condition codes.
! 100: cmp %g2, %i2
! 101: stx %o1, [%sp+2271]
! 102: add %g1, 8, %g1
! 103: add %o0, 16, %o0
! 104: ldx [%sp+2239], %o1
! 105: fxtod %f0, %f10
! 106: fxtod %f4, %f14
! 107: ldx [%sp+2231], %i0
! 108: ldx [%sp+2223], %g5
! 109: ldx [%sp+2247], %g3
! 110: and %o2, %o7, %o2
! 111: fxtod %f2, %f8
! 112: fmuld %f10, %f10, %f0
! 113: stx %o2, [%sp+2263]
! 114: fmuld %f10, %f14, %f6
! C Fetch the next source limb (%g1 was already advanced by 8 above).
! 115: ldx [%g1-8], %o2
! 116: fmuld %f10, %f8, %f12
! 117: fdtox %f0, %f2
! 118: ldd [%sp+2279], %f0
! 119: fmuld %f8, %f8, %f4
! 120: faddd %f6, %f6, %f6
! 121: fmuld %f14, %f14, %f10
! 122: std %f2, [%sp+2255]
! C Integer recombination of the oldest limb, interleaved with the FP work:
! C %o1 accumulates the low word, %g3 the high word, %g4 the carry.
! 123: sllx %g4, 20, %g4
! 124: ldd [%sp+2271], %f2
! 125: fmuld %f8, %f14, %f8
! 126: sllx %i0, 22, %i1
! 127: fdtox %f12, %f12
! 128: std %f12, [%sp+2247]
! 129: sllx %g5, 42, %i0
! 130: add %o1, %i1, %o1
! 131: faddd %f4, %f6, %f6
! 132: ldd [%sp+2263], %f4
! 133: add %o1, %i0, %o1
! 134: add %g3, %g4, %g3
! 135: fdtox %f10, %f10
! 136: std %f10, [%sp+2239]
! 137: srlx %o1, 42, %g4
! 138: and %g5, %o4, %i0
! 139: fdtox %f8, %f8
! 140: std %f8, [%sp+2231]
! 141: srlx %g5, 22, %g5
! 142: sub %g4, %i0, %g4
! 143: fdtox %f6, %f6
! 144: std %f6, [%sp+2223]
! C The sign bit of the difference is the carry out of the low word.
! 145: srlx %g4, 63, %g4
! 146: add %g3, %g5, %g3
! 147: add %g3, %g4, %g3
! C Store the low word; the high word is stored from the branch delay slot.
! 148: stx %o1, [%o0-16]
! 149: srlx %o2, 42, %o1
! 150: bl,pt %xcc, .Loop
! 151: stx %g3, [%o0-8]
! 152:
! C ---- Pipeline drain --------------------------------------------------------
! C Finish the three limbs still in flight.  %o0 is moved 48 bytes ahead and
! C the three result pairs are stored at [%o0-48]/[%o0-40], [%o0-32]/[%o0-24]
! C and [%o0-16]/[%o0-8].  No further source limb is loaded.
! 153: stx %o1, [%sp+2279]
! 154: srlx %o2, 21, %o1
! 155: fxtod %f0, %f16
! 156: ldx [%sp+2223], %g3
! 157: fxtod %f4, %f6
! 158: and %o2, %o7, %o3
! 159: stx %o3, [%sp+2263]
! 160: fxtod %f2, %f4
! 161: and %o1, %o7, %o1
! 162: ldx [%sp+2231], %o2
! 163: sllx %g3, 42, %g4
! 164: fmuld %f16, %f16, %f14
! 165: stx %o1, [%sp+2271]
! 166: fmuld %f16, %f6, %f8
! 167: add %o0, 48, %o0
! 168: ldx [%sp+2239], %o1
! 169: sllx %o2, 22, %o2
! 170: fmuld %f4, %f4, %f10
! 171: ldx [%sp+2255], %o3
! 172: fdtox %f14, %f14
! 173: fmuld %f4, %f6, %f2
! 174: std %f14, [%sp+2255]
! 175: faddd %f8, %f8, %f12
! 176: add %o1, %o2, %o2
! 177: fmuld %f16, %f4, %f4
! 178: ldd [%sp+2279], %f0
! 179: sllx %o3, 20, %g5
! 180: add %o2, %g4, %o2
! 181: fmuld %f6, %f6, %f6
! 182: srlx %o2, 42, %o3
! 183: and %g3, %o4, %g4
! 184: srlx %g3, 22, %g3
! 185: faddd %f10, %f12, %f16
! 186: ldd [%sp+2271], %f12
! 187: ldd [%sp+2263], %f8
! 188: fxtod %f0, %f0
! 189: sub %o3, %g4, %o3
! 190: ldx [%sp+2247], %o1
! 191: srlx %o3, 63, %o3
! 192: fdtox %f2, %f10
! 193: fxtod %f8, %f8
! 194: std %f10, [%sp+2231]
! 195: fdtox %f6, %f6
! 196: std %f6, [%sp+2239]
! 197: add %o1, %g5, %o1
! 198: fmuld %f0, %f0, %f2
! 199: fdtox %f16, %f16
! 200: std %f16, [%sp+2223]
! 201: add %o1, %g3, %o1
! 202: fdtox %f4, %f4
! 203: std %f4, [%sp+2247]
! 204: fmuld %f0, %f8, %f10
! 205: fxtod %f12, %f12
! 206: add %o1, %o3, %o1
! C Result pair of the oldest limb.
! 207: stx %o2, [%o0-48]
! 208: fmuld %f8, %f8, %f6
! 209: stx %o1, [%o0-40]
! 210: fdtox %f2, %f2
! C Recombine the second limb while the products of the last one finish.
! 211: ldx [%sp+2231], %o2
! 212: faddd %f10, %f10, %f10
! 213: ldx [%sp+2223], %g3
! 214: fmuld %f12, %f12, %f4
! 215: fdtox %f6, %f6
! 216: ldx [%sp+2239], %o1
! 217: sllx %o2, 22, %o2
! 218: fmuld %f12, %f8, %f8
! 219: sllx %g3, 42, %g5
! 220: ldx [%sp+2255], %o3
! 221: fmuld %f0, %f12, %f0
! 222: add %o1, %o2, %o2
! 223: faddd %f4, %f10, %f4
! 224: ldx [%sp+2247], %o1
! 225: add %o2, %g5, %o2
! 226: and %g3, %o4, %g4
! 227: fdtox %f8, %f8
! 228: sllx %o3, 20, %g5
! 229: std %f8, [%sp+2231]
! 230: fdtox %f0, %f0
! 231: srlx %o2, 42, %o3
! 232: add %o1, %g5, %o1
! 233: fdtox %f4, %f4
! 234: srlx %g3, 22, %g3
! 235: sub %o3, %g4, %o3
! 236: std %f6, [%sp+2239]
! 237: std %f4, [%sp+2223]
! 238: srlx %o3, 63, %o3
! 239: add %o1, %g3, %o1
! 240: std %f2, [%sp+2255]
! 241: add %o1, %o3, %o1
! 242: std %f0, [%sp+2247]
! C Result pair of the second limb.
! 243: stx %o2, [%o0-32]
! 244: stx %o1, [%o0-24]
! C Last limb: all five terms are on the stack now; recombine and store.
! 245: ldx [%sp+2231], %o2
! 246: ldx [%sp+2223], %o3
! 247: ldx [%sp+2239], %o1
! 248: sllx %o2, 22, %o2
! 249: sllx %o3, 42, %g5
! 250: ldx [%sp+2255], %g4
! 251: and %o3, %o4, %g3
! 252: add %o1, %o2, %o2
! 253: ldx [%sp+2247], %o1
! 254: add %o2, %g5, %o2
! 255: stx %o2, [%o0-16]
! 256: sllx %g4, 20, %g4
! 257: srlx %o2, 42, %o2
! 258: add %o1, %g4, %o1
! 259: srlx %o3, 22, %o3
! 260: sub %o2, %g3, %o2
! 261: srlx %o2, 63, %o2
! 262: add %o1, %o3, %o1
! 263: add %o1, %o2, %o1
! 264: stx %o1, [%o0-8]
! C Return; the restore only pops the register window (its result goes to
! C %g0), so no return value is set up here.
! 265: ret
! 266: restore %g0, %g0, %g0
! C ---- Simple path (count < 4) -----------------------------------------------
! C One limb per iteration with no overlap between limbs.  %g2 counts the
! C limbs done (cleared in the delay slot of the bl at the top), %g1 is the
! C source pointer and %o0 the destination pointer.
! 267: .Lsmall:
! 268: ldx [%g1], %o2
! 269: .Loop0:
! C Split the limb in %o2: lo -> [%sp+2263], hi -> [%sp+2279],
! C mid -> [%sp+2271], then reload the pieces into FP registers.
! 270: and %o2, %o7, %o1
! 271: stx %o1, [%sp+2263]
! 272: add %g2, 1, %g2
! 273: srlx %o2, 21, %o1
! 274: add %g1, 8, %g1
! 275: srlx %o2, 42, %o2
! 276: stx %o2, [%sp+2279]
! 277: and %o1, %o7, %o1
! 278: ldd [%sp+2263], %f0
! C Loop test; the condition codes are consumed by the bl,a at the bottom and
! C nothing in between modifies them.
! 279: cmp %g2, %i2
! 280: stx %o1, [%sp+2271]
! C %f6 = lo, %f2 = hi, %f10 = mid as doubles.
! 281: fxtod %f0, %f6
! 282: ldd [%sp+2279], %f0
! 283: ldd [%sp+2271], %f4
! 284: fxtod %f0, %f2
! 285: fmuld %f6, %f6, %f0
! 286: fxtod %f4, %f10
! 287: fmuld %f2, %f6, %f4
! 288: fdtox %f0, %f0
! 289: std %f0, [%sp+2239]
! 290: fmuld %f10, %f6, %f8
! 291: fmuld %f10, %f10, %f0
! C 2*hi*lo, then mid*mid + 2*hi*lo, both summed in floating point.
! 292: faddd %f4, %f4, %f6
! 293: fmuld %f2, %f2, %f4
! 294: fdtox %f8, %f8
! 295: std %f8, [%sp+2231]
! 296: fmuld %f2, %f10, %f2
! 297: faddd %f0, %f6, %f0
! 298: fdtox %f4, %f4
! 299: std %f4, [%sp+2255]
! 300: fdtox %f2, %f2
! 301: std %f2, [%sp+2247]
! 302: fdtox %f0, %f0
! 303: std %f0, [%sp+2223]
! C Integer recombination: %o2 builds the low word, %o1 the high word.
! 304: ldx [%sp+2239], %o1
! 305: ldx [%sp+2255], %g4
! 306: ldx [%sp+2231], %o2
! 307: sllx %g4, 20, %g4
! 308: ldx [%sp+2223], %o3
! 309: sllx %o2, 22, %o2
! 310: sllx %o3, 42, %g5
! 311: add %o1, %o2, %o2
! 312: ldx [%sp+2247], %o1
! 313: add %o2, %g5, %o2
! 314: stx %o2, [%o0]
! C Carry out of the low word: 1 when (low word >> 42) is below the low 22
! C bits of the middle term, taken from the sign bit of the difference.
! 315: and %o3, %o4, %g3
! 316: srlx %o2, 42, %o2
! 317: add %o1, %g4, %o1
! 318: srlx %o3, 22, %o3
! 319: sub %o2, %g3, %o2
! 320: srlx %o2, 63, %o2
! 321: add %o1, %o3, %o1
! 322: add %o1, %o2, %o1
! 323: stx %o1, [%o0+8]
! 324: add %o0, 16, %o0
! C Annulled branch: the load of the next limb in the delay slot executes only
! C when the loop continues, so nothing is read beyond the last source limb.
! 325: bl,a,pt %xcc, .Loop0
! 326: ldx [%g1], %o2
! C Return; the restore only pops the register window.
! 327: ret
! 328: restore %g0, %g0, %g0
! 329: EPILOGUE(mpn_sqr_diagonal)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>