Annotation of OpenXM_contrib/gmp/mpn/sparc32/v9/sqr_diagonal.asm, Revision 1.1
1.1 ! ohara 1: dnl SPARC v9 32-bit mpn_sqr_diagonal.
! 2:
! 3: dnl Copyright 2001 Free Software Foundation, Inc.
! 4:
! 5: dnl This file is part of the GNU MP Library.
! 6:
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 8: dnl it under the terms of the GNU Lesser General Public License as published
! 9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 10: dnl your option) any later version.
! 11:
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 15: dnl License for more details.
! 16:
! 17: dnl You should have received a copy of the GNU Lesser General Public License
! 18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 20: dnl MA 02111-1307, USA.
! 21:
! 22:
! 23: include(`../config.m4')
! 24:
! 25: C INPUT PARAMETERS
! 26: C rp i0
! 27: C up i1
! 28: C n i2
! 29:
! 30: C This code uses a very deep software pipeline, due to the need for moving data
! 31: C back and forth between the integer registers and the floating-point registers.
! 32: C
! 33: C The code is very large, probably unnecessarily large. Cross-jumping
! 34: C transformation of the wind-down code could reduce the code size considerably.
! 35: C
! 36: C A VIS variant of this code would make the pipeline less deep, since the
! 37: C masking now done in the integer unit could take place in the floating-point
! 38: C unit using the FAND instruction. It would be possible to save several cycles
! 39: C too.
! 40: C
! 41: C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
! 42: C not much slower from the Ecache. It would perhaps be possible to shave off
! 43: C one cycle, but not easily. We cannot do better than 10 cycles/limb with the
! 44: C instructions used, since we have 10 memory operations per limb. But a VIS
! 45: C variant could run three cycles faster than the corresponding non-VIS code.
! 46:
! 47: C This is non-pipelined code showing the algorithm:
! 48: C
! 49: C L(loop):
! 50: C lduw [up+0],%g4 C 00000000hhhhllll
! 51: C sllx %g4,16,%g3 C 0000hhhhllll0000
! 52: C or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 53: C andn %g2,%g5,%g2 C 0000hhhh0000llll
! 54: C stx %g2,[%fp+80]
! 55: C ldd [%fp+80],%f0
! 56: C fitod %f0,%f4 C hi16
! 57: C fitod %f1,%f6 C lo16
! 58: C ld [up+0],%f9
! 59: C fxtod %f8,%f2
! 60: C fmuld %f2,%f4,%f4
! 61: C fmuld %f2,%f6,%f6
! 62: C fdtox %f4,%f4
! 63: C fdtox %f6,%f6
! 64: C std %f4,[%fp-24]
! 65: C std %f6,[%fp-16]
! 66: C ldx [%fp-24],%g2
! 67: C ldx [%fp-16],%g1
! 68: C sllx %g2,16,%g2
! 69: C add %g2,%g1,%g1
! 70: C stw %g1,[rp+0]
! 71: C srlx %g1,32,%l0
! 72: C stw %l0,[rp+4]
! 73: C add up,4,up
! 74: C subcc n,1,n
! 75: C bne,pt %icc,L(loop)
! 76: C add rp,8,rp
! 77:
! 78: define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
! 79: C fanop converts %f12 into %f10; neither register is referenced by any other instruction in this file, so the result is never consumed.
! 80: ASM_START()
! 81:
! 82: TEXT
! 83: ALIGN(4)
! 84: L(noll):
! 85: .word 0 C 32-bit zero constant; the function entry code loads it into %f8
! 86:
! 87: PROLOGUE(mpn_sqr_diagonal)
! 88: save %sp,-256,%sp C new register window and a 256-byte stack frame
! 89: C Load the zero word at L(noll) into %f8: pc-relative when PIC is defined, by absolute address otherwise.
! 90: ifdef(`PIC',
! 91: `L(pc): rd %pc,%o7
! 92: ld [%o7+L(noll)-L(pc)],%f8',
! 93: ` sethi %hi(L(noll)),%g1
! 94: ld [%g1+%lo(L(noll))],%f8')
! 95: C %f8 is never written again, so the pair %f8:%f9 read by fxtod is the 64-bit integer value of the limb held in %f9.
! 96: sethi %hi(0xffff0000),%g5 C mask that clears the XXXX field of a split limb
! 97: add %i1,-8,%i1 C bias up so that [%i1+8] addresses the next limb to split
! 98: C Fill stage 1: fetch limb 0 and split it into its 16-bit halves.
! 99: lduw [%i1+8],%g4 C limb 0, zero-extended to 64 bits
! 100: add %i1,4,%i1 C s1_ptr++
! 101: sllx %g4,16,%g3 C 0000hhhhllll0000
! 102: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 103: subcc %i2,1,%i2 C n--, sets icc for the branch below
! 104: be,pn %icc,L(end1) C n was 1: no overlap possible
! 105: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 106: C Fill stage 2: park split limb 0 in slot [%fp+80]; fetch and split limb 1.
! 107: stx %g2,[%fp+80] C NOTE(review): positive %fp offsets lie outside the frame made by save; presumably the caller argument area is reused as scratch - confirm
! 108: lduw [%i1+8],%g4 C limb 1
! 109: add %i1,4,%i1 C s1_ptr++
! 110: sllx %g4,16,%g3 C 0000hhhhllll0000
! 111: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 112: subcc %i2,1,%i2 C n--
! 113: be,pn %icc,L(end2) C n was 2
! 114: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 115: C Fill stage 3: park split limb 1 in slot [%fp+72]; fetch limb 2; start the floating-point work on limb 0.
! 116: stx %g2,[%fp+72]
! 117: lduw [%i1+8],%g4 C limb 2
! 118: ld [%i1],%f9 C limb 0 again, as the low word of %f8:%f9
! 119: add %i1,4,%i1 C s1_ptr++
! 120: ldd [%fp+80],%f0 C %f0 = hi16 and %f1 = lo16 of limb 0
! 121: sllx %g4,16,%g3 C 0000hhhhllll0000
! 122: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 123: subcc %i2,1,%i2 C n--
! 124: fxtod %f8,%f2 C %f2 = limb 0 as a double
! 125: be,pn %icc,L(end3) C n was 3
! 126: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 127: C Fill stage 4: park split limb 2 in [%fp+80]; fetch limb 3; multiply limb 0 by its halves and save the products at [%fp-24] and [%fp-16]; start on limb 1.
! 128: stx %g2,[%fp+80]
! 129: fitod %f0,%f4 C hi16 of limb 0 as a double
! 130: lduw [%i1+8],%g4 C limb 3
! 131: fitod %f1,%f6 C lo16 of limb 0 as a double
! 132: fmuld %f2,%f4,%f4 C limb 0 * hi16
! 133: ld [%i1],%f9 C limb 1 as the low word of %f8:%f9
! 134: fmuld %f2,%f6,%f6 C limb 0 * lo16
! 135: add %i1,4,%i1 C s1_ptr++
! 136: ldd [%fp+72],%f0 C split halves of limb 1
! 137: fdtox %f4,%f4 C product back to a 64-bit integer
! 138: sllx %g4,16,%g3 C 0000hhhhllll0000
! 139: fdtox %f6,%f6 C product back to a 64-bit integer
! 140: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 141: subcc %i2,1,%i2 C n--
! 142: std %f4,[%fp-24] C p16 of limb 0
! 143: fxtod %f8,%f2 C %f2 = limb 1 as a double
! 144: std %f6,[%fp-16] C p0 of limb 0
! 145: be,pn %icc,L(end4) C n was 4
! 146: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 147: C Fill stage 5: same as stage 4 one limb later, using slot [%fp+72] and the second product pair [%fp-40] and [%fp-32].
! 148: stx %g2,[%fp+72]
! 149: fitod %f0,%f4 C hi16 of limb 1
! 150: lduw [%i1+8],%g4 C limb 4
! 151: fitod %f1,%f6 C lo16 of limb 1
! 152: fmuld %f2,%f4,%f4 C limb 1 * hi16
! 153: ld [%i1],%f9 C limb 2 as the low word of %f8:%f9
! 154: fmuld %f2,%f6,%f6 C limb 1 * lo16
! 155: add %i1,4,%i1 C s1_ptr++
! 156: ldd [%fp+80],%f0 C split halves of limb 2
! 157: fdtox %f4,%f4
! 158: sllx %g4,16,%g3 C 0000hhhhllll0000
! 159: fdtox %f6,%f6
! 160: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 161: subcc %i2,1,%i2 C n--
! 162: std %f4,[%fp-40] C p16 of limb 1
! 163: fxtod %f8,%f2 C %f2 = limb 2 as a double
! 164: std %f6,[%fp-32] C p0 of limb 1
! 165: be,pn %icc,L(end5) C n was 5
! 166: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 167: C Pipeline is full and at least one more limb remains: enter the steady-state loop.
! 168: b,a L(loop) C annulled unconditional branch, so no delay slot instruction is executed
! 169: C Steady state, two limbs per trip. Each half parks the newest split limb, fetches the next limb to split, multiplies an earlier limb by its halves, and combines and stores the square whose partial products were saved one trip earlier. The two halves differ only in the scratch slots they use.
! 170: .align 16
! 171: C --- LOOP BEGIN
! 172: L(loop):
! 173: nop C NOTE(review): the nops presumably pad each four-instruction group delimited by the C --- lines - confirm against the UltraSPARC grouping rules
! 174: nop
! 175: stx %g2,[%fp+80] C park the newest split limb
! 176: fitod %f0,%f4 C hi16 of the limb now being multiplied
! 177: C ---
! 178: nop
! 179: nop
! 180: lduw [%i1+8],%g4 C fetch the next limb to split
! 181: fitod %f1,%f6 C lo16 of the limb now being multiplied
! 182: C ---
! 183: nop
! 184: nop
! 185: ldx [%fp-24],%g2 C p16
! 186: fanop
! 187: C ---
! 188: nop
! 189: nop
! 190: ldx [%fp-16],%g1 C p0
! 191: fmuld %f2,%f4,%f4 C limb * hi16
! 192: C ---
! 193: sllx %g2,16,%g2 C align p16
! 194: add %i0,8,%i0 C res_ptr++
! 195: ld [%i1],%f9 C the limb two behind the one just fetched, as the low word of %f8:%f9
! 196: fmuld %f2,%f6,%f6 C limb * lo16
! 197: C ---
! 198: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 199: add %i1,4,%i1 C s1_ptr++
! 200: ldd [%fp+72],%f0 C split halves parked by the previous half-trip
! 201: fanop
! 202: C ---
! 203: srlx %g1,32,%l0 C high word of the finished square
! 204: nop
! 205: stw %g1,[%i0-8] C low word of the square
! 206: fdtox %f4,%f4 C product back to a 64-bit integer
! 207: C ---
! 208: sllx %g4,16,%g3 C 0000hhhhllll0000
! 209: nop
! 210: stw %l0,[%i0-4] C high word of the square
! 211: fdtox %f6,%f6 C product back to a 64-bit integer
! 212: C ---
! 213: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 214: subcc %i2,1,%i2 C n--
! 215: std %f4,[%fp-24] C save p16 for the combine one trip later
! 216: fxtod %f8,%f2 C limb in %f9 as a double
! 217: C ---
! 218: std %f6,[%fp-16] C save p0
! 219: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 220: be,pn %icc,L(loope) C the limb just fetched was the last: wind down from mid-loop
! 221: fanop C delay slot
! 222: C --- LOOP MIDDLE
! 223: nop
! 224: nop
! 225: stx %g2,[%fp+72] C park the newest split limb in the other slot
! 226: fitod %f0,%f4 C hi16 of the limb now being multiplied
! 227: C ---
! 228: nop
! 229: nop
! 230: lduw [%i1+8],%g4 C fetch the next limb to split
! 231: fitod %f1,%f6 C lo16 of the limb now being multiplied
! 232: C ---
! 233: nop
! 234: nop
! 235: ldx [%fp-40],%g2 C p16
! 236: fanop
! 237: C ---
! 238: nop
! 239: nop
! 240: ldx [%fp-32],%g1 C p0
! 241: fmuld %f2,%f4,%f4 C limb * hi16
! 242: C ---
! 243: sllx %g2,16,%g2 C align p16
! 244: add %i0,8,%i0 C res_ptr++
! 245: ld [%i1],%f9 C the limb two behind the one just fetched
! 246: fmuld %f2,%f6,%f6 C limb * lo16
! 247: C ---
! 248: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 249: add %i1,4,%i1 C s1_ptr++
! 250: ldd [%fp+80],%f0 C split halves parked by the first half of this trip
! 251: fanop
! 252: C ---
! 253: srlx %g1,32,%l0 C high word of the finished square
! 254: nop
! 255: stw %g1,[%i0-8] C low word of the square
! 256: fdtox %f4,%f4
! 257: C ---
! 258: sllx %g4,16,%g3 C 0000hhhhllll0000
! 259: nop
! 260: stw %l0,[%i0-4] C high word of the square
! 261: fdtox %f6,%f6
! 262: C ---
! 263: or %g3,%g4,%g2 C 0000hhhhXXXXllll
! 264: subcc %i2,1,%i2 C n--
! 265: std %f4,[%fp-40] C save p16 for the combine one trip later
! 266: fxtod %f8,%f2 C limb in %f9 as a double
! 267: C ---
! 268: std %f6,[%fp-32] C save p0
! 269: andn %g2,%g5,%g2 C 0000hhhh0000llll
! 270: bne,pt %icc,L(loop) C more limbs remain
! 271: fanop C delay slot
! 272: C --- LOOP END; execution falls through into L(end5) when n reaches zero here
! 273: C Wind-down with the last limb already split in %g2 and [%fp+80] as its slot. Reached from fill stage 5 when n is 5, or by falling out of the bottom of the loop. Five squares remain to be stored.
! 274: L(end5):
! 275: stx %g2,[%fp+80] C park the last split limb
! 276: fitod %f0,%f4 C hi16 of the limb now being multiplied
! 277: fitod %f1,%f6 C lo16 of the limb now being multiplied
! 278: ldx [%fp-24],%g2 C p16
! 279: ldx [%fp-16],%g1 C p0
! 280: fmuld %f2,%f4,%f4 C limb * hi16
! 281: sllx %g2,16,%g2 C align p16
! 282: add %i0,8,%i0 C res_ptr++
! 283: ld [%i1],%f9 C next-to-last limb as the low word of %f8:%f9
! 284: fmuld %f2,%f6,%f6 C limb * lo16
! 285: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 286: add %i1,4,%i1 C s1_ptr++
! 287: ldd [%fp+72],%f0 C split halves of the next-to-last limb
! 288: srlx %g1,32,%l0 C high word of the square
! 289: stw %g1,[%i0-8] C first of five squares, low word
! 290: fdtox %f4,%f4
! 291: stw %l0,[%i0-4] C high word
! 292: fdtox %f6,%f6
! 293: std %f4,[%fp-24]
! 294: fxtod %f8,%f2 C next-to-last limb as a double
! 295: std %f6,[%fp-16]
! 296: C Second of five squares; the ld below fetches the last limb, so no pointer bump follows it.
! 297: fitod %f0,%f4
! 298: fitod %f1,%f6
! 299: ldx [%fp-40],%g2 C p16
! 300: ldx [%fp-32],%g1 C p0
! 301: fmuld %f2,%f4,%f4
! 302: sllx %g2,16,%g2 C align p16
! 303: add %i0,8,%i0 C res_ptr++
! 304: ld [%i1],%f9 C last limb
! 305: fmuld %f2,%f6,%f6
! 306: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 307: ldd [%fp+80],%f0 C split halves of the last limb, parked at the top of this block
! 308: srlx %g1,32,%l0
! 309: stw %g1,[%i0-8]
! 310: fdtox %f4,%f4
! 311: stw %l0,[%i0-4]
! 312: fdtox %f6,%f6
! 313: std %f4,[%fp-40]
! 314: fxtod %f8,%f2 C last limb as a double
! 315: std %f6,[%fp-32]
! 316: C Third of five squares; the multiplies here are for the last limb, so nothing more is loaded from up.
! 317: fitod %f0,%f4
! 318: fitod %f1,%f6
! 319: ldx [%fp-24],%g2 C p16
! 320: ldx [%fp-16],%g1 C p0
! 321: fmuld %f2,%f4,%f4
! 322: sllx %g2,16,%g2 C align p16
! 323: add %i0,8,%i0 C res_ptr++
! 324: fmuld %f2,%f6,%f6
! 325: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 326: srlx %g1,32,%l0
! 327: stw %g1,[%i0-8]
! 328: fdtox %f4,%f4
! 329: stw %l0,[%i0-4]
! 330: fdtox %f6,%f6
! 331: std %f4,[%fp-24] C p16 of the last limb
! 332: std %f6,[%fp-16] C p0 of the last limb
! 333: C Fourth of five squares: no floating-point work remains, only combine and store.
! 334: ldx [%fp-40],%g2 C p16
! 335: ldx [%fp-32],%g1 C p0
! 336: sllx %g2,16,%g2 C align p16
! 337: add %i0,8,%i0 C res_ptr++
! 338: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 339: srlx %g1,32,%l0
! 340: stw %g1,[%i0-8]
! 341: stw %l0,[%i0-4]
! 342: C Fifth and final square, for the last limb.
! 343: ldx [%fp-24],%g2 C p16
! 344: ldx [%fp-16],%g1 C p0
! 345: sllx %g2,16,%g2 C align p16
! 346: add %i0,8,%i0 C res_ptr++
! 347: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 348: srlx %g1,32,%l0
! 349: stw %g1,[%i0-8]
! 350: stw %l0,[%i0-4]
! 351: C All squares stored.
! 352: ret
! 353: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 354: C Wind-down entered from the middle of the loop. Same sequence as L(end5) with the two split-limb slots and the two product pairs exchanged. Five squares remain to be stored.
! 355: L(loope):
! 356: stx %g2,[%fp+72] C park the last split limb
! 357: fitod %f0,%f4 C hi16 of the limb now being multiplied
! 358: fitod %f1,%f6 C lo16 of the limb now being multiplied
! 359: ldx [%fp-40],%g2 C p16
! 360: ldx [%fp-32],%g1 C p0
! 361: fmuld %f2,%f4,%f4 C limb * hi16
! 362: sllx %g2,16,%g2 C align p16
! 363: add %i0,8,%i0 C res_ptr++
! 364: ld [%i1],%f9 C next-to-last limb as the low word of %f8:%f9
! 365: fmuld %f2,%f6,%f6 C limb * lo16
! 366: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 367: add %i1,4,%i1 C s1_ptr++
! 368: ldd [%fp+80],%f0 C split halves of the next-to-last limb
! 369: srlx %g1,32,%l0 C high word of the square
! 370: stw %g1,[%i0-8] C first of five squares, low word
! 371: fdtox %f4,%f4
! 372: stw %l0,[%i0-4] C high word
! 373: fdtox %f6,%f6
! 374: std %f4,[%fp-40]
! 375: fxtod %f8,%f2 C next-to-last limb as a double
! 376: std %f6,[%fp-32]
! 377: C Second of five squares; the ld below fetches the last limb, so no pointer bump follows it.
! 378: fitod %f0,%f4
! 379: fitod %f1,%f6
! 380: ldx [%fp-24],%g2 C p16
! 381: ldx [%fp-16],%g1 C p0
! 382: fmuld %f2,%f4,%f4
! 383: sllx %g2,16,%g2 C align p16
! 384: add %i0,8,%i0 C res_ptr++
! 385: ld [%i1],%f9 C last limb
! 386: fmuld %f2,%f6,%f6
! 387: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 388: ldd [%fp+72],%f0 C split halves of the last limb, parked at the top of this block
! 389: srlx %g1,32,%l0
! 390: stw %g1,[%i0-8]
! 391: fdtox %f4,%f4
! 392: stw %l0,[%i0-4]
! 393: fdtox %f6,%f6
! 394: std %f4,[%fp-24]
! 395: fxtod %f8,%f2 C last limb as a double
! 396: std %f6,[%fp-16]
! 397: C Third of five squares; the multiplies here are for the last limb, so nothing more is loaded from up.
! 398: fitod %f0,%f4
! 399: fitod %f1,%f6
! 400: ldx [%fp-40],%g2 C p16
! 401: ldx [%fp-32],%g1 C p0
! 402: fmuld %f2,%f4,%f4
! 403: sllx %g2,16,%g2 C align p16
! 404: add %i0,8,%i0 C res_ptr++
! 405: fmuld %f2,%f6,%f6
! 406: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 407: srlx %g1,32,%l0
! 408: stw %g1,[%i0-8]
! 409: fdtox %f4,%f4
! 410: stw %l0,[%i0-4]
! 411: fdtox %f6,%f6
! 412: std %f4,[%fp-40] C p16 of the last limb
! 413: std %f6,[%fp-32] C p0 of the last limb
! 414: C Fourth of five squares: no floating-point work remains, only combine and store.
! 415: ldx [%fp-24],%g2 C p16
! 416: ldx [%fp-16],%g1 C p0
! 417: sllx %g2,16,%g2 C align p16
! 418: add %i0,8,%i0 C res_ptr++
! 419: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 420: srlx %g1,32,%l0
! 421: stw %g1,[%i0-8]
! 422: stw %l0,[%i0-4]
! 423: C Fifth and final square, for the last limb.
! 424: ldx [%fp-40],%g2 C p16
! 425: ldx [%fp-32],%g1 C p0
! 426: sllx %g2,16,%g2 C align p16
! 427: add %i0,8,%i0 C res_ptr++
! 428: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 429: srlx %g1,32,%l0
! 430: stw %g1,[%i0-8]
! 431: stw %l0,[%i0-4]
! 432: C All squares stored.
! 433: ret
! 434: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 435: C n is 1: square the single limb with no overlap. On entry %g2 holds the split limb and %i1 is up-4.
! 436: L(end1):
! 437: add %i1,4,%i1 C s1_ptr++
! 438: stx %g2,[%fp+80] C split limb to memory so the FPU can read it
! 439: ld [%i1],%f9 C the limb as the low word of %f8:%f9
! 440: ldd [%fp+80],%f0 C %f0 = hi16, %f1 = lo16
! 441: fxtod %f8,%f2 C limb as a double
! 442: fitod %f0,%f4 C hi16 as a double
! 443: fitod %f1,%f6 C lo16 as a double
! 444: fmuld %f2,%f4,%f4 C limb * hi16
! 445: fmuld %f2,%f6,%f6 C limb * lo16
! 446: fdtox %f4,%f4 C products back to 64-bit integers
! 447: fdtox %f6,%f6
! 448: std %f4,[%fp-24] C through memory back to the integer unit
! 449: std %f6,[%fp-16]
! 450: ldx [%fp-24],%g2 C p16
! 451: ldx [%fp-16],%g1 C p0
! 452: sllx %g2,16,%g2 C align p16
! 453: add %i0,8,%i0 C res_ptr++
! 454: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 455: srlx %g1,32,%l0 C high word of the square
! 456: stw %g1,[%i0-8] C low word of the square
! 457: stw %l0,[%i0-4] C high word of the square
! 458: ret
! 459: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 460: C n is 2. On entry %g2 holds split limb 1, slot [%fp+80] holds split limb 0 and %i1 is up.
! 461: L(end2):
! 462: stx %g2,[%fp+72] C park split limb 1
! 463: ld [%i1],%f9 C limb 0 as the low word of %f8:%f9
! 464: add %i1,4,%i1 C s1_ptr++
! 465: ldd [%fp+80],%f0 C split halves of limb 0
! 466: fxtod %f8,%f2 C limb 0 as a double
! 467: fitod %f0,%f4
! 468: fitod %f1,%f6
! 469: fmuld %f2,%f4,%f4 C limb 0 * hi16
! 470: ld [%i1],%f9 C limb 1; %f2 has already captured limb 0
! 471: fmuld %f2,%f6,%f6 C limb 0 * lo16
! 472: ldd [%fp+72],%f0 C split halves of limb 1
! 473: fdtox %f4,%f4
! 474: fdtox %f6,%f6
! 475: std %f4,[%fp-24] C p16 of limb 0
! 476: fxtod %f8,%f2 C limb 1 as a double
! 477: std %f6,[%fp-16] C p0 of limb 0
! 478: fitod %f0,%f4
! 479: fitod %f1,%f6
! 480: fmuld %f2,%f4,%f4 C limb 1 * hi16
! 481: fmuld %f2,%f6,%f6 C limb 1 * lo16
! 482: fdtox %f4,%f4
! 483: fdtox %f6,%f6
! 484: std %f4,[%fp-40] C p16 of limb 1
! 485: std %f6,[%fp-32] C p0 of limb 1
! 486: ldx [%fp-24],%g2 C p16
! 487: ldx [%fp-16],%g1 C p0
! 488: sllx %g2,16,%g2 C align p16
! 489: add %i0,8,%i0 C res_ptr++
! 490: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 491: srlx %g1,32,%l0
! 492: stw %g1,[%i0-8] C square of limb 0, low word
! 493: stw %l0,[%i0-4] C square of limb 0, high word
! 494: ldx [%fp-40],%g2 C p16
! 495: ldx [%fp-32],%g1 C p0
! 496: sllx %g2,16,%g2 C align p16
! 497: add %i0,8,%i0 C res_ptr++
! 498: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 499: srlx %g1,32,%l0
! 500: stw %g1,[%i0-8] C square of limb 1, low word
! 501: stw %l0,[%i0-4] C square of limb 1, high word
! 502: ret
! 503: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 504: C n is 3. On entry %g2 holds split limb 2, %f0:%f1 the halves of limb 0, %f2 limb 0 as a double, and slot [%fp+72] split limb 1.
! 505: L(end3):
! 506: stx %g2,[%fp+80] C park split limb 2
! 507: fitod %f0,%f4
! 508: fitod %f1,%f6
! 509: fmuld %f2,%f4,%f4 C limb 0 * hi16
! 510: ld [%i1],%f9 C limb 1 as the low word of %f8:%f9
! 511: fmuld %f2,%f6,%f6 C limb 0 * lo16
! 512: add %i1,4,%i1 C s1_ptr++
! 513: ldd [%fp+72],%f0 C split halves of limb 1
! 514: fdtox %f4,%f4
! 515: fdtox %f6,%f6
! 516: std %f4,[%fp-24] C p16 of limb 0
! 517: fxtod %f8,%f2 C limb 1 as a double
! 518: std %f6,[%fp-16] C p0 of limb 0
! 519: fitod %f0,%f4
! 520: fitod %f1,%f6
! 521: fmuld %f2,%f4,%f4 C limb 1 * hi16
! 522: ld [%i1],%f9 C limb 2, the last limb
! 523: fmuld %f2,%f6,%f6 C limb 1 * lo16
! 524: ldd [%fp+80],%f0 C split halves of limb 2
! 525: fdtox %f4,%f4
! 526: fdtox %f6,%f6
! 527: std %f4,[%fp-40] C p16 of limb 1
! 528: fxtod %f8,%f2 C limb 2 as a double
! 529: std %f6,[%fp-32] C p0 of limb 1
! 530: fitod %f0,%f4
! 531: fitod %f1,%f6
! 532: ldx [%fp-24],%g2 C p16
! 533: ldx [%fp-16],%g1 C p0
! 534: fmuld %f2,%f4,%f4 C limb 2 * hi16
! 535: sllx %g2,16,%g2 C align p16
! 536: add %i0,8,%i0 C res_ptr++
! 537: fmuld %f2,%f6,%f6 C limb 2 * lo16
! 538: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 539: srlx %g1,32,%l0
! 540: stw %g1,[%i0-8] C square of limb 0, low word
! 541: fdtox %f4,%f4
! 542: stw %l0,[%i0-4] C square of limb 0, high word
! 543: fdtox %f6,%f6
! 544: std %f4,[%fp-24] C p16 of limb 2, reusing the first product pair
! 545: std %f6,[%fp-16] C p0 of limb 2
! 546: ldx [%fp-40],%g2 C p16
! 547: ldx [%fp-32],%g1 C p0
! 548: sllx %g2,16,%g2 C align p16
! 549: add %i0,8,%i0 C res_ptr++
! 550: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 551: srlx %g1,32,%l0
! 552: stw %g1,[%i0-8] C square of limb 1, low word
! 553: stw %l0,[%i0-4] C square of limb 1, high word
! 554: ldx [%fp-24],%g2 C p16
! 555: ldx [%fp-16],%g1 C p0
! 556: sllx %g2,16,%g2 C align p16
! 557: add %i0,8,%i0 C res_ptr++
! 558: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 559: srlx %g1,32,%l0
! 560: stw %g1,[%i0-8] C square of limb 2, low word
! 561: stw %l0,[%i0-4] C square of limb 2, high word
! 562: ret
! 563: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 564: C n is 4. On entry %g2 holds split limb 3, %f0:%f1 the halves of limb 1, %f2 limb 1 as a double, slot [%fp+80] split limb 2, and [%fp-24] and [%fp-16] the products of limb 0.
! 565: L(end4):
! 566: stx %g2,[%fp+72] C park split limb 3
! 567: fitod %f0,%f4
! 568: fitod %f1,%f6
! 569: fmuld %f2,%f4,%f4 C limb 1 * hi16
! 570: ld [%i1],%f9 C limb 2 as the low word of %f8:%f9
! 571: fmuld %f2,%f6,%f6 C limb 1 * lo16
! 572: add %i1,4,%i1 C s1_ptr++
! 573: ldd [%fp+80],%f0 C split halves of limb 2
! 574: fdtox %f4,%f4
! 575: fdtox %f6,%f6
! 576: std %f4,[%fp-40] C p16 of limb 1
! 577: fxtod %f8,%f2 C limb 2 as a double
! 578: std %f6,[%fp-32] C p0 of limb 1
! 579: fitod %f0,%f4
! 580: fitod %f1,%f6
! 581: ldx [%fp-24],%g2 C p16
! 582: ldx [%fp-16],%g1 C p0
! 583: fmuld %f2,%f4,%f4 C limb 2 * hi16
! 584: sllx %g2,16,%g2 C align p16
! 585: add %i0,8,%i0 C res_ptr++
! 586: ld [%i1],%f9 C limb 3, the last limb
! 587: fmuld %f2,%f6,%f6 C limb 2 * lo16
! 588: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 589: ldd [%fp+72],%f0 C split halves of limb 3
! 590: srlx %g1,32,%l0
! 591: stw %g1,[%i0-8] C square of limb 0, low word
! 592: fdtox %f4,%f4
! 593: stw %l0,[%i0-4] C square of limb 0, high word
! 594: fdtox %f6,%f6
! 595: std %f4,[%fp-24] C p16 of limb 2
! 596: fxtod %f8,%f2 C limb 3 as a double
! 597: std %f6,[%fp-16] C p0 of limb 2
! 598: fitod %f0,%f4
! 599: fitod %f1,%f6
! 600: ldx [%fp-40],%g2 C p16
! 601: ldx [%fp-32],%g1 C p0
! 602: fmuld %f2,%f4,%f4 C limb 3 * hi16
! 603: sllx %g2,16,%g2 C align p16
! 604: add %i0,8,%i0 C res_ptr++
! 605: fmuld %f2,%f6,%f6 C limb 3 * lo16
! 606: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 607: srlx %g1,32,%l0
! 608: stw %g1,[%i0-8] C square of limb 1, low word
! 609: fdtox %f4,%f4
! 610: stw %l0,[%i0-4] C square of limb 1, high word
! 611: fdtox %f6,%f6
! 612: std %f4,[%fp-40] C p16 of limb 3
! 613: std %f6,[%fp-32] C p0 of limb 3
! 614: ldx [%fp-24],%g2 C p16
! 615: ldx [%fp-16],%g1 C p0
! 616: sllx %g2,16,%g2 C align p16
! 617: add %i0,8,%i0 C res_ptr++
! 618: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 619: srlx %g1,32,%l0
! 620: stw %g1,[%i0-8] C square of limb 2, low word
! 621: stw %l0,[%i0-4] C square of limb 2, high word
! 622: ldx [%fp-40],%g2 C p16
! 623: ldx [%fp-32],%g1 C p0
! 624: sllx %g2,16,%g2 C align p16
! 625: add %i0,8,%i0 C res_ptr++
! 626: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
! 627: srlx %g1,32,%l0
! 628: stw %g1,[%i0-8] C square of limb 3, low word
! 629: stw %l0,[%i0-4] C square of limb 3, high word
! 630: ret
! 631: restore %g0,%g0,%o0 C delay slot: pop the register window, leaving 0 in the caller %o0
! 632: EPILOGUE(mpn_sqr_diagonal)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>