Annotation of OpenXM_contrib/gmp/mpn/sparc32/v9/sqr_diagonal.asm, Revision 1.1.1.1
1.1 ohara 1: dnl SPARC v9 32-bit mpn_sqr_diagonal.
2:
3: dnl Copyright 2001 Free Software Foundation, Inc.
4:
5: dnl This file is part of the GNU MP Library.
6:
7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
8: dnl it under the terms of the GNU Lesser General Public License as published
9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
10: dnl your option) any later version.
11:
12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: dnl License for more details.
16:
17: dnl You should have received a copy of the GNU Lesser General Public License
18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: dnl MA 02111-1307, USA.
21:
22:
23: include(`../config.m4')
24:
25: C INPUT PARAMETERS
26: C rp i0
27: C up i1
28: C n i2
29:
30: C This code uses a very deep software pipeline, due to the need for moving data
31: C forth and back between the integer registers and floating-point registers.
32: C
33: C The code is very large, probably unnecessarily large. Cross-jumping
34: C transformation of the wind-down code could reduce the code size considerably.
35: C
36: C A VIS variant of this code would make the pipeline less deep, since the
37: C masking now done in the integer unit could take place in the floating-point
38: C unit using the FAND instruction. It would be possible to save several cycles
39: C too.
40: C
41: C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
42: C not much slower from the Ecache. It would perhaps be possible to shave off
43: C one cycle, but not easily. We cannot do better than 10 cycles/limb with the
44: C used instructions, since we have 10 memory operations per limb. But a VIS
45: C variant could run three cycles faster than the corresponding non-VIS code.
46:
47: C This is non-pipelined code showing the algorithm:
48: C
49: C L(loop):
50: C lduw [up+0],%g4 C 00000000hhhhllll
51: C sllx %g4,16,%g3 C 0000hhhhllll0000
52: C or %g3,%g4,%g2 C 0000hhhhXXXXllll
53: C andn %g2,%g5,%g2 C 0000hhhh0000llll
54: C stx %g2,[%fp+80]
55: C ldd [%fp+80],%f0
56: C fitod %f0,%f4 C hi16
57: C fitod %f1,%f6 C lo16
58: C ld [up+0],%f9
59: C fxtod %f8,%f2
60: C fmuld %f2,%f4,%f4
61: C fmuld %f2,%f6,%f6
62: C fdtox %f4,%f4
63: C fdtox %f6,%f6
64: C std %f4,[%fp-24]
65: C std %f6,[%fp-16]
66: C ldx [%fp-24],%g2
67: C ldx [%fp-16],%g1
68: C sllx %g2,16,%g2
69: C add %g2,%g1,%g1
70: C stw %g1,[rp+0]
71: C srlx %g1,32,%l0
72: C stw %l0,[rp+4]
73: C add up,4,up
74: C subcc n,1,n
75: C bne,pt %icc,L(loop)
76: C add rp,8,rp
77:
dnl  fanop expands to a fitod on scratch registers %f12/%f10 whose result is
dnl  never consumed; it fills an FA-pipe issue slot without disturbing any
dnl  live floating-point value.
78: define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
79:
80: ASM_START()
81:
82: TEXT
83: ALIGN(4)
C L(noll) is a 32-bit zero constant.  The prologue loads it into %f8 so that
C the register pair %f8:%f9 holds a zero-extended 64-bit integer whenever a
C limb is loaded into %f9; fxtod then converts that pair exactly to double.
84: L(noll):
85: .word 0
86:
C mpn_sqr_diagonal(rp=%i0, up=%i1, n=%i2): for each 32-bit limb up[i], store
C the 64-bit square up[i]^2 as two 32-bit words at rp[2i], rp[2i+1].
C Entry code plus the software-pipeline feed-in: up to four limbs are peeled
C into the pipeline before the main loop; each subcc/be pair bails out to the
C wind-down code (L(end1)..L(end5)) that matches how many limbs were in flight.
87: PROLOGUE(mpn_sqr_diagonal)
88: save %sp,-256,%sp
89:
C Load the zero word at L(noll) into %f8 (PC-relative when PIC, absolute
C otherwise).  %f8 stays zero for the whole function; see comment at L(noll).
90: ifdef(`PIC',
91: `L(pc): rd %pc,%o7
92: ld [%o7+L(noll)-L(pc)],%f8',
93: ` sethi %hi(L(noll)),%g1
94: ld [%g1+%lo(L(noll))],%f8')
95:
C %g5 = 0xffff0000: andn with this mask clears the garbage middle bits of the
C merged word (see the algorithm sketch above: 0000hhhhXXXXllll -> 0000hhhh0000llll).
C up is biased by -8 so that the loop's lduw [%i1+8] / ld [%i1] pair reads the
C same limb in its integer and FP staging forms.
96: sethi %hi(0xffff0000),%g5
97: add %i1,-8,%i1
98:
C Feed-in stage 1: split limb 0 into hi16/lo16 halves; n==1 -> L(end1).
C (The andn in each branch delay slot executes whether or not the branch is taken.)
99: lduw [%i1+8],%g4
100: add %i1,4,%i1 C s1_ptr++
101: sllx %g4,16,%g3 C 0000hhhhllll0000
102: or %g3,%g4,%g2 C 0000hhhhXXXXllll
103: subcc %i2,1,%i2
104: be,pn %icc,L(end1)
105: andn %g2,%g5,%g2 C 0000hhhh0000llll
106:
C Feed-in stage 2: stage limb 0 at [%fp+80], split limb 1; n==2 -> L(end2).
107: stx %g2,[%fp+80]
108: lduw [%i1+8],%g4
109: add %i1,4,%i1 C s1_ptr++
110: sllx %g4,16,%g3 C 0000hhhhllll0000
111: or %g3,%g4,%g2 C 0000hhhhXXXXllll
112: subcc %i2,1,%i2
113: be,pn %icc,L(end2)
114: andn %g2,%g5,%g2 C 0000hhhh0000llll
115:
C Feed-in stage 3: stage limb 1 at [%fp+72], start converting limb 0
C (ldd [%fp+80] -> %f0:%f1, limb into %f9, fxtod %f8 -> %f2); n==3 -> L(end3).
116: stx %g2,[%fp+72]
117: lduw [%i1+8],%g4
118: ld [%i1],%f9
119: add %i1,4,%i1 C s1_ptr++
120: ldd [%fp+80],%f0
121: sllx %g4,16,%g3 C 0000hhhhllll0000
122: or %g3,%g4,%g2 C 0000hhhhXXXXllll
123: subcc %i2,1,%i2
124: fxtod %f8,%f2
125: be,pn %icc,L(end3)
126: andn %g2,%g5,%g2 C 0000hhhh0000llll
127:
C Feed-in stage 4: first partial products for limb 0 land at [%fp-24]/[%fp-16]
C while limb 2 is split and limb 1 enters conversion; n==4 -> L(end4).
128: stx %g2,[%fp+80]
129: fitod %f0,%f4
130: lduw [%i1+8],%g4
131: fitod %f1,%f6
132: fmuld %f2,%f4,%f4
133: ld [%i1],%f9
134: fmuld %f2,%f6,%f6
135: add %i1,4,%i1 C s1_ptr++
136: ldd [%fp+72],%f0
137: fdtox %f4,%f4
138: sllx %g4,16,%g3 C 0000hhhhllll0000
139: fdtox %f6,%f6
140: or %g3,%g4,%g2 C 0000hhhhXXXXllll
141: subcc %i2,1,%i2
142: std %f4,[%fp-24]
143: fxtod %f8,%f2
144: std %f6,[%fp-16]
145: be,pn %icc,L(end4)
146: andn %g2,%g5,%g2 C 0000hhhh0000llll
147:
C Feed-in stage 5: limb 1 products land at [%fp-40]/[%fp-32]; n==5 -> L(end5),
C otherwise the pipeline is full and the steady-state loop takes over.
148: stx %g2,[%fp+72]
149: fitod %f0,%f4
150: lduw [%i1+8],%g4
151: fitod %f1,%f6
152: fmuld %f2,%f4,%f4
153: ld [%i1],%f9
154: fmuld %f2,%f6,%f6
155: add %i1,4,%i1 C s1_ptr++
156: ldd [%fp+80],%f0
157: fdtox %f4,%f4
158: sllx %g4,16,%g3 C 0000hhhhllll0000
159: fdtox %f6,%f6
160: or %g3,%g4,%g2 C 0000hhhhXXXXllll
161: subcc %i2,1,%i2
162: std %f4,[%fp-40]
163: fxtod %f8,%f2
164: std %f6,[%fp-32]
165: be,pn %icc,L(end5)
166: andn %g2,%g5,%g2 C 0000hhhh0000llll
167:
168: b,a L(loop)
169:
C Steady-state loop: two limbs retire per iteration, in two symmetric halves.
C The halves alternate between the integer staging slots [%fp+80]/[%fp+72] and
C the product slots [%fp-24]/[%fp-16] versus [%fp-40]/[%fp-32], so each store
C has a full half-iteration to complete before its matching load.  Each
C "C ---" separator marks one 4-instruction issue group; nop/fanop pad groups
C to keep the hand scheduling fixed.  If the counter expires in the first
C half, the be,pn exits to L(loope); the second half's bne,pt falls through
C to L(end5) when it expires there.
170: .align 16
171: C --- LOOP BEGIN
172: L(loop):
173: nop
174: nop
175: stx %g2,[%fp+80]
176: fitod %f0,%f4
177: C ---
178: nop
179: nop
180: lduw [%i1+8],%g4
181: fitod %f1,%f6
182: C ---
183: nop
184: nop
185: ldx [%fp-24],%g2 C p16
186: fanop
187: C ---
188: nop
189: nop
190: ldx [%fp-16],%g1 C p0
191: fmuld %f2,%f4,%f4
192: C ---
193: sllx %g2,16,%g2 C align p16
194: add %i0,8,%i0 C res_ptr++
195: ld [%i1],%f9
196: fmuld %f2,%f6,%f6
197: C ---
198: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
199: add %i1,4,%i1 C s1_ptr++
200: ldd [%fp+72],%f0
201: fanop
202: C ---
203: srlx %g1,32,%l0
204: nop
205: stw %g1,[%i0-8]
206: fdtox %f4,%f4
207: C ---
208: sllx %g4,16,%g3 C 0000hhhhllll0000
209: nop
210: stw %l0,[%i0-4]
211: fdtox %f6,%f6
212: C ---
213: or %g3,%g4,%g2 C 0000hhhhXXXXllll
214: subcc %i2,1,%i2
215: std %f4,[%fp-24]
216: fxtod %f8,%f2
217: C ---
218: std %f6,[%fp-16]
219: andn %g2,%g5,%g2 C 0000hhhh0000llll
220: be,pn %icc,L(loope)
221: fanop
222: C --- LOOP MIDDLE
223: nop
224: nop
225: stx %g2,[%fp+72]
226: fitod %f0,%f4
227: C ---
228: nop
229: nop
230: lduw [%i1+8],%g4
231: fitod %f1,%f6
232: C ---
233: nop
234: nop
235: ldx [%fp-40],%g2 C p16
236: fanop
237: C ---
238: nop
239: nop
240: ldx [%fp-32],%g1 C p0
241: fmuld %f2,%f4,%f4
242: C ---
243: sllx %g2,16,%g2 C align p16
244: add %i0,8,%i0 C res_ptr++
245: ld [%i1],%f9
246: fmuld %f2,%f6,%f6
247: C ---
248: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
249: add %i1,4,%i1 C s1_ptr++
250: ldd [%fp+80],%f0
251: fanop
252: C ---
253: srlx %g1,32,%l0
254: nop
255: stw %g1,[%i0-8]
256: fdtox %f4,%f4
257: C ---
258: sllx %g4,16,%g3 C 0000hhhhllll0000
259: nop
260: stw %l0,[%i0-4]
261: fdtox %f6,%f6
262: C ---
263: or %g3,%g4,%g2 C 0000hhhhXXXXllll
264: subcc %i2,1,%i2
265: std %f4,[%fp-40]
266: fxtod %f8,%f2
267: C ---
268: std %f6,[%fp-32]
269: andn %g2,%g5,%g2 C 0000hhhh0000llll
270: bne,pt %icc,L(loop)
271: fanop
272: C --- LOOP END
273:
273:
C Wind-down reached from the feed-in's fifth stage and by falling out of the
C loop's second half: the pipeline still holds several limbs in flight.
C Finish each pending multiply, then drain the two product-slot pairs
C ([%fp-24]/[%fp-16] and [%fp-40]/[%fp-32]) alternately, combining p16/p0
C and storing the low and high result words for each remaining limb.
274: L(end5):
275: stx %g2,[%fp+80]
276: fitod %f0,%f4
277: fitod %f1,%f6
278: ldx [%fp-24],%g2 C p16
279: ldx [%fp-16],%g1 C p0
280: fmuld %f2,%f4,%f4
281: sllx %g2,16,%g2 C align p16
282: add %i0,8,%i0 C res_ptr++
283: ld [%i1],%f9
284: fmuld %f2,%f6,%f6
285: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
286: add %i1,4,%i1 C s1_ptr++
287: ldd [%fp+72],%f0
288: srlx %g1,32,%l0
289: stw %g1,[%i0-8]
290: fdtox %f4,%f4
291: stw %l0,[%i0-4]
292: fdtox %f6,%f6
293: std %f4,[%fp-24]
294: fxtod %f8,%f2
295: std %f6,[%fp-16]
296:
297: fitod %f0,%f4
298: fitod %f1,%f6
299: ldx [%fp-40],%g2 C p16
300: ldx [%fp-32],%g1 C p0
301: fmuld %f2,%f4,%f4
302: sllx %g2,16,%g2 C align p16
303: add %i0,8,%i0 C res_ptr++
304: ld [%i1],%f9
305: fmuld %f2,%f6,%f6
306: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
307: ldd [%fp+80],%f0
308: srlx %g1,32,%l0
309: stw %g1,[%i0-8]
310: fdtox %f4,%f4
311: stw %l0,[%i0-4]
312: fdtox %f6,%f6
313: std %f4,[%fp-40]
314: fxtod %f8,%f2
315: std %f6,[%fp-32]
316:
C No more limbs to fetch from here on: only the in-flight products remain.
317: fitod %f0,%f4
318: fitod %f1,%f6
319: ldx [%fp-24],%g2 C p16
320: ldx [%fp-16],%g1 C p0
321: fmuld %f2,%f4,%f4
322: sllx %g2,16,%g2 C align p16
323: add %i0,8,%i0 C res_ptr++
324: fmuld %f2,%f6,%f6
325: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
326: srlx %g1,32,%l0
327: stw %g1,[%i0-8]
328: fdtox %f4,%f4
329: stw %l0,[%i0-4]
330: fdtox %f6,%f6
331: std %f4,[%fp-24]
332: std %f6,[%fp-16]
333:
334: ldx [%fp-40],%g2 C p16
335: ldx [%fp-32],%g1 C p0
336: sllx %g2,16,%g2 C align p16
337: add %i0,8,%i0 C res_ptr++
338: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
339: srlx %g1,32,%l0
340: stw %g1,[%i0-8]
341: stw %l0,[%i0-4]
342:
343: ldx [%fp-24],%g2 C p16
344: ldx [%fp-16],%g1 C p0
345: sllx %g2,16,%g2 C align p16
346: add %i0,8,%i0 C res_ptr++
347: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
348: srlx %g1,32,%l0
349: stw %g1,[%i0-8]
350: stw %l0,[%i0-4]
351:
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
352: ret
353: restore %g0,%g0,%o0
354:
C Wind-down taken when the counter expires in the loop's FIRST half (the
C be,pn at the loop middle).  Mirror image of L(end5): the staging and
C product slots are used in the opposite phase ([%fp+72]/[%fp+80] and
C [%fp-40]/[%fp-32] first), but the drain sequence is otherwise identical.
355: L(loope):
356: stx %g2,[%fp+72]
357: fitod %f0,%f4
358: fitod %f1,%f6
359: ldx [%fp-40],%g2 C p16
360: ldx [%fp-32],%g1 C p0
361: fmuld %f2,%f4,%f4
362: sllx %g2,16,%g2 C align p16
363: add %i0,8,%i0 C res_ptr++
364: ld [%i1],%f9
365: fmuld %f2,%f6,%f6
366: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
367: add %i1,4,%i1 C s1_ptr++
368: ldd [%fp+80],%f0
369: srlx %g1,32,%l0
370: stw %g1,[%i0-8]
371: fdtox %f4,%f4
372: stw %l0,[%i0-4]
373: fdtox %f6,%f6
374: std %f4,[%fp-40]
375: fxtod %f8,%f2
376: std %f6,[%fp-32]
377:
378: fitod %f0,%f4
379: fitod %f1,%f6
380: ldx [%fp-24],%g2 C p16
381: ldx [%fp-16],%g1 C p0
382: fmuld %f2,%f4,%f4
383: sllx %g2,16,%g2 C align p16
384: add %i0,8,%i0 C res_ptr++
385: ld [%i1],%f9
386: fmuld %f2,%f6,%f6
387: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
388: ldd [%fp+72],%f0
389: srlx %g1,32,%l0
390: stw %g1,[%i0-8]
391: fdtox %f4,%f4
392: stw %l0,[%i0-4]
393: fdtox %f6,%f6
394: std %f4,[%fp-24]
395: fxtod %f8,%f2
396: std %f6,[%fp-16]
397:
C No more limbs to fetch; finish the remaining in-flight products.
398: fitod %f0,%f4
399: fitod %f1,%f6
400: ldx [%fp-40],%g2 C p16
401: ldx [%fp-32],%g1 C p0
402: fmuld %f2,%f4,%f4
403: sllx %g2,16,%g2 C align p16
404: add %i0,8,%i0 C res_ptr++
405: fmuld %f2,%f6,%f6
406: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
407: srlx %g1,32,%l0
408: stw %g1,[%i0-8]
409: fdtox %f4,%f4
410: stw %l0,[%i0-4]
411: fdtox %f6,%f6
412: std %f4,[%fp-40]
413: std %f6,[%fp-32]
414:
415: ldx [%fp-24],%g2 C p16
416: ldx [%fp-16],%g1 C p0
417: sllx %g2,16,%g2 C align p16
418: add %i0,8,%i0 C res_ptr++
419: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
420: srlx %g1,32,%l0
421: stw %g1,[%i0-8]
422: stw %l0,[%i0-4]
423:
424: ldx [%fp-40],%g2 C p16
425: ldx [%fp-32],%g1 C p0
426: sllx %g2,16,%g2 C align p16
427: add %i0,8,%i0 C res_ptr++
428: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
429: srlx %g1,32,%l0
430: stw %g1,[%i0-8]
431: stw %l0,[%i0-4]
432:
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
433: ret
434: restore %g0,%g0,%o0
435:
C n == 1: the first subcc hit zero, so only one limb is pending.  Compute its
C square straight through (no pipelining): convert the masked hi16/lo16 word
C and the zero-extended limb, multiply, convert back to integers, and combine
C the shifted p16 with p0 to form the 64-bit square, stored as two words.
436: L(end1):
437: add %i1,4,%i1 C s1_ptr++
438: stx %g2,[%fp+80]
439: ld [%i1],%f9
440: ldd [%fp+80],%f0
441: fxtod %f8,%f2
442: fitod %f0,%f4
443: fitod %f1,%f6
444: fmuld %f2,%f4,%f4
445: fmuld %f2,%f6,%f6
446: fdtox %f4,%f4
447: fdtox %f6,%f6
448: std %f4,[%fp-24]
449: std %f6,[%fp-16]
450: ldx [%fp-24],%g2 C p16
451: ldx [%fp-16],%g1 C p0
452: sllx %g2,16,%g2 C align p16
453: add %i0,8,%i0 C res_ptr++
454: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
455: srlx %g1,32,%l0
456: stw %g1,[%i0-8]
457: stw %l0,[%i0-4]
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
458: ret
459: restore %g0,%g0,%o0
460:
C n == 2: two limbs pending (limb 0 staged at [%fp+80], limb 1 in %g2).
C Square both back to back — products at [%fp-24]/[%fp-16] and
C [%fp-40]/[%fp-32] — then combine and store each result pair.
461: L(end2):
462: stx %g2,[%fp+72]
463: ld [%i1],%f9
464: add %i1,4,%i1 C s1_ptr++
465: ldd [%fp+80],%f0
466: fxtod %f8,%f2
467: fitod %f0,%f4
468: fitod %f1,%f6
469: fmuld %f2,%f4,%f4
470: ld [%i1],%f9
471: fmuld %f2,%f6,%f6
472: ldd [%fp+72],%f0
473: fdtox %f4,%f4
474: fdtox %f6,%f6
475: std %f4,[%fp-24]
476: fxtod %f8,%f2
477: std %f6,[%fp-16]
478: fitod %f0,%f4
479: fitod %f1,%f6
480: fmuld %f2,%f4,%f4
481: fmuld %f2,%f6,%f6
482: fdtox %f4,%f4
483: fdtox %f6,%f6
484: std %f4,[%fp-40]
485: std %f6,[%fp-32]
486: ldx [%fp-24],%g2 C p16
487: ldx [%fp-16],%g1 C p0
488: sllx %g2,16,%g2 C align p16
489: add %i0,8,%i0 C res_ptr++
490: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
491: srlx %g1,32,%l0
492: stw %g1,[%i0-8]
493: stw %l0,[%i0-4]
494: ldx [%fp-40],%g2 C p16
495: ldx [%fp-32],%g1 C p0
496: sllx %g2,16,%g2 C align p16
497: add %i0,8,%i0 C res_ptr++
498: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
499: srlx %g1,32,%l0
500: stw %g1,[%i0-8]
501: stw %l0,[%i0-4]
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
502: ret
503: restore %g0,%g0,%o0
504:
C n == 3: three limbs pending (limb 0 already converting in %f2, limbs 1/2
C staged at [%fp+72]/[%fp+80]).  Finish the three multiplies, then combine
C and store the three 64-bit squares from the alternating product slots.
505: L(end3):
506: stx %g2,[%fp+80]
507: fitod %f0,%f4
508: fitod %f1,%f6
509: fmuld %f2,%f4,%f4
510: ld [%i1],%f9
511: fmuld %f2,%f6,%f6
512: add %i1,4,%i1 C s1_ptr++
513: ldd [%fp+72],%f0
514: fdtox %f4,%f4
515: fdtox %f6,%f6
516: std %f4,[%fp-24]
517: fxtod %f8,%f2
518: std %f6,[%fp-16]
519: fitod %f0,%f4
520: fitod %f1,%f6
521: fmuld %f2,%f4,%f4
522: ld [%i1],%f9
523: fmuld %f2,%f6,%f6
524: ldd [%fp+80],%f0
525: fdtox %f4,%f4
526: fdtox %f6,%f6
527: std %f4,[%fp-40]
528: fxtod %f8,%f2
529: std %f6,[%fp-32]
530: fitod %f0,%f4
531: fitod %f1,%f6
532: ldx [%fp-24],%g2 C p16
533: ldx [%fp-16],%g1 C p0
534: fmuld %f2,%f4,%f4
535: sllx %g2,16,%g2 C align p16
536: add %i0,8,%i0 C res_ptr++
537: fmuld %f2,%f6,%f6
538: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
539: srlx %g1,32,%l0
540: stw %g1,[%i0-8]
541: fdtox %f4,%f4
542: stw %l0,[%i0-4]
543: fdtox %f6,%f6
544: std %f4,[%fp-24]
545: std %f6,[%fp-16]
546: ldx [%fp-40],%g2 C p16
547: ldx [%fp-32],%g1 C p0
548: sllx %g2,16,%g2 C align p16
549: add %i0,8,%i0 C res_ptr++
550: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
551: srlx %g1,32,%l0
552: stw %g1,[%i0-8]
553: stw %l0,[%i0-4]
554: ldx [%fp-24],%g2 C p16
555: ldx [%fp-16],%g1 C p0
556: sllx %g2,16,%g2 C align p16
557: add %i0,8,%i0 C res_ptr++
558: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
559: srlx %g1,32,%l0
560: stw %g1,[%i0-8]
561: stw %l0,[%i0-4]
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
562: ret
563: restore %g0,%g0,%o0
564:
C n == 4: four limbs pending, one deeper into the pipeline than L(end3).
C Drain the remaining multiplies and product slots in the same alternating
C pattern, storing each 64-bit square as a low/high word pair.
565: L(end4):
566: stx %g2,[%fp+72]
567: fitod %f0,%f4
568: fitod %f1,%f6
569: fmuld %f2,%f4,%f4
570: ld [%i1],%f9
571: fmuld %f2,%f6,%f6
572: add %i1,4,%i1 C s1_ptr++
573: ldd [%fp+80],%f0
574: fdtox %f4,%f4
575: fdtox %f6,%f6
576: std %f4,[%fp-40]
577: fxtod %f8,%f2
578: std %f6,[%fp-32]
579: fitod %f0,%f4
580: fitod %f1,%f6
581: ldx [%fp-24],%g2 C p16
582: ldx [%fp-16],%g1 C p0
583: fmuld %f2,%f4,%f4
584: sllx %g2,16,%g2 C align p16
585: add %i0,8,%i0 C res_ptr++
586: ld [%i1],%f9
587: fmuld %f2,%f6,%f6
588: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
589: ldd [%fp+72],%f0
590: srlx %g1,32,%l0
591: stw %g1,[%i0-8]
592: fdtox %f4,%f4
593: stw %l0,[%i0-4]
594: fdtox %f6,%f6
595: std %f4,[%fp-24]
596: fxtod %f8,%f2
597: std %f6,[%fp-16]
598: fitod %f0,%f4
599: fitod %f1,%f6
600: ldx [%fp-40],%g2 C p16
601: ldx [%fp-32],%g1 C p0
602: fmuld %f2,%f4,%f4
603: sllx %g2,16,%g2 C align p16
604: add %i0,8,%i0 C res_ptr++
605: fmuld %f2,%f6,%f6
606: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
607: srlx %g1,32,%l0
608: stw %g1,[%i0-8]
609: fdtox %f4,%f4
610: stw %l0,[%i0-4]
611: fdtox %f6,%f6
612: std %f4,[%fp-40]
613: std %f6,[%fp-32]
614: ldx [%fp-24],%g2 C p16
615: ldx [%fp-16],%g1 C p0
616: sllx %g2,16,%g2 C align p16
617: add %i0,8,%i0 C res_ptr++
618: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
619: srlx %g1,32,%l0
620: stw %g1,[%i0-8]
621: stw %l0,[%i0-4]
622: ldx [%fp-40],%g2 C p16
623: ldx [%fp-32],%g1 C p0
624: sllx %g2,16,%g2 C align p16
625: add %i0,8,%i0 C res_ptr++
626: add %g2,%g1,%g1 C add p16 to p0 (ADD1)
627: srlx %g1,32,%l0
628: stw %g1,[%i0-8]
629: stw %l0,[%i0-4]
C Pop the register window and return; restore %g0,%g0,%o0 leaves %o0 = 0.
630: ret
631: restore %g0,%g0,%o0
632: EPILOGUE(mpn_sqr_diagonal)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>