Annotation of OpenXM_contrib/gmp/mpn/ia64/lorrshift.asm, Revision 1.1.1.1
1.1 ohara 1: dnl IA-64 mpn_Xshift.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4:
5: dnl This file is part of the GNU MP Library.
6:
7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
8: dnl it under the terms of the GNU Lesser General Public License as published
9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
10: dnl your option) any later version.
11:
12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: dnl License for more details.
16:
17: dnl You should have received a copy of the GNU Lesser General Public License
18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: dnl MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
25: C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
26: C main loop here is not great; it is overscheduled with respect to the shr.u
27: C instructions, and this actually turns out to give considerably more complex
28: C wind down code. The code runs slowly for operands with <= 8 limbs, since we
29: C have a non-scheduled loop for that case. We also have a primitive loop for
30: C the unrolling edge, and as a consequence of the main loop stupidity it is
31: C executed 1-4 steps instead of 0-3 steps.
32:
33: C By having 63 separate loops using the shrp instruction, we could easily reach
34: C 1 cycle/limb. Such loops would require a less deep software pipeline, since
35: C shrp, unlike shl/shr.u, has a plain one cycle latency.
36:
37: C INPUT PARAMETERS
38: C rp = r32
39: C sp = r33
40: C n = r34
41: C cnt = r35
42:
C Direction-dependent macros, selected by which OPERATION_* symbol is defined.
C   FSH   shift instruction applied to the limb currently being produced
C   BSH   opposite shift, applied with count 64 - cnt to the neighbouring limb
C         to obtain the bits that cross the limb boundary
C   UPD   post-increment in bytes used on the limb pointers: -8 for lshift,
C         8 for rshift
C   func  name passed to PROLOGUE and EPILOGUE
C If neither symbol is defined, none of these macros is defined here.
43: ifdef(`OPERATION_lshift',`
44: define(`FSH',`shl')
45: define(`BSH',`shr.u')
46: define(`UPD',`-8')
47: define(`func',`mpn_lshift')
48: ')
49: ifdef(`OPERATION_rshift',`
50: define(`FSH',`shr.u')
51: define(`BSH',`shl')
52: define(`UPD',`8')
53: define(`func',`mpn_rshift')
54: ')
55:
56: ASM_START()
C
C func is mpn_lshift or mpn_rshift, chosen by the OPERATION_* macros.
C
C Shifts the n-limb operand at sp by cnt bits and stores n limbs at rp.
C Arguments arrive in r32 = rp, r33 = sp, r34 = n, r35 = cnt.  The bits
C shifted out of the first limb processed are returned in r8.
C NOTE(review): assumes n >= 1 and 1 <= cnt <= 63; neither is checked here.
C
C Register roles
C   r31      64 - cnt, the count used with BSH
C   r2       saved copy of ar.lc, restored before every return
C   r19      limb carried from one pass to the next in the simple loops
C   r16-r19  loaded limbs rotating through the unrolled pipeline
C   r20-r27  partial shift results and their or-ed combinations
C
C Control flow
C   n - 1 < 8   simple one-limb-per-pass loop at .Loops
C   n - 1 >= 8  edge loop .Loop0, then the pipelined code from .Lunroll on
C
57: PROLOGUE(func)
58: .prologue
C Under the 32-bit ABI, widen the two pointer arguments with addp4, sign-extend
C n and zero-extend cnt before they are used as 64-bit values.
59: ifdef(`HAVE_ABI_32',
60: ` addp4 r32 = 0, r32
61: addp4 r33 = 0, r33
62: sxt4 r34 = r34
63: zxt4 r35 = r35
64: ;;
65: ')
66: add r34 = -1, r34 C r34 = n - 1
67: sub r31 = 64, r35 C r31 = 64 - cnt
68: .save ar.lc, r2
69: mov r2 = ar.lc;; C keep the incoming ar.lc in r2
70: .body
71: cmp.leu p6, p7 = 8,r34 C p6 is set when 8 <= n - 1, unsigned compare
C For lshift, advance both pointers by n - 1 limbs so that the walk with
C UPD = -8 starts at the highest-addressed limb and moves downwards.
72: ifdef(`OPERATION_lshift',`
73: shladd r33 = r34, 3, r33
74: shladd r32 = r34, 3, r32;;
75: ')
76: ld8 r19 = [r33], UPD ;; C first limb; r33 then steps by UPD
77: BSH r8 = r19, r31 C function return value
78: (p6) br.dptk .Lbig
79:
80: C
81: C Code for small operands. Not an optimization for the Itanium, it is here
82: C just to simplify the general case.
83: C
84: mov ar.lc = r34;; C loop count = n - 1
85: br.cloop.dptk .Loops C taken unless ar.lc is zero, that is n = 1
86: FSH r26 = r19, r35 ;; C single limb: nothing to merge in
87: st8 [r32] = r26
88: mov ar.lc = r2 C restore ar.lc
89: br.ret.sptk.many b0
90: .Loops:
91: ld8 r16 = [r33], UPD C next limb
92: FSH r26 = r19, r35 ;; C current limb shifted by cnt
93: BSH r27 = r16, r31 ;; C bits of the next limb that cross into this one
94: { .mib; nop.b 0;; } C delay to save 6 cycles...
95: { .mib; nop.b 0;; } C delay to save 6 cycles...
96: { .mib; nop.b 0;; } C delay to save 6 cycles...
97: or r27 = r27, r26 C combine both parts into the result limb
98: mov r19 = r16 ;; C next limb becomes the current limb
99: st8 [r32] = r27, UPD
100: br.cloop.dptk .Loops
101: FSH r26 = r19, r35 ;; C final limb: nothing to merge in
102: st8 [r32] = r26
103: mov ar.lc = r2 C restore ar.lc
104: br.ret.sptk.many b0
105:
106: C
107: C Code for operands with >8 limbs. An edge loop and a very deep software
108: C pipeline.
109: C
110: .Lbig: and r15 = 3, r34 C r15 = low two bits of n - 1
111: shr.u r14 = r34, 2 ;; C r14 = n - 1 shifted right by 2
112: mov ar.lc = r15
C Edge loop: same body as .Loops, executed r15 + 1 times, one limb per pass.
113: .Loop0:
114: ld8 r16 = [r33], UPD
115: FSH r26 = r19, r35 ;;
116: BSH r27 = r16, r31 ;;
117: { .mib; nop.b 0;; } C delay to save 6 cycles...
118: { .mib; nop.b 0;; } C delay to save 6 cycles...
119: { .mib; nop.b 0;; } C delay to save 6 cycles...
120: or r27 = r27, r26
121: mov r19 = r16 ;;
122: st8 [r32] = r27, UPD
123: br.cloop.dptk .Loop0
124:
C The counter for .Loop is r14 - 2; the fill code in .Lphase1 and .Lphase2 and
C the drain code at .Lend2 run once each and are not counted by ar.lc.
125: .Lunroll:
126: add r14 = -2, r14 ;;
127: mov ar.lc = r14
128:
C Pipeline fill, first stage: issue four loads and start the first shifts.
C Nothing is stored yet.  The FSH of r19 reads the limb left over from .Loop0
C before r19 is reloaded in the following instruction group.
129: .Lphase1:
130: { .mmi
131: ld8 r16 = [r33], UPD ;;
132: } { .mmi
133: ld8 r17 = [r33], UPD ;;
134: } { .mmi
135: ld8 r18 = [r33], UPD
136: FSH r26 = r19, r35 ;;
137: } { .mmi
138: ld8 r19 = [r33], UPD
139: BSH r27 = r16, r31 ;;
140: } { .mib
141: FSH r20 = r16, r35
142: }
143:
C Pipeline fill, second stage: two more loads, further shifts, and the first
C completed result limb in r27.
144: .Lphase2:
145: { .mmi
146: ld8 r16 = [r33], UPD
147: BSH r21 = r17, r31
148: } { .mib
149: FSH r22 = r17, r35 ;;
150: } { .mmi
151: ld8 r17 = [r33], UPD
152: BSH r23 = r18, r31
153: } { .mib
154: or r27 = r27, r26
155: FSH r24 = r18, r35
156: br.cloop.dptk .Loop C enter the main loop if ar.lc is nonzero
157: }
158: br.sptk .Lend2 C otherwise go straight to the drain code
C Steady state: each pass stores four result limbs and loads four source limbs.
159: .Loop:
160: { .mmi
161: st8 [r32] = r27, UPD
162: ld8 r18 = [r33], UPD
163: BSH r25 = r19, r31
164: } { .mib
165: or r21 = r21, r20
166: FSH r26 = r19, r35 ;;
167: } { .mmi
168: st8 [r32] = r21, UPD
169: ld8 r19 = [r33], UPD
170: BSH r27 = r16, r31
171: } { .mib
172: or r23 = r23, r22
173: FSH r20 = r16, r35 ;;
174: } { .mmi
175: st8 [r32] = r23, UPD
176: ld8 r16 = [r33], UPD
177: BSH r21 = r17, r31
178: } { .mib
179: or r25 = r25, r24
180: FSH r22 = r17, r35 ;;
181: } { .mmi
182: st8 [r32] = r25, UPD
183: ld8 r17 = [r33], UPD
184: BSH r23 = r18, r31
185: } { .mib
186: or r27 = r27, r26
187: FSH r24 = r18, r35
188: br.cloop.sptk .Loop;;
189: }
C Drain: one final load into r18, then the limbs still in flight are finished
C and stored, eight stores in total with no further looping.
190: .Lend2:
191: { .mmi
192: st8 [r32] = r27, UPD
193: ld8 r18 = [r33], UPD C last source limb
194: BSH r25 = r19, r31
195: } { .mib
196: or r21 = r21, r20
197: FSH r26 = r19, r35 ;;
198: } { .mmi
199: st8 [r32] = r21, UPD
200: BSH r27 = r16, r31
201: } { .mib
202: or r23 = r23, r22
203: FSH r20 = r16, r35 ;;
204: } { .mmi
205: st8 [r32] = r23, UPD
206: BSH r21 = r17, r31
207: } { .mib
208: or r25 = r25, r24
209: FSH r22 = r17, r35 ;;
210: } { .mmi
211: st8 [r32] = r25, UPD
212: BSH r23 = r18, r31
213: } { .mib
214: or r27 = r27, r26
215: FSH r24 = r18, r35 ;; C last limb shifted by cnt, stored unmerged below
216: }
217:
218: { .mmi
219: st8 [r32] = r27, UPD
220: } { .mib
221: or r21 = r21, r20 ;;
222: } { .mmi
223: st8 [r32] = r21, UPD
224: } { .mib
225: or r23 = r23, r22 ;;
226: } { .mmi
227: st8 [r32] = r23, UPD;;
228: } { .mmi
229: st8 [r32] = r24 C final result limb, no pointer update
230: }
231: mov ar.lc = r2 C restore ar.lc
232: br.ret.sptk.many b0
233: EPILOGUE(func)
234: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>