Annotation of OpenXM_contrib/gmp/mpn/x86/k6/gcd_finda.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K6 mpn_gcd_finda.
2:
3: dnl Copyright 2000, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K6: 680 cycles (approx) on average
26:
27:
28: dnl How many trial subtractions to attempt before launching into a full
29: dnl division.
30: dnl One unconditional subtraction is done first at L(entry); TRIAL_SUBS counts the borrow-checked subtractions generated by the forloop after it.
31: deflit(TRIAL_SUBS, 8)
32:
33:
34: C mp_limb_t mpn_gcd_finda (const mp_limb_t cp[2]);
35: C Reads the two-limb value n = cp[1]:cp[0] (both limbs non-zero, see the ASSERTs), forms the pair n and -n mod 2^64, and repeatedly replaces the larger by its remainder modulo the smaller until the smaller fits in one limb. That limb is returned in eax.
36: C This code is probably not optimal, but it's already a good improvement
37: C over the generic C.
38: C cp is the only parameter and is read from the stack via PARAM_CP. ebx, esi, edi and ebp are saved on entry and restored at L(done).
39:
40: defframe(PARAM_CP, 4)
41: C Save slots for the registers that are restored before returning.
42: defframe(SAVE_EBX, -4)
43: defframe(SAVE_ESI, -8)
44: defframe(SAVE_EDI, -12)
45: defframe(SAVE_EBP, -16)
46: C Scratch slots used around the division. VAR_Q is reserved in the frame but not referenced in the code below.
47: defframe(VAR_N2H, -20)
48: defframe(VAR_N2L, -24)
49: defframe(VAR_Q, -28)
50: defframe(VAR_N2L_NORM, -32)
51: C Total bytes for the eight 4-byte slots above.
52: deflit(STACK_SPACE, 32)
53:
54: TEXT
55: ALIGN(32)
56:
57: PROLOGUE(mpn_gcd_finda)
58: deflit(`FRAME',0)
59:
60: movl PARAM_CP, %eax
61: subl $STACK_SPACE, %esp
62: deflit(`FRAME',STACK_SPACE)
63:
64: movl %ebx, SAVE_EBX
65:
66: movl %esi, SAVE_ESI
67: movl (%eax), %ecx C n1 l = cp[0]
68:
69: movl %edi, SAVE_EDI
70: movl 4(%eax), %edx C n1 h = cp[1]
71:
72: movl %ebp, SAVE_EBP
73:
74: ASSERT(nz,`orl %ecx, %ecx')
75: ASSERT(nz,`orl %edx, %edx')
76: C n2 = -n1 mod 2^64. With a non-zero low limb this is neg(low), not(high).
77: movl %ecx, %eax
78: movl %edx, %ebx
79:
80: negl %eax
81: notl %ebx
82: C Two-limb compare n2 - n1, result discarded (esi is scratch). Only the final carry is meaningful.
83: cmpl %ecx, %eax
84: movl %ebx, %esi
85:
86: sbbl %edx, %esi
87: C CF set means n2 < n1, so the pair is already ordered. ZF must not be tested here: after sbbl it reflects the high limb only, e.g. n1h=0x7FFFFFFF with n1l>0x80000000 gives ZF=1, CF=0 although n2 > n1.
88: jc L(top)
89: C Otherwise swap roles: n2 = cp, n1 = -cp.
90: movl %ecx, %eax
91: movl %edx, %ebx
92:
93: negl %ecx
94: notl %edx
95:
96: jmp L(top)
97:
98:
99: ALIGN(8)
100: L(restore):
101: C eax n2 l
102: C ebx n2 h
103: C ecx n1-n2 l
104: C edx n1-n2 h
105: C esi old n1 h
106: C edi
107: C ebp
108: C The last trial subtraction borrowed. Undo it and swap: new n1 = n2, new n2 = n1 as it was before that subtraction.
109: movl %ebx, %edx C new n1 h = n2 h
110: movl %esi, %ebx C new n2 h = old n1 h
111:
112: movl %eax, %esi
113: addl %ecx, %eax C n2l + (n1l - n2l) = old n1 l, the new n2 l
114:
115: movl %esi, %ecx C new n1 l = n2 l
116:
117:
118: L(top):
119: C n1 >= n2
120: C
121: C eax n2 l
122: C ebx n2 h
123: C ecx n1 l
124: C edx n1 h
125: C esi
126: C edi
127: C ebp
128: C Finished once n2 fits in a single limb; the result n2 l is in eax.
129: orl %ebx, %ebx
130: jz L(done)
131:
132: L(entry):
133: subl %eax, %ecx
134: sbbl %ebx, %edx
135: ASSERT(nc)
136: C Further borrow-checked subtractions. esi keeps n1 h from before each one so L(restore) can undo the failing step.
137: forloop(i,1,TRIAL_SUBS,`
138: movl %edx, %esi
139: subl %eax, %ecx
140:
141: sbbl %ebx, %edx
142: jc L(restore)
143: ')
144:
145:
146: C all subtractions so far completed without borrow (n1 need not be >= n2 here)
147: C
148: C eax n2 l
149: C ebx n2 h
150: C ecx n1 l
151: C edx n1 h
152: C esi
153: C edi
154: C ebp
155: C Full division: shift n2 left so its high limb has the top bit set, shift n1 by the same count into ebp:edi:esi, then divide the top two limbs by n2 h.
156: movl %eax, VAR_N2L
157: movl %ecx, %esi C n1l
158:
159: bsrl %ebx, %ecx
160:
161: movl %ebx, VAR_N2H
162: notl %ecx C n2h leading zeros (low 5 bits)
163:
164: shldl( %cl, %eax, %ebx) C n2h normalized
165:
166: shll %cl, %eax C n2l normalized
167: movl %edx, %edi C n1h
168:
169: movl %eax, VAR_N2L_NORM
170: xorl %ebp, %ebp
171:
172: shldl( %cl, %edi, %ebp) C n1h shifted
173: shldl( %cl, %esi, %edi) C n1m shifted
174:
175: shll %cl, %esi C n1l shifted
176: movl %ebp, %edx
177:
178: movl %edi, %eax
179:
180: divl %ebx C n1h:n1m / n2h
181:
182: movl %edx, %edi C n1h:n1m:n1l - q*n2h
183: movl VAR_N2L_NORM, %edx
184:
185: mull %edx C q*n2l
186:
187: subl %eax, %esi
188: movl VAR_N2L_NORM, %ebp
189:
190: sbbl %edx, %edi C n1h:n1m:n1l - q*(n2h:n2l)
191:
192: jnc L(div_done)
193: addl %ebp, %esi
194:
195: adcl %ebx, %edi C addback n2h:n2l
196:
197: jc L(div_done)
198: addl %ebp, %esi
199:
200: adcl %ebx, %edi C further addback n2h:n2l
201: ASSERT(c)
202:
203: L(div_done):
204: shrdl( %cl, %edi, %esi)
205:
206: shrl %cl, %edi C unshift n1m:n1l remainder
207: movl %esi, %eax C new n2 l = remainder l
208:
209: movl VAR_N2L, %ecx C new n1 l = old n2 l
210: movl %edi, %ebx C new n2 h = remainder h
211:
212: movl VAR_N2H, %edx C new n1 h = old n2 h
213: orl %ebx, %ebx
214:
215: jnz L(entry)
216:
217: C Return value n2 l is already in eax; restore the saved registers and release the frame.
218: L(done):
219: movl SAVE_EBX, %ebx
220: movl SAVE_ESI, %esi
221: movl SAVE_EDI, %edi
222: movl SAVE_EBP, %ebp
223: addl $STACK_SPACE, %esp
224: ret
225:
226: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>