Annotation of OpenXM_contrib/gmp/mpn/x86/k7/dive_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K7: 11.0 cycles/limb
26:
27:
28: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C mp_limb_t divisor);
30: C
31: C The dependent chain is mul+imul+sub for 11 cycles and that speed is
32: C achieved with no special effort. The load and shrld latencies are hidden
33: C by out of order execution.
34: C
35: C It's a touch faster on size==1 to use the mul-by-inverse than divl.
36:
37: defframe(PARAM_DIVISOR,16) C 4th argument of the prototype above: divisor
38: defframe(PARAM_SIZE, 12) C 3rd argument: size, counted in limbs
39: defframe(PARAM_SRC, 8) C 2nd argument: src pointer
40: defframe(PARAM_DST, 4) C 1st argument: dst pointer
41: C Local slots follow, six of 4 bytes each, matching STACK_SPACE below.
42: defframe(SAVE_EBX, -4) C SAVE_* hold the registers the routine restores before each ret
43: defframe(SAVE_ESI, -8)
44: defframe(SAVE_EDI, -12)
45: defframe(SAVE_EBP, -16)
46: defframe(VAR_INVERSE, -20) C inverse of the odd part of the divisor
47: defframe(VAR_DST_END, -24) C dst end pointer, reloaded in the loop where edi is scratch
48:
49: deflit(STACK_SPACE, 24) C bytes subtracted from esp on entry and added back before ret
50:
51: TEXT
52:
C mpn_divexact_1 -- divide the size-limb number at src by divisor, storing
C size quotient limbs at dst.  Nothing is loaded as a return value.
C
C Steps, in the order they appear below:
C   1. strip_twos counts the low zero bits of the divisor into ecx and
C      leaves the odd part d in ebx.
C   2. inv, the inverse of d modulo 2^BITS_PER_MP_LIMB, is built from an
C      8-bit lookup in modlimb_invert_table and two Newton steps.
C   3. Working up from the low limb, each source limb is shifted right by
C      ecx bits, the borrow from the limbs below is subtracted, and the
C      result is multiplied by inv to give the quotient limb.
C
C Preconditions visible from the code:
C   - divisor must be non-zero, otherwise strip_twos never exits.
C   - size is assumed to be at least 1, since src[0] is read unconditionally.
C
C ebx, esi, edi and ebp are saved in the frame and restored before each ret.
53: ALIGN(16)
54: PROLOGUE(mpn_divexact_1)
55: deflit(`FRAME',0)
56:
57: movl PARAM_DIVISOR, %eax C eax = divisor
58: subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) C reserve the save and variable slots
59: movl $-1, %ecx C shift count, starts at -1 because the loop increments first
60:
61: movl %ebp, SAVE_EBP
62: movl PARAM_SIZE, %ebp C ebp = size in limbs
63:
64: movl %esi, SAVE_ESI
65: movl %edi, SAVE_EDI
66:
67: C If there's usually only one or two trailing zero bits then this
68: C should be faster than bsfl.
69: L(strip_twos):
70: incl %ecx C one more low bit examined
71: shrl %eax C low bit of eax goes into the carry flag
72: jnc L(strip_twos) C repeat until a 1 bit has been shifted out
73: C Now ecx = number of low zero bits of d, eax = (d >> ecx) >> 1
74: movl %ebx, SAVE_EBX
75: leal 1(%eax,%eax), %ebx C d without twos, ebx = 2*eax + 1
76: andl $127, %eax C d/2, 7 bits, used as the table index
77:
78: ifdef(`PIC',`
79: call L(movl_eip_edx) C edx = return address of this call
80:
81: addl $_GLOBAL_OFFSET_TABLE_, %edx C edx = GOT base by the usual PIC idiom
82:
83: movl modlimb_invert_table@GOT(%edx), %edx C edx = table address read from the GOT
84:
85: movzbl (%eax,%edx), %eax C inv 8 bits
86: ',`
87: dnl non-PIC
88: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
89: ')
90: C First Newton step, interleaved with the pointer setup to hide imull latency
91: leal (%eax,%eax), %edx C 2*inv
92: movl %ebx, PARAM_DIVISOR C d without twos, overwrites the argument slot used by mull below
93:
94: imull %eax, %eax C inv*inv
95:
96: movl PARAM_SRC, %esi
97: movl PARAM_DST, %edi
98:
99: imull %ebx, %eax C inv*inv*d
100:
101: subl %eax, %edx C inv = 2*inv - inv*inv*d, doubles the correct low bits to 16
102: leal (%edx,%edx), %eax C 2*inv, start of the second Newton step
103:
104: imull %edx, %edx C inv*inv
105:
106: leal (%esi,%ebp,4), %esi C src end, esi = src + 4*size
107: leal (%edi,%ebp,4), %edi C dst end, edi = dst + 4*size
108: negl %ebp C -size, limbs are indexed upward from the end pointers
109:
110: imull %ebx, %edx C inv*inv*d
111:
112: subl %edx, %eax C inv = 2*inv - inv*inv*d, now a full limb inverse
113:
114: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
115: pushl %eax FRAME_pushl()
116: imull PARAM_DIVISOR, %eax
117: cmpl $1, %eax
118: popl %eax FRAME_popl()')
119:
120: movl %eax, VAR_INVERSE
121: movl (%esi,%ebp,4), %eax C src[0]
122:
123: incl %ebp C ebp = 1 - size
124: jz L(one) C size == 1, no limb above to shift in
125:
126: movl (%esi,%ebp,4), %edx C src[1]
127:
128: shrdl( %cl, %edx, %eax) C eax = src[0] >> shift, high bits filled from src[1]
129:
130: movl %edi, VAR_DST_END C kept in memory because the loop uses edi as scratch
131: xorl %ebx, %ebx C no carry bit for the lowest limb
132: jmp L(entry) C first limb needs no mull or subtract
133:
134: ifdef(`PIC',`
135: L(movl_eip_edx):
136: movl (%esp), %edx C top of stack is the return address of the call
137: ret
138: ')
139:
140: ALIGN(8)
141: L(top):
142: C eax q, the quotient limb just stored
143: C ebx carry bit, 0 or 1
144: C ecx shift
145: C edx (overwritten by mull)
146: C esi src end
147: C edi dst end (used as scratch below, then reloaded)
148: C ebp counter, limbs, negative
149:
150: mull PARAM_DIVISOR C edx:eax = q * d, carry limb in edx
151:
152: movl -4(%esi,%ebp,4), %eax C next source limb
153: movl (%esi,%ebp,4), %edi C the limb above it, edi is scratch here
154:
155: shrdl( %cl, %edi, %eax) C eax = that limb pair shifted right by ecx bits
156:
157: subl %ebx, %eax C apply carry bit
158: setc %bl C bl = borrow out, upper bytes of ebx are already zero
159: movl VAR_DST_END, %edi C edi = dst end again
160:
161: subl %edx, %eax C apply carry limb
162: adcl $0, %ebx C add the borrow from this subtract to the carry bit
163:
164: L(entry):
165: imull VAR_INVERSE, %eax C q = low limb of eax * inv
166:
167: movl %eax, -4(%edi,%ebp,4) C store quotient limb
168: incl %ebp
169: jnz L(top) C loop until only the high limb remains
170:
171: C High limb: same steps as the loop, but nothing above to shift in
172: mull PARAM_DIVISOR C carry limb in edx
173:
174: movl -4(%esi), %eax C src high limb
175: shrl %cl, %eax C plain shift, zero bits enter at the top
176: movl SAVE_ESI, %esi
177:
178: subl %ebx, %eax C apply carry bit
179: movl SAVE_EBX, %ebx
180: movl SAVE_EBP, %ebp
181:
182: subl %edx, %eax C apply carry limb, the borrow out is not recorded
183:
184: imull VAR_INVERSE, %eax C high quotient limb
185:
186: movl %eax, -4(%edi) C dst high limb
187: movl SAVE_EDI, %edi
188: addl $STACK_SPACE, %esp C release the frame
189:
190: ret
191:
192: C Single limb case, reached from the jz above with eax = src[0]
193: L(one):
194: shrl %cl, %eax C eax = src[0] >> shift
195: movl SAVE_ESI, %esi
196: movl SAVE_EBX, %ebx
197:
198: imull VAR_INVERSE, %eax C quotient limb
199:
200: movl SAVE_EBP, %ebp
201: movl %eax, -4(%edi) C dst[0], since edi = dst + 4 when size is 1
202:
203: movl SAVE_EDI, %edi
204: addl $STACK_SPACE, %esp C release the frame
205:
206: ret
207:
208: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>