Annotation of OpenXM_contrib/gmp/mpn/x86/k6/pre_mod_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
2:
3: dnl Copyright 2000, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K6: 18.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
29: C mp_limb_t inverse);
30: C
31: C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
32: C considered worthwhile (just).
33: C
34: C Future:
35: C
36: C In theory this code should be made available in mod_1 and mod_1c, but it
37: C would take quite a while to overcome the time to calculate an inverse.
38: C The threshold would probably be around 20 limbs, or around 30 for an
39: C unnormalized divisor.
40:
41: defframe(PARAM_INVERSE,16) C mp_limb_t inverse
42: defframe(PARAM_DIVISOR,12) C mp_limb_t divisor
43: defframe(PARAM_SIZE, 8) C mp_size_t size
44: defframe(PARAM_SRC, 4) C mp_srcptr src
45: C mpn_preinv_mod_1: returns in eax the remainder of the size-limb number at src divided by divisor, using multiplies by the supplied inverse rather than divl.
46: TEXT
47: ALIGN(32)
48: PROLOGUE(mpn_preinv_mod_1)
49: deflit(`FRAME',0) C no pushes yet; each FRAME_pushl() below presumably adjusts this so PARAM_* stay valid -- confirm against the macro definitions
50:
51: ASSERT(ae,`cmpl $1, PARAM_SIZE') C requires size >= 1
52: ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR') C requires a divisor with its top bit set
53: C Load the arguments while saving ebp, edi, esi and ebx, which are popped again at L(done_esi_edi).
54: movl PARAM_SIZE, %ecx C ecx = size
55: pushl %ebp FRAME_pushl()
56:
57: movl PARAM_SRC, %ebp C ebp = src
58: pushl %edi FRAME_pushl()
59:
60: movl PARAM_DIVISOR, %eax C eax = d
61: pushl %esi FRAME_pushl()
62:
63: movl -4(%ebp,%ecx,4), %esi C src high limb
64: pushl %ebx FRAME_pushl()
65:
66: movl %edx, %edi C first n2 to cancel: edx is not set yet, copying it makes the first sbbl give just 0 or -1
67: subl %eax, %esi C first n1 = high-divisor, carry set if high<divisor
68:
69: decl %ecx C size-1 limbs still to process; decl leaves the carry from subl alone
70: jz L(done_sbbl) C single limb, only the addback is left
71:
72: L(top):
73: C eax scratch
74: C ebx n10, nadj, q1
75: C ecx counter, size to 1
76: C edx scratch
77: C esi n2
78: C edi old high, for underflow test
79: C ebp src
80:
81: sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
82: C NOTE(review): nothing in this file jumps to L(entry) -- confirm it is unused.
83: L(entry):
84: andl PARAM_DIVISOR, %edi C addback amount: d if the subtract underflowed, else 0
85: L(q1_ff_top):
86: movl -4(%ebp,%ecx,4), %ebx C next limb down, src[ecx-1]
87:
88: addl %esi, %edi C possible addback
89: movl %ebx, %esi C n10
90:
91: sarl $31, %ebx C -n1 = 0 or -1
92: movl %edi, %eax C n2
93:
94: movl PARAM_INVERSE, %edx C m
95: subl %ebx, %eax C n2+n1
96:
97: mull %edx C m*(n2+n1)
98:
99: andl PARAM_DIVISOR, %ebx C -n1 & d
100: addl %esi, %ebx C nadj = n10 + (-n1&d), ignoring overflow
101:
102: addl %ebx, %eax C low m*(n2+n1) + nadj, giving carry flag
103: leal 1(%edi), %ebx C n2+1
104:
105: adcl %ebx, %edx C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
106:
107: movl PARAM_DIVISOR, %eax C d
108: jz L(q1_ff) C zero flag is from the adcl: q1+1 wrapped, so q1 = 0xFFFFFFFF
109:
110: mull %edx C (q1+1)*d
111:
112: subl %eax, %esi C low n-(q1+1)*d
113: loop L(top) C loop leaves the carry from subl for the sbbl at L(top) or L(done_sbbl)
114:
115:
116:
117: L(done_sbbl):
118: sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
119:
120: andl PARAM_DIVISOR, %edi C addback amount, d or 0
121: L(done_esi_edi):
122: popl %ebx
123:
124: leal (%esi,%edi), %eax C return value = esi + addback
125: popl %esi
126:
127: popl %edi
128: popl %ebp
129:
130: ret
131:
132:
133: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
134: C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely
135: C reached.
136:
137: L(q1_ff):
138: movl PARAM_DIVISOR, %edi C addback is always d in this case
139: loop L(q1_ff_top) C more limbs: re-enter after the sbbl/andl, edi already set
140:
141: jmp L(done_esi_edi) C last limb: remainder is esi+edi = n10+d
142:
143:
144: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>