Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mod_34lsub1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K7: 1.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
29: C Returns in eax a value congruent to {src,size} mod 2^24-1 (not fully reduced).
30: C The loop form below and the 64 byte code alignment seem necessary for the
31: C claimed speed. This is a bit strange, since normally k7 isn't very
32: C sensitive to such things. Perhaps there has to be 6 instructions in the
33: C first 16 bytes for the BTB entry or something.
34: C NOTE(review): size >= 1 is presumably required; size 0 takes the size 1 path and loads src[0].
35: defframe(PARAM_SIZE, 8)
36: defframe(PARAM_SRC, 4)
37: C NOTE(review): defframe presumably names these offsets from the entry esp (4 = src, 8 = size) -- confirm against its definition.
38: dnl re-use parameter space (size is read into ecx before edi is stored here)
39: define(SAVE_EDI, `PARAM_SIZE')
40:
41: TEXT
42: ALIGN(64)
43: PROLOGUE(mpn_mod_34lsub1)
44: deflit(`FRAME',0)
45:
46: movl PARAM_SIZE, %ecx C ecx = size
47: movl PARAM_SRC, %edx C edx = src
48:
49: subl $2, %ecx C ecx = size-2, flags pick the path below
50: ja L(three_or_more) C size > 2 (unsigned)
51:
52: movl (%edx), %eax C eax = src[0]; mov leaves the subl flags intact
53: jb L(one) C size < 2: return src[0] unchanged
54:
55: movl 4(%edx), %ecx C size == 2: ecx = src[1]
56: movl %eax, %edx
57: shrl $24, %eax C src[0] high 8 bits, weight 2^24 == 1
58:
59: andl $0xFFFFFF, %edx C src[0] low 24 bits
60: addl %edx, %eax
61: movl %ecx, %edx
62:
63: andl $0xFFFF, %ecx C src[1] low 16 bits
64: shrl $16, %edx C src[1] high, weight 2^48 == 1
65: addl %edx, %eax
66:
67: shll $8, %ecx C src[1] low, weight 2^32 == 2^8
68: addl %ecx, %eax
69:
70: L(one):
71: ret
72:
73:
74: L(three_or_more):
75: C eax
76: C ebx
77: C ecx size-2
78: C edx src
79: C esi
80: C edi
81: C ebp
82:
83: pushl %ebx FRAME_pushl()
84: xorl %eax, %eax C acc 0mod3 = 0
85: xorl %ebx, %ebx C acc 1mod3 = 0
86:
87: movl %edi, SAVE_EDI C save edi in the re-used size slot
88: pushl %esi FRAME_pushl()
89: xorl %esi, %esi C acc 2mod3 = 0, and clear carry flag
90:
91:
92: C code offset 0x40 at this point
93: L(top):
94: C eax acc 0mod3
95: C ebx acc 1mod3
96: C ecx counter, limbs
97: C edx src
98: C esi acc 2mod3
99: C edi
100: C ebp
101: C lea and dec leave the carry flag alone, so one carry chain runs through every adcl.
102: leal 24(%edx), %edx C advance src by 6 limbs
103: leal -2(%ecx), %ecx
104: adcl -24(%edx), %eax
105: adcl -20(%edx), %ebx
106: adcl -16(%edx), %esi
107:
108: decl %ecx C 3 limbs consumed, ecx = limbs left minus 2
109: jng L(done_loop) C 2 or fewer limbs left
110:
111: leal -2(%ecx), %ecx
112: adcl -12(%edx), %eax
113: adcl -8(%edx), %ebx
114: adcl -4(%edx), %esi
115:
116: decl %ecx C 3 more limbs consumed, ecx = limbs left minus 2
117: jg L(top) C more than 2 limbs left
118:
119:
120: leal 12(%edx), %edx C so -12(%edx) is the next limb, as on the other exit
121:
122:
123: L(done_loop):
124: C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
125: C edi gets minus the mod 2^24-1 weight of a carry out of the last adcl executed.
126: incl %ecx C sets sign flag, keeps carry flag
127: movl $0xFFFFFFFF, %edi C -1: carry out of acc 2mod3 is worth 1
128: js L(combine) C no limbs left
129:
130: adcl -12(%edx), %eax
131: decl %ecx C sets sign flag, keeps carry flag
132: movl $0xFFFFFF00, %edi C -0x100: carry out of acc 0mod3 is worth 2^8
133: js L(combine) C only one limb was left
134:
135: adcl -8(%edx), %ebx
136: movl $0xFFFF0000, %edi C -0x10000: carry out of acc 1mod3 is worth 2^16
137:
138:
139: L(combine):
140: C eax acc 0mod3
141: C ebx acc 1mod3
142: C ecx
143: C edx
144: C esi acc 2mod3
145: C edi mask
146: C ebp
147:
148: sbbl %ecx, %ecx C carry: ecx = 0 or -1
149: movl %eax, %edx C 0mod3
150: shrl $24, %eax C 0mod3 high
151:
152: andl %edi, %ecx C carry masked: 0 or minus its weight
153: andl $0x00FFFFFF, %edx C 0mod3 low
154: movl %ebx, %edi C 1mod3
155:
156: subl %ecx, %eax C apply carry (adds its weight)
157: shrl $16, %ebx C 1mod3 high
158: andl $0xFFFF, %edi C 1mod3 low 16 bits
159:
160: addl %edx, %eax C apply 0mod3 low
161: movl %esi, %edx C 2mod3
162: shll $8, %edi C 1mod3 low
163:
164: addl %ebx, %eax C apply 1mod3 high
165: shrl $8, %esi C 2mod3 high
166: andl $0xFF, %edx C 2mod3 low
167:
168: addl %edi, %eax C apply 1mod3 low
169: shll $16, %edx C 2mod3 low
170:
171: addl %esi, %eax C apply 2mod3 high
172: popl %esi FRAME_popl()
173:
174: movl SAVE_EDI, %edi C restore edi
175: addl %edx, %eax C apply 2mod3 low
176: popl %ebx FRAME_popl()
177:
178: ret
179:
180: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>