Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mod_34lsub1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
2:
3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C K6: 2.66 cycles/limb
26:
27:
28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
29: C Returns a value congruent to {src,size} modulo 2^24-1; the result is not fully reduced (a single limb is returned unchanged). NOTE(review): size==0 takes the one-limb path and reads src[0] - presumably callers guarantee size>=1, confirm.
30: C An attempt was made to use a loop like
31: C
32: C L(top):
33: C adcl (%edx), %eax
34: C adcl 4(%edx), %ebx
35: C adcl 8(%edx), %esi
36: C leal 12(%edx), %edx
37: C loop L(top)
38: C
39: C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
40: C The form used instead can save about 6 cycles by not dividing by 3.
41: C
42: C In the code used, putting the "leal"s at the top of the loop is necessary
43: C for the claimed speed, anywhere else costs an extra cycle per loop.
44: C Perhaps a tight loop like this needs short decode instructions at the
45: C branch target, which would explain the leal/loop form above taking 8
46: C cycles instead of 7 too.
47:
48: defframe(PARAM_SIZE, 8)
49: defframe(PARAM_SRC, 4)
50:
51: dnl re-use parameter space (both parameters are already in registers by the time these slots are written)
52: define(SAVE_EBX, `PARAM_SIZE')
53: define(SAVE_ESI, `PARAM_SRC')
54:
55: TEXT
56: ALIGN(16)
57: PROLOGUE(mpn_mod_34lsub1)
58: deflit(`FRAME',0)
59:
60: movl PARAM_SIZE, %eax
61: movl PARAM_SRC, %edx
62:
63: subl $2, %eax C eax = size-2, the flags select the path below
64: ja L(three_or_more) C unsigned size > 2
65:
66: Zdisp( movl, 0,(%edx), %eax) C avoid code cache line boundary
67: jne L(one) C flags still from the subl: size != 2, return src[0] as is
68:
69: movl %eax, %ecx C size == 2: fold src[0] and src[1]
70: movl 4(%edx), %edx
71:
72: shrl $24, %eax C src[0] high
73: andl $0x00FFFFFF, %ecx C src[0] low
74:
75: addl %ecx, %eax
76: movl %edx, %ecx
77:
78: shll $8, %edx
79: andl $0x00FFFF00, %edx C src[1] low 16 bits, weight 2^32 == 2^8 mod 2^24-1
80:
81: shrl $16, %ecx C src[1] high 16 bits, weight 2^48 == 1 mod 2^24-1
82: addl %ecx, %eax
83:
84: addl %edx, %eax
85:
86: L(one):
87: ret
88:
89:
90: L(three_or_more):
91: C eax size-2
92: C ebx
93: C ecx
94: C edx src
95:
96: movl %ebx, SAVE_EBX
97: xorl %ebx, %ebx
98:
99: movl %esi, SAVE_ESI
100: pushl %edi FRAME_pushl()
101:
102: xorl %esi, %esi
103: xorl %edi, %edi C and clear carry flag
104:
105: L(top):
106: C eax counter, limbs
107: C ebx acc 0mod3 (src[0], src[3], ...)
108: C ecx
109: C edx src, incrementing
110: C esi acc 1mod3 (src[1], src[4], ...)
111: C edi acc 2mod3 (src[2], src[5], ...)
112: C ebp
113:
114: leal -2(%eax), %eax C leal does not touch the carry flag
115: leal 12(%edx), %edx
116:
117: adcl -12(%edx), %ebx C carry chains ebx -> esi -> edi, then into ebx on the next pass
118: adcl -8(%edx), %esi
119: adcl -4(%edx), %edi
120:
121: decl %eax C eax is now 3 less than at L(top); decl leaves carry intact
122: jg L(top)
123:
124:
125: C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
126:
127: movb $0, %cl C cl = shift count for the final carry (0, 8 or 16)
128: incl %eax
129:
130: js L(combine) C 0 more
131:
132: Zdisp( adcl, 0,(%edx), %ebx) C avoid code cache line crossings
133:
134: movb $8, %cl
135: decl %eax
136:
137: js L(combine) C 1 more
138:
139: adcl 4(%edx), %esi
140:
141: movb $16, %cl
142:
143:
144: L(combine):
145: sbbl %edx, %edx C edx = 0 or -1 according to the outstanding carry
146:
147: shll %cl, %edx C carry
148: movl %ebx, %eax C 0mod3
149:
150: shrl $24, %eax C 0mod3 high
151: andl $0x00FFFFFF, %ebx C 0mod3 low
152:
153: subl %edx, %eax C apply carry (subtracting -(1<<cl) adds 1<<cl)
154: movl %esi, %ecx C 1mod3
155:
156: shrl $16, %esi C 1mod3 high
157: addl %ebx, %eax C apply 0mod3 low
158:
159: andl $0x0000FFFF, %ecx
160: addl %esi, %eax C apply 1mod3 high
161:
162: shll $8, %ecx C 1mod3 low
163: movl %edi, %edx C 2mod3
164:
165: shrl $8, %edx C 2mod3 high
166: addl %ecx, %eax C apply 1mod3 low
167:
168: addl %edx, %eax C apply 2mod3 high
169: andl $0x000000FF, %edi
170:
171: shll $16, %edi C 2mod3 low
172: movl SAVE_EBX, %ebx
173:
174: addl %edi, %eax C apply 2mod3 low
175: movl SAVE_ESI, %esi
176:
177: popl %edi
178:
179: ret
180:
181: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>