dnl  AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
dnl
dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
dnl  alignment.


dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


dnl  K6-2 aligned:
dnl  UNROLL_COUNT  cycles/limb
dnl        8          0.75
dnl       16          0.625
dnl       32          0.5625
dnl       64          0.53
dnl  Maximum possible with the current code is 64, the minimum is 2.

deflit(UNROLL_COUNT, 32)

dnl  UNROLL_BYTES, used below, is UNROLL_COUNT limbs of 4 bytes each; it is
dnl  how far %esi and %edi are advanced per pass of the unrolled loop.
deflit(UNROLL_BYTES, eval(UNROLL_COUNT*4))


C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
C used instead.
C
C             mod8
C        src       dst
C         0         0      both aligned, use mmx
C         0         4      unaligned, use rep movs
C         4         0      unaligned, use rep movs
C         4         4      do one movs, then both aligned, use mmx
C
C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
C
C A pattern of two movq loads and two movq stores (or four and four) was
C tried, but found to be the same speed as just one of each.
C
C Note that this code only suits K6-2 and K6-3.  Plain K6 does only one mmx
C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
C movs.
C
C Enhancement:
C
C Addressing modes like disp(%esi,%ecx,4) aren't currently used.  They'd
C make it possible to avoid incrementing %esi and %edi in the loop and hence
C get loop overhead down to 1 cycle.  Care would be needed to avoid bad
C cache line crossings since the "movq"s would then be 5 code bytes rather
C than 4.


defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

        .text
        ALIGN(32)

PROLOGUE(mpn_copyi)
        movl    PARAM_SIZE, %ecx
        movl    %esi, %eax

        movl    PARAM_SRC, %esi
        movl    %edi, %edx

        cld

        movl    PARAM_DST, %edi
        cmpl    $UNROLL_COUNT, %ecx

        ja      L(unroll)

L(simple):
        rep
        movsl

        movl    %eax, %esi
        movl    %edx, %edi

        ret


L(unroll):
        C if src and dst are different alignments mod8, then use rep movs
        C if src and dst are both 4mod8 then process one limb to get 0mod8
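        C (The limb pointers are 4-byte aligned, so bit 2 of src+dst is set
        C exactly when the two mod8 alignments differ; the leal/testb below
        C checks just that bit.)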

        pushl   %ebx
        leal    (%esi,%edi), %ebx

        testb   $4, %bl
        popl    %ebx

        jnz     L(simple)
        testl   $4, %esi

        leal    -UNROLL_COUNT(%ecx), %ecx
        jz      L(already_aligned)

        decl    %ecx

        movsl
L(already_aligned):


ifelse(UNROLL_BYTES,256,`
        addl    $128, %esi
        addl    $128, %edi
')

        C this is offset 0x34, no alignment needed
L(top):
        C eax   saved esi
        C ebx
        C ecx   counter, limbs
        C edx   saved edi
        C esi   src, incrementing
        C edi   dst, incrementing
        C ebp
        C
        C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
        C 0(%edi) keeps code aligned to 16 byte boundaries.

deflit(CHUNK_COUNT, 2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
Zdisp(  movq,   disp,(%esi), %mm0)
Zdisp(  movq,   %mm0, disp,(%edi))
')
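        C With UNROLL_COUNT=32 and CHUNK_COUNT=2, UNROLL_BYTES is 128 and the
        C ifelse above adds nothing, so the forloop expands to 16 movq
        C load/store pairs of the form
        C
        C       movq    0(%esi), %mm0
        C       movq    %mm0, 0(%edi)
        C       movq    8(%esi), %mm0
        C       movq    %mm0, 8(%edi)
        C       ...
        C       movq    120(%esi), %mm0
        C       movq    %mm0, 120(%edi)
        C
        C each pair moving CHUNK_COUNT (two) limbs.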

        addl    $UNROLL_BYTES, %esi
        subl    $UNROLL_COUNT, %ecx

        leal    UNROLL_BYTES(%edi), %edi
        jns     L(top)


        C now %ecx is -UNROLL_COUNT to -1, representing respectively 0 to
        C UNROLL_COUNT-1 limbs remaining

        testb   $eval(UNROLL_COUNT/2), %cl

        leal    UNROLL_COUNT(%ecx), %ecx
        jz      L(not_half)
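        C (With UNROLL_COUNT=32 the testb looks at bit 4 of %cl while %ecx is
        C still negative; the leal only adds 32, which leaves bits 0-4 alone,
        C so that bit already says whether 16 or more limbs remain.)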

        C at an unroll count of 32 this block of code is 16 cycles faster than
        C the rep movs, less 3 or 4 to test whether to do it

forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
        deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        movq    disp(%esi), %mm0
        movq    %mm0, disp(%edi)
')
        addl    $eval(UNROLL_BYTES/2), %esi
        addl    $eval(UNROLL_BYTES/2), %edi

        subl    $eval(UNROLL_COUNT/2), %ecx
L(not_half):


ifelse(UNROLL_BYTES,256,`
        subl    $128, %esi
        subl    $128, %edi
')
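        C The rep movsl below picks up whatever is left (fewer than
        C UNROLL_COUNT/2 limbs), then femms leaves MMX mode before returning
        C (femms being the cheap AMD form of emms).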

        rep
        movsl

        movl    %eax, %esi
        movl    %edx, %edi

        femms
        ret

EPILOGUE()