Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
2: dnl
3: dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
4: dnl alignment.
5:
6:
7: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8: dnl
9: dnl This file is part of the GNU MP Library.
10: dnl
11: dnl The GNU MP Library is free software; you can redistribute it and/or
12: dnl modify it under the terms of the GNU Lesser General Public License as
13: dnl published by the Free Software Foundation; either version 2.1 of the
14: dnl License, or (at your option) any later version.
15: dnl
16: dnl The GNU MP Library is distributed in the hope that it will be useful,
17: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19: dnl Lesser General Public License for more details.
20: dnl
21: dnl You should have received a copy of the GNU Lesser General Public
22: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24: dnl Suite 330, Boston, MA 02111-1307, USA.
25:
26:
27: include(`../config.m4')
28:
29:
30: dnl K6-2 aligned:
31: dnl UNROLL_COUNT cycles/limb
32: dnl 8 0.75
33: dnl 16 0.625
34: dnl 32 0.5625
35: dnl 64 0.53
36: dnl Maximum possible with the current code is 64, the minimum is 2.
37:
38: deflit(UNROLL_COUNT, 32) C limbs copied per pass of the unrolled loop
39:
40:
41: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
42: C
43: C Copy src,size to dst,size, processing limbs from high to low addresses.
44: C
45: C The comments in copyi.asm apply here too.
46: C
47: C Sizes up to UNROLL_COUNT, and operands differing in bit 2 of their addresses, are copied with rep movsl; otherwise an unrolled movq loop is used.
48: defframe(PARAM_SIZE,12)
49: defframe(PARAM_SRC, 8)
50: defframe(PARAM_DST, 4)
51: deflit(`FRAME',0)
52: C NOTE(review): UNROLL_BYTES is used below but not defined in this file; presumably the included m4 definitions derive it from UNROLL_COUNT -- confirm.
53: .text
54: ALIGN(32)
55:
56: PROLOGUE(mpn_copyd)
57: movl PARAM_SIZE, %ecx C ecx = size, the limb count
58: movl %esi, %eax C save esi in eax
59:
60: movl PARAM_SRC, %esi
61: movl %edi, %edx C save edi in edx
62:
63: std C direction flag set: movsl steps esi and edi downwards
64:
65: movl PARAM_DST, %edi
66: cmpl $UNROLL_COUNT, %ecx C flags are consumed by the ja below; the leals between do not change them
67:
68: leal -4(%esi,%ecx,4), %esi C esi = address of the highest src limb
69:
70: leal -4(%edi,%ecx,4), %edi C edi = address of the highest dst limb
71: ja L(unroll) C taken when size > UNROLL_COUNT (unsigned)
72:
73: L(simple):
74: rep
75: movsl C copy ecx limbs, high to low
76:
77: cld C clear the direction flag again before returning
78:
79: movl %eax, %esi C restore the saved esi
80: movl %edx, %edi C restore the saved edi
81:
82: ret
83:
84:
85: L(unroll):
86: C if src and dst are different alignments mod8, then use rep movs
87: C if src and dst are both 4mod8 then process one limb to get 0mod8
88: C NOTE(review): esi and edi address the highest limbs here, and the single movsl below runs when bit 2 of esi is clear -- confirm which pointer the 4mod8 wording refers to.
89: pushl %ebx C ebx is only a temporary, popped again below
90: leal (%esi,%edi), %ebx C bit 2 of the sum is set when esi and edi differ in bit 2
91:
92: testb $4, %bl
93: popl %ebx C popl leaves the testb flags intact
94:
95: jnz L(simple) C alignments differ: ecx is still size, so use rep movsl
96: testl $4, %esi
97:
98: leal -UNROLL_COUNT(%ecx), %ecx C leal keeps the testl flags for the jnz
99: jnz L(already_aligned)
100:
101: movsl C copy one limb, moving esi and edi down by 4
102:
103: decl %ecx C account for that limb
104: L(already_aligned):
105:
106: C NOTE(review): with UNROLL_BYTES of 256 the pointers are biased down by 128, presumably so every disp in the loops fits a signed byte -- confirm.
107: ifelse(UNROLL_BYTES,256,`
108: subl $128, %esi
109: subl $128, %edi
110: ')
111:
112: C offset 0x3D here, but gets full speed without further alignment
113: L(top):
114: C eax saved esi
115: C ebx
116: C ecx counter, limbs
117: C edx saved edi
118: C esi src, decrementing
119: C edi dst, decrementing
120: C ebp
121: C
122: C `disp' is never 0, so don't need to force 0(%esi).
123:
124: deflit(CHUNK_COUNT, 2) C limbs moved by each movq
125: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
126: deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
127: movq disp(%esi), %mm0
128: movq %mm0, disp(%edi)
129: ')
130:
131: leal -UNROLL_BYTES(%esi), %esi
132: subl $UNROLL_COUNT, %ecx C sets the sign flag for the jns; the leal below keeps it
133:
134: leal -UNROLL_BYTES(%edi), %edi
135: jns L(top)
136:
137:
138: C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
139: C UNROLL_COUNT-1 limbs remaining
140:
141: testb $eval(UNROLL_COUNT/2), %cl C bit is set when UNROLL_COUNT/2 or more limbs remain (UNROLL_COUNT a power of 2)
142:
143: leal UNROLL_COUNT(%ecx), %ecx C ecx = limbs remaining; testb flags are kept
144: jz L(not_half)
145:
146:
147: C at an unroll count of 32 this block of code is 16 cycles faster than
148: C the rep movs, less 3 or 4 to test whether to do it
149:
150: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
151: deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
152: movq disp(%esi), %mm0
153: movq %mm0, disp(%edi)
154: ')
155:
156: subl $eval(UNROLL_BYTES/2), %esi
157: subl $eval(UNROLL_BYTES/2), %edi
158:
159: subl $eval(UNROLL_COUNT/2), %ecx C half an unroll has been copied
160: L(not_half):
161:
162: C undo the 128 byte pointer bias, when it was applied, before the final rep movsl
163: ifelse(UNROLL_BYTES,256,`
164: addl $128, %esi
165: addl $128, %edi
166: ')
167:
168: rep
169: movsl C copy the remaining ecx limbs, possibly none
170:
171: cld C clear the direction flag again before returning
172:
173: movl %eax, %esi C restore the saved esi
174: movl %edx, %edi C restore the saved edi
175:
176: femms C leave MMX state, since movq was used on this path
177: ret
178:
179: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>