Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/copyi.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C alignment dst/src, A=0mod8 N=4mod8
! 26: C A/A A/N N/A N/N
! 27: C K7 0.75 1.0 1.0 0.75
! 28:
! 29:
1.1 maekawa 30: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
31: C
32: C Copy src,size to dst,size.
33: C
34: C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
35: C 1.33 c/l.
36: C
37: C The K7 can do two loads, or two stores, or a load and a store, in one
38: C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
39: C however nothing under 0.7 c/l is known.
40: C
41: C If both source and destination are unaligned then one limb is processed at
42: C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
43: C used unaligned it would be 1.5 c/l.
44:
45: defframe(PARAM_SIZE,12)
46: defframe(PARAM_SRC, 8)
47: defframe(PARAM_DST, 4)
48:
49: dnl parameter space reused: size is loaded into a register first, then its stack slot is used to save %ebx
50: define(SAVE_EBX,`PARAM_SIZE')
51:
52: dnl minimum 5 limbs, since the unrolled code always copies 4 limbs in L(top) and may copy one alignment limb before that, so it can't handle fewer than 5
53: deflit(UNROLL_THRESHOLD, 5)
54:
1.1.1.2 ! ohara 55: TEXT
1.1 maekawa 56: ALIGN(32)
57: PROLOGUE(mpn_copyi)
58: deflit(`FRAME',0)
59: C Load the arguments; sizes below UNROLL_THRESHOLD are copied by the simple loop.
60: movl PARAM_SIZE, %ecx C ecx = size in limbs
61: movl %ebx, SAVE_EBX C size is now in ecx, so its stack slot holds the saved ebx
62:
63: movl PARAM_SRC, %eax
64: movl PARAM_DST, %edx
65:
66: cmpl $UNROLL_THRESHOLD, %ecx
67: jae L(unroll) C unsigned compare, size >= UNROLL_THRESHOLD
68:
69: orl %ecx, %ecx C sets ZF when size is zero
70: jz L(simple_done) C nothing to copy
71:
72: L(simple):
73: C eax src, incrementing
74: C ebx scratch
75: C ecx counter
76: C edx dst, incrementing
77: C
78: C this loop is 2 cycles/limb
79:
80: movl (%eax), %ebx
81: movl %ebx, (%edx)
82: decl %ecx C sets ZF for the jnz below, leal does not change flags
83: leal 4(%eax), %eax
84: leal 4(%edx), %edx
85: jnz L(simple)
86:
87: L(simple_done):
88: movl SAVE_EBX, %ebx C restore ebx
89: ret C no emms here, this path uses no MMX registers
90:
91:
92: L(unroll):
93: movl %eax, %ebx C ebx = src, for the alignment test below
94: leal -12(%eax,%ecx,4), %eax C src end - 12
95: subl $3, %ecx C size-3
96:
97: andl %edx, %ebx C ebx = src & dst
98: leal (%edx,%ecx,4), %edx C dst end - 12
99: negl %ecx C ecx = -(size-3), counts up towards zero
100:
101: testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
102: jz L(aligned) C bit 2 clear means src and dst are not both 4 mod 8
103:
104: C both src and dst unaligned, process one limb to align them
105: movl (%eax,%ecx,4), %ebx C at this point (eax,ecx,4) is the first limb of src
106: movl %ebx, (%edx,%ecx,4)
107: incl %ecx C one limb done
108: L(aligned):
109:
110:
111: ALIGN(16)
112: L(top):
113: C eax src end - 12
114: C ebx
115: C ecx counter, negative, limbs
116: C edx dst end - 12
117: C each pass copies 4 limbs as two 64-bit loads and two 64-bit stores
118: movq (%eax,%ecx,4), %mm0
119: movq 8(%eax,%ecx,4), %mm1
120: addl $4, %ecx C flags set here are tested by ja, the movq stores leave them alone
121: movq %mm0, -16(%edx,%ecx,4) C -16 compensates for ecx already advanced by 4
122: movq %mm1, -16+8(%edx,%ecx,4)
123: ja L(top) C jump no carry and not zero
124:
125:
126: C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
127:
128: testb $2, %cl
129: jnz L(finish_not_two) C ecx is 2 or 3, fewer than two limbs remain
130:
131: movq (%eax,%ecx,4), %mm0 C ecx is 0 or 1, copy two limbs
132: movq %mm0, (%edx,%ecx,4)
133: L(finish_not_two):
134:
135: testb $1, %cl
136: jnz L(done) C ecx is 1 or 3, no odd limb left
137:
138: movl 8(%eax), %ebx C ecx is 0 or 2, copy the final limb at end - 4
139: movl %ebx, 8(%edx)
140:
141: L(done):
142: movl SAVE_EBX, %ebx C restore ebx
143: emms C reset MMX state so later x87 floating point code works
144: ret
145:
146: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>