Annotation of OpenXM_contrib/gmp/mpn/x86/k6/diveby3.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
2:
1.1.1.2 ! ohara 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K6: 11.0 cycles/limb
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C mp_limb_t carry);
30: C
30: C
31: C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
32: C lead to vector decoding, whereas plain (%esi) does.
33:
C Names for the four stack arguments, matching the prototype comment
C above (dst, src, size, carry).  The second defframe argument is
C presumably the byte offset of the argument from the stack pointer at
C function entry -- defframe is defined outside this file, so confirm
C against its definition.
34: defframe(PARAM_CARRY,16)
35: defframe(PARAM_SIZE, 12)
36: defframe(PARAM_SRC, 8)
37: defframe(PARAM_DST, 4)
38:
39: dnl multiplicative inverse of 3, modulo 2^32
C Check: 3 * 0xAAAAAAAB = 0x200000001, which is 1 modulo 2^32.  Hence
C multiplying a 32-bit value that is an exact multiple of 3 by this
C constant, keeping the low 32 bits, gives that value divided by 3.
40: deflit(INVERSE_3, 0xAAAAAAAB)
41:
1.1.1.2 ! ohara 42: TEXT
1.1 maekawa 43: ALIGN(32)
44:
C mpn_divexact_by3c (dst, src, size, carry)
C
C Walks the size limbs at src from index 0 upwards.  For each limb it
C subtracts the running carry, multiplies by INVERSE_3 to get the
C quotient limb, stores that at dst, and forms the next carry from the
C borrow of the subtraction plus the high half of quotient*3.  The
C final carry is left in %eax at the ret, presumably as the mp_limb_t
C return value of the prototype.
C
C The loop body always runs at least once before the count is tested,
C so the code assumes size >= 1 (NOTE(review): confirm callers never
C pass size == 0).
C
C %esi, %edi and %ebx are saved on the stack on entry and reloaded
C before returning.  %eax, %ecx and %edx are overwritten.
45: PROLOGUE(mpn_divexact_by3c)
46: deflit(`FRAME',0)
47:
C Argument loads are interleaved with the register saves.  Each
C defframe_pushl presumably names the slot just pushed and advances
C FRAME so that the PARAM_ offsets stay correct -- the macro is defined
C outside this file.
48: movl PARAM_SIZE, %ecx
49: pushl %esi defframe_pushl(SAVE_ESI)
50:
51: movl PARAM_SRC, %esi
52: pushl %edi defframe_pushl(SAVE_EDI)
53:
54: movl PARAM_DST, %edi
55: pushl %ebx defframe_pushl(SAVE_EBX)
56:
57: movl PARAM_CARRY, %ebx
C esi = src + 4*size, that is &src[size]
58: leal (%esi,%ecx,4), %esi
59:
C Put the constant 3 in a stack slot so the mull in the loop can take
C it as a memory operand (mull has no immediate form).
60: pushl $3 defframe_pushl(VAR_THREE)
C edi = dst + 4*size, that is &dst[size]
61: leal (%edi,%ecx,4), %edi
62:
C ecx = -size, so (%esi,%ecx,4) and (%edi,%ecx,4) start at limb 0 and
C reach the end of the operands as ecx counts up to zero.
63: negl %ecx
64:
65:
66: C Need 32 alignment for claimed speed, to avoid the movl store
67: C opcode/modrm crossing a cache line boundary
68:
69: ALIGN(32)
70: L(top):
71: C eax scratch, low product
1.1.1.2 ! ohara 72: C ebx carry limb (0 to 2)
1.1 maekawa 73: C ecx counter, limbs, negative
74: C edx scratch, high product
75: C esi &src[size]
76: C edi &dst[size]
77: C ebp
78: C
79: C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
80: C doesn't cross a 32 byte boundary, saving a couple of cycles
81: C (that's a fixed couple, not per loop).
82:
C eax = current source limb minus the incoming carry; CF = borrow
83: Zdisp( movl, 0,(%esi,%ecx,4), %eax)
84: subl %ebx, %eax
85:
C bl = borrow (0 or 1).  Only the low byte of ebx is written, so this
C relies on ebx having been in the 0 to 2 range noted above, leaving
C the upper three bytes already zero.
86: setc %bl
87:
C eax = quotient limb = low 32 bits of (limb - carry) * INVERSE_3
1.1.1.2 ! ohara 88: imull $INVERSE_3, %eax, %eax
1.1 maekawa 89:
90: movl %eax, (%edi,%ecx,4)
C The loop instruction below decrements ecx by one before testing it,
C so adding 2 here gives a net advance of one limb per iteration.
91: addl $2, %ecx
92:
C edx:eax = quotient limb * 3, using the 3 pushed as VAR_THREE.  Only
C the high half in edx is used afterwards.
93: mull VAR_THREE
94:
C next carry = borrow from the subtraction + high half of quotient*3
95: addl %edx, %ebx
C Decrement ecx and branch back while it is non-zero.
96: loop L(top)
97:
98:
C Finish up: move the final carry to eax, reload the saved registers
C from their stack slots, then drop the pushed words by adding FRAME
C to the stack pointer (FRAME is presumably 16 here, one dword for
C each of the four pushes above).
99: movl SAVE_ESI, %esi
100: movl %ebx, %eax
101:
102: movl SAVE_EBX, %ebx
103:
104: movl SAVE_EDI, %edi
105: addl $FRAME, %esp
106:
107: ret
108:
109: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>