Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/diveby3.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel Pentium-4 mpn_divexact_by3 -- mpn exact division by 3.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Library General Public License as
9: dnl published by the Free Software Foundation; either version 2 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Library General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Library General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P4: 18.0 cycles/limb
26:
27:
28: C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C mp_limb_t carry);
30: C
31: C The dependent chain in the loop is as follows, and this is what the code
32: C measures.
33: C
34: C   psubq    (src-cbit) - climb    2
35: C   pmuludq  s*inverse             8
36: C   pand     mask q                2
37: C   psllq    2*q                   2
38: C   paddq    q+2*q                 2
39: C   psrlq    high(3*q)             2
40: C                                 --
41: C                                 18
42: C
43: C Perhaps the s*inverse can be taken off the dependent chain as described in
44: C mpn/generic/diveby3.c, with a modified 3*q calculation that can give
45: C high(3*q)*inv too.
46:
47:
48: defframe(PARAM_CARRY,16) C carry: 4th prototype argument, the initial carry limb
49: defframe(PARAM_SIZE, 12) C size: 3rd prototype argument, number of limbs
50: defframe(PARAM_SRC, 8) C src: 2nd prototype argument, source limb pointer
51: defframe(PARAM_DST, 4) C dst: 1st prototype argument (offsets presumably relative to the entry stack pointer, TODO confirm against defframe)
52:
53: RODATA
54: C multiplicative inverse of 3, modulo 2^32
55: ALIGN(4)
56: L(inverse):
57: .long 0xAAAAAAAB C 3 * 0xAAAAAAAB = 0x200000001, which is 1 modulo 2^32
58:
59: TEXT
60: ALIGN(16)
61:
62: PROLOGUE(mpn_divexact_by3c) C stores one quotient limb to dst per src limb, returns the final carry in eax
63: deflit(`FRAME',0) C NOTE(review): presumably the frame offset used by the PARAM_ macros, confirm in the m4 definitions
64:
65: movl PARAM_SRC, %eax C eax = src pointer
66: pxor %mm0, %mm0 C carry bit starts at zero
67:
68: movd PARAM_CARRY, %mm1 C carry limb starts as the carry argument
69: pcmpeqd %mm6, %mm6 C mm6 = all ones
70:
71: movd L(inverse), %mm7 C mm7 = 0xAAAAAAAB, inverse of 3 modulo 2^32
72:
73: movl PARAM_DST, %edx C edx = dst pointer
74: psrlq $32, %mm6 C 0x00000000FFFFFFFF, mask for the low 32 bits
75:
76: movl PARAM_SIZE, %ecx C ecx = limb count, tested only after the loop body so the body always runs once (presumably callers pass size >= 1)
77:
78: L(top):
79: C eax src, incrementing
80: C ebx (not used by this routine)
81: C ecx counter, limbs, decrementing
82: C edx dst, incrementing
83: C
84: C mm0 carry bit
85: C mm1 carry limb
86: C mm6 0x00000000FFFFFFFF
87: C mm7 inverse
88:
89: movd (%eax), %mm2 C load the next src limb, zero extended to 64 bits
90: addl $4, %eax C advance src by one 4-byte limb
91:
92: psubq %mm0, %mm2 C src - cbit
93:
94: psubq %mm1, %mm2 C src - cbit - climb
95: movq %mm2, %mm0 C copy the 64-bit difference to extract the borrow
96: psrlq $63, %mm0 C new cbit, the top bit of the 64-bit difference
97:
98: pmuludq %mm7, %mm2 C s*inverse, uses the low 32 bits of each operand
99: movd %mm2, (%edx) C q, low 32 bits of the product stored to dst
100: addl $4, %edx C advance dst by one 4-byte limb
101:
102: movq %mm6, %mm1 C mm1 = low 32-bit mask
103:
104: pand %mm2, %mm1 C mm1 = q
105:
106: pand %mm6, %mm2 C mm2 = q
107:
108: psllq $1, %mm1 C mm1 = 2*q
109:
110: C
111:
112: paddq %mm2, %mm1 C mm1 = q + 2*q = 3*q
113:
114: C
115:
116: psrlq $32, %mm1 C new climb = high(3*q)
117:
118: subl $1, %ecx C one limb done
119: jnz L(top) C repeat until the counter reaches zero
120:
121:
122: paddd %mm1, %mm0 C return value = final cbit + final climb
123: movd %mm0, %eax C low 32 bits of the sum returned in eax
124: emms C empty the MMX state before returning
125: ret
126:
127: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>