Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/hamdist.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel P5 mpn_hamdist -- mpn hamming distance.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P5: 14.0 cycles/limb
26:
27:
28: C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
29: C
30: C It might be possible to shave 1 cycle from the loop, and hence 2
31: C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
32: C would be 1, if the right schedule could be found (not found so far).
33: C Wanting to avoid potential cache bank clashes makes it tricky.
34:
C TABLE_NAME is the lookup table symbol: GSYM_PREFIX followed by
C mpn_popcount_table.  The table itself is not defined in this file; the
C routine below only reads single bytes from it, indexed by a byte value.
35: C The slightly strange quoting here helps the renaming done by tune/many.pl.
36: deflit(TABLE_NAME,
37: m4_assert_defined(`GSYM_PREFIX')
38: GSYM_PREFIX`'mpn_popcount``'_table')
39:
C Argument slots for the three parameters of the prototype, src1, src2 and
C size in that order.  The numbers are presumably byte offsets from the stack
C pointer at function entry, tracked through FRAME as registers are pushed;
C defframe is defined outside this file, so confirm there.
40: defframe(PARAM_SIZE,12)
41: defframe(PARAM_SRC2, 8)
42: defframe(PARAM_SRC1, 4)
43:
44: TEXT
45: ALIGN(8)
46:
C mpn_hamdist: walk src1 and src2 from the high end to the low end, two bytes
C per pass.  Each src1 byte is xored with the src2 byte at the same offset,
C the result indexes TABLE, and the table values are summed into eax, which
C is the return value.  Going by its mpn_popcount name the table presumably
C holds the bit count of each byte value; it is defined elsewhere.
C
C size counts 4 byte units: ecx starts at 2*size and each pass covers the two
C bytes at offsets 2*ecx-1 and 2*ecx-2.
C
C The counter is tested only after being decremented, so size must be at
C least 1.  With size 0 the decl would wrap ecx instead of ending the loop.
C
C esi, edi and ebx, plus ebp in the PIC case, are pushed below and popped
C again before the ret.
47: PROLOGUE(mpn_hamdist)
48: deflit(`FRAME',0)
49:
50: movl PARAM_SIZE, %ecx
51: pushl %esi FRAME_pushl()
52:
53: shll %ecx C size in byte pairs
54: pushl %edi FRAME_pushl()
55:
56: ifdef(`PIC',`
C PIC: the table address is fetched from the GOT into ebp, so ebp is saved
C as well.
57: pushl %ebx FRAME_pushl()
58: pushl %ebp FRAME_pushl()
59:
C call/pop leaves the address of L(here) in ebp; the addl below turns that
C into the GOT address.  PARAM_SRC1 is read while the return address is still
C on the stack, which the FRAME_pushl on the call accounts for.
60: call L(here) FRAME_pushl()
61: L(here):
62: movl PARAM_SRC1, %esi
63: popl %ebp FRAME_popl()
64:
65: movl PARAM_SRC2, %edi
66: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
67:
C ebx and edx are cleared in full: the loop writes only bl and dl, so the
C upper 24 bits stay zero and the registers can be used directly as table
C indexes and as addends.
68: xorl %ebx, %ebx C byte
69: xorl %edx, %edx C byte
70:
71: movl TABLE_NAME@GOT(%ebp), %ebp C ebp = table address from its GOT entry
72: xorl %eax, %eax C total
73: define(TABLE,`(%ebp,$1)')
74:
75: ',`
76: dnl non-PIC
77: movl PARAM_SRC1, %esi
78: movl PARAM_SRC2, %edi
79:
80: xorl %eax, %eax C total
81: pushl %ebx FRAME_pushl()
82:
C ebx and edx are cleared in full: the loop writes only bl and dl, so the
C upper 24 bits stay zero and the registers can be used directly as table
C indexes and as addends.
83: xorl %edx, %edx C byte
84: xorl %ebx, %ebx C byte
85:
86: define(TABLE,`TABLE_NAME($1)')
87: ')
88:
89:
90: C The nop after the xorb seems necessary. Although a movb might be
91: C expected to go down the V pipe in the second cycle of the xorb, it
92: C doesn't and costs an extra 2 cycles.
C
C The two addl at the top of the loop add the table values looked up on the
C previous pass, which are zero on the first pass.  The values from the final
C pass are added after the loop.
93: L(top):
94: C eax total
95: C ebx byte
96: C ecx counter, 2*size to 1
97: C edx byte
98: C esi src1
99: C edi src2
100: C ebp [PIC] table
101:
102: addl %ebx, %eax C add table value from the previous pass
103: movb -1(%esi,%ecx,2), %bl C src1 byte at offset 2*ecx-1
104:
105: addl %edx, %eax C add the other table value from the previous pass
106: movb -1(%edi,%ecx,2), %dl C src2 byte at offset 2*ecx-1
107:
108: xorb %dl, %bl C bl = bits that differ in that byte
109: movb -2(%esi,%ecx,2), %dl C src1 byte at offset 2*ecx-2
110:
111: xorb -2(%edi,%ecx,2), %dl C dl = bits that differ in that byte
112: nop
113:
114: movb TABLE(%ebx), %bl C replace the xor result with its table value
115: decl %ecx C sets ZF for the jnz, the movb between leaves flags alone
116:
117: movb TABLE(%edx), %dl C replace the xor result with its table value
118: jnz L(top)
119:
120:
121: ifdef(`PIC',`
122: popl %ebp
123: ')
124: addl %ebx, %eax C table values from the final pass, not yet summed
125: popl %ebx
126:
127: addl %edx, %eax
128: popl %edi
129:
130: popl %esi
131:
132: ret
133:
134: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>