Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mod_34lsub1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K6: 2.66 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
! 29: C
! 30: C An attempt was made to use a loop like
! 31: C
! 32: C L(top):
! 33: C adcl (%edx), %eax
! 34: C adcl 4(%edx), %ebx
! 35: C adcl 8(%edx), %esi
! 36: C leal 12(%edx), %edx
! 37: C loop L(top)
! 38: C
! 39: C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
! 40: C The form used instead can save about 6 cycles by not dividing by 3.
! 41: C
! 42: C In the code used, putting the "leal"s at the top of the loop is necessary
! 43: C for the claimed speed; placing them anywhere else costs an extra cycle per loop.
! 44: C Perhaps a tight loop like this needs short decode instructions at the
! 45: C branch target, which would explain the leal/loop form above taking 8
! 46: C cycles instead of 7 too.
! 47:
! 48: defframe(PARAM_SIZE, 8)
! 49: defframe(PARAM_SRC, 4)
! 50:
! 51: dnl re-use parameter space: both parameters are loaded into registers before these slots are overwritten with the saved ebx and esi
! 52: define(SAVE_EBX, `PARAM_SIZE')
! 53: define(SAVE_ESI, `PARAM_SRC')
! 54:
! 55: TEXT
! 56: ALIGN(16)
! 57: PROLOGUE(mpn_mod_34lsub1)
! 58: deflit(`FRAME',0)
! 59: C Returns in eax a value congruent to {src,size} mod 2^24-1 (not fully reduced).
! 60: movl PARAM_SIZE, %eax
! 61: movl PARAM_SRC, %edx
! 62: C Dispatch on size: more than 2 limbs, exactly 2 limbs, otherwise the one-limb path.
! 63: subl $2, %eax
! 64: ja L(three_or_more) C unsigned size > 2
! 65:
! 66: Zdisp( movl, 0,(%edx), %eax) C avoid code cache line boundary
! 67: jne L(one) C flags still from the subl (movl leaves them): size != 2
! 68: C Two limbs: 2^32 == 2^8 and 2^48 == 1 mod 2^24-1, so src[1] is split at bit 16.
! 69: movl %eax, %ecx
! 70: movl 4(%edx), %edx
! 71:
! 72: shrl $24, %eax C src[0] high
! 73: andl $0x00FFFFFF, %ecx C src[0] low
! 74:
! 75: addl %ecx, %eax
! 76: movl %edx, %ecx
! 77:
! 78: shll $8, %edx
! 79: andl $0x00FFFF00, %edx C src[1] low 16 bits, placed at weight 2^8
! 80:
! 81: shrl $16, %ecx C src[1] high 16 bits, weight 2^48 == 1
! 82: addl %ecx, %eax
! 83:
! 84: addl %edx, %eax
! 85:
! 86: L(one):
! 87: ret C result in eax (on the one-limb path, src[0] exactly as loaded)
! 88:
! 89:
! 90: L(three_or_more):
! 91: C eax size-2
! 92: C ebx
! 93: C ecx
! 94: C edx src
! 95:
! 96: movl %ebx, SAVE_EBX C ebx and esi are kept in the already-read parameter slots
! 97: xorl %ebx, %ebx
! 98:
! 99: movl %esi, SAVE_ESI
! 100: pushl %edi FRAME_pushl()
! 101:
! 102: xorl %esi, %esi
! 103: xorl %edi, %edi C and clear carry flag
! 104:
! 105: L(top):
! 106: C eax counter, limbs
! 107: C ebx acc 0mod3
! 108: C ecx
! 109: C edx src, incrementing
! 110: C esi acc 1mod3
! 111: C edi acc 2mod3
! 112: C ebp
! 113: C leal and decl do not write CF, so the adcl carry chain continues across iterations.
! 114: leal -2(%eax), %eax
! 115: leal 12(%edx), %edx
! 116:
! 117: adcl -12(%edx), %ebx
! 118: adcl -8(%edx), %esi
! 119: adcl -4(%edx), %edi
! 120:
! 121: decl %eax
! 122: jg L(top) C counter has dropped by 3 this pass; repeat while it is > 0
! 123:
! 124:
! 125: C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
! 126: C cl records the bit position (0, 8 or 16) at which the final carry must be added.
! 127: movb $0, %cl
! 128: incl %eax C incl leaves CF from the last adcl intact
! 129:
! 130: js L(combine) C 0 more
! 131:
! 132: Zdisp( adcl, 0,(%edx), %ebx) C avoid code cache line crossings
! 133:
! 134: movb $8, %cl
! 135: decl %eax
! 136:
! 137: js L(combine) C 1 more
! 138:
! 139: adcl 4(%edx), %esi
! 140:
! 141: movb $16, %cl
! 142:
! 143: C Fold the three accumulators and the pending carry into eax at weights 2^0, 2^8, 2^16.
! 144: L(combine):
! 145: sbbl %edx, %edx C edx = -CF (0 or -1)
! 146:
! 147: shll %cl, %edx C carry
! 148: movl %ebx, %eax C 0mod3
! 149:
! 150: shrl $24, %eax C 0mod3 high
! 151: andl $0x00FFFFFF, %ebx C 0mod3 low
! 152:
! 153: subl %edx, %eax C apply carry
! 154: movl %esi, %ecx C 1mod3
! 155:
! 156: shrl $16, %esi C 1mod3 high
! 157: addl %ebx, %eax C apply 0mod3 low
! 158:
! 159: andl $0x0000FFFF, %ecx
! 160: addl %esi, %eax C apply 1mod3 high
! 161:
! 162: shll $8, %ecx C 1mod3 low
! 163: movl %edi, %edx C 2mod3
! 164:
! 165: shrl $8, %edx C 2mod3 high
! 166: addl %ecx, %eax C apply 1mod3 low
! 167:
! 168: addl %edx, %eax C apply 2mod3 high
! 169: andl $0x000000FF, %edi
! 170:
! 171: shll $16, %edi C 2mod3 low
! 172: movl SAVE_EBX, %ebx
! 173:
! 174: addl %edi, %eax C apply 2mod3 low
! 175: movl SAVE_ESI, %esi
! 176:
! 177: popl %edi
! 178:
! 179: ret
! 180:
! 181: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>