Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
! 2: dnl
! 3: dnl K6: 6.25 cycles/limb.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 30: C mp_limb_t multiplier);
! 31: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 32: C mp_limb_t multiplier, mp_limb_t carry);
! 33: C
! 34: C Multiply src,size by multiplier and store the result in dst,size.
! 35: C Return the carry limb from the top of the result.
! 36: C
! 37: C mpn_mul_1c() accepts an initial carry for the calculation; it is added
! 38: C into the low limb of the result.
! 39:
! 40: defframe(PARAM_CARRY, 20) C fifth argument, read only by mpn_mul_1c
! 41: defframe(PARAM_MULTIPLIER,16)
! 42: defframe(PARAM_SIZE, 12)
! 43: defframe(PARAM_SRC, 8)
! 44: defframe(PARAM_DST, 4)
! 45: C The offsets above step by 4 in prototype argument order, dst first.
! 46: dnl minimum 5 because the unrolled code can't handle less
! 47: deflit(UNROLL_THRESHOLD, 5)
! 48: C Sizes below UNROLL_THRESHOLD are handled by the L(simple) loop.
! 49: .text
! 50: ALIGN(32)
! 51:
! 52: PROLOGUE(mpn_mul_1c) C mpn_mul_1 variant taking an initial carry argument
! 53: pushl %esi C save esi, it carries the running carry limb from here on
! 54: deflit(`FRAME',4) C one 4-byte push so far (presumably what FRAME tracks, confirm in the macro definitions)
! 55: movl PARAM_CARRY, %esi C esi = caller-supplied initial carry
! 56: jmp LF(mpn_mul_1,start_nc) C join mpn_mul_1 at start_nc, past its own push and carry clear
! 57: EPILOGUE()
! 58:
! 59:
! 60: PROLOGUE(mpn_mul_1) C dst,size = src,size * multiplier, carry limb returned in eax
! 61: push %esi C save esi, restored before each ret
! 62: deflit(`FRAME',4)
! 63: xorl %esi, %esi C initial carry
! 64: C mpn_mul_1c jumps in at start_nc with esi already pushed and holding its carry.
! 65: L(start_nc):
! 66: mov PARAM_SIZE, %ecx C ecx = size
! 67: push %ebx
! 68: FRAME_pushl()
! 69:
! 70: movl PARAM_SRC, %ebx C ebx = src
! 71: push %edi
! 72: FRAME_pushl()
! 73:
! 74: movl PARAM_DST, %edi C edi = dst
! 75: pushl %ebp
! 76: FRAME_pushl()
! 77:
! 78: cmpl $UNROLL_THRESHOLD, %ecx
! 79: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier, movl leaves the cmpl flags intact
! 80:
! 81: jae L(unroll) C size >= UNROLL_THRESHOLD takes the unrolled loop
! 82:
! 83:
! 84: C code offset 0x22 here, close enough to aligned
! 85: L(simple):
! 86: C eax scratch
! 87: C ebx src
! 88: C ecx counter
! 89: C edx scratch
! 90: C esi carry
! 91: C edi dst
! 92: C ebp multiplier
! 93: C
! 94: C this loop 8 cycles/limb
! 95:
! 96: movl (%ebx), %eax C eax = next src limb
! 97: addl $4, %ebx
! 98:
! 99: mull %ebp C edx:eax = limb * multiplier
! 100:
! 101: addl %esi, %eax C add the incoming carry to the low half
! 102: movl $0, %esi C zero esi with movl so the carry flag from the addl survives
! 103:
! 104: adcl %edx, %esi C new carry = high half + carry flag
! 105:
! 106: movl %eax, (%edi) C store the result limb
! 107: addl $4, %edi
! 108:
! 109: loop L(simple) C decrement ecx and repeat while it is non-zero
! 110: C NOTE(review): size 0 is not tested for, ecx would wrap in the loop above.
! 111: C Presumably callers guarantee size >= 1, confirm against the mpn interface rules.
! 112: popl %ebp
! 113:
! 114: popl %edi
! 115: popl %ebx
! 116:
! 117: movl %esi, %eax C return the final carry limb
! 118: popl %esi
! 119:
! 120: ret
! 121:
! 122:
! 123: C -----------------------------------------------------------------------------
! 124: C The code for each limb is 6 cycles, with instruction decoding being the
! 125: C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
! 126: C cycles/limb in total.
! 127: C
! 128: C The secret ingredient to get 6.25 is to start the loop with the mul and
! 129: C have the load/store pair at the end. Rotating the load/store to the top
! 130: C is a 0.5 c/l slowdown. (Some address generation effect probably.)
! 131: C
! 132: C The whole unrolled loop fits nicely in exactly 80 bytes.
! 133:
! 134:
! 135: ALIGN(16) C already aligned to 16 here actually
! 136: L(unroll):
! 137: movl (%ebx), %eax C eax = src[0], preloaded because the loop starts with the mul
! 138: leal -16(%ebx,%ecx,4), %ebx C ebx = &src[size-4]
! 139:
! 140: leal -16(%edi,%ecx,4), %edi C edi = &dst[size-4]
! 141: subl $4, %ecx
! 142:
! 143: negl %ecx C ecx = 4-size, a negative limb index relative to ebx and edi
! 144:
! 145:
! 146: ALIGN(16) C one byte nop for this alignment
! 147: L(top):
! 148: C eax scratch
! 149: C ebx &src[size-4]
! 150: C ecx counter
! 151: C edx scratch
! 152: C esi carry
! 153: C edi &dst[size-4]
! 154: C ebp multiplier
! 155: C Four identical steps follow: multiply eax, add carry, store, load the next src limb.
! 156: mull %ebp
! 157:
! 158: addl %esi, %eax
! 159: movl $0, %esi
! 160:
! 161: adcl %edx, %esi
! 162:
! 163: movl %eax, (%edi,%ecx,4)
! 164: movl 4(%ebx,%ecx,4), %eax
! 165:
! 166:
! 167: mull %ebp
! 168:
! 169: addl %esi, %eax
! 170: movl $0, %esi
! 171:
! 172: adcl %edx, %esi
! 173:
! 174: movl %eax, 4(%edi,%ecx,4)
! 175: movl 8(%ebx,%ecx,4), %eax
! 176:
! 177:
! 178: mull %ebp
! 179:
! 180: addl %esi, %eax
! 181: movl $0, %esi
! 182:
! 183: adcl %edx, %esi
! 184:
! 185: movl %eax, 8(%edi,%ecx,4)
! 186: movl 12(%ebx,%ecx,4), %eax
! 187:
! 188:
! 189: mull %ebp
! 190:
! 191: addl %esi, %eax
! 192: movl $0, %esi
! 193:
! 194: adcl %edx, %esi
! 195:
! 196: movl %eax, 12(%edi,%ecx,4)
! 197: movl 16(%ebx,%ecx,4), %eax C limb for the next pass or for the finish code
! 198:
! 199:
! 200: addl $4, %ecx
! 201: js L(top) C repeat while the index is still negative
! 202:
! 203:
! 204:
! 205: C eax next src limb
! 206: C ebx &src[size-4]
! 207: C ecx 0 to 3 representing respectively 4 to 1 further limbs
! 208: C edx
! 209: C esi carry
! 210: C edi &dst[size-4]
! 211:
! 212: testb $2, %cl
! 213: jnz L(finish_not_two) C ecx is 2 or 3, skip the two-limb step
! 214:
! 215: mull %ebp
! 216:
! 217: addl %esi, %eax
! 218: movl $0, %esi
! 219:
! 220: adcl %edx, %esi
! 221:
! 222: movl %eax, (%edi,%ecx,4)
! 223: movl 4(%ebx,%ecx,4), %eax
! 224:
! 225:
! 226: mull %ebp
! 227:
! 228: addl %esi, %eax
! 229: movl $0, %esi
! 230:
! 231: adcl %edx, %esi
! 232:
! 233: movl %eax, 4(%edi,%ecx,4)
! 234: movl 8(%ebx,%ecx,4), %eax
! 235:
! 236: addl $2, %ecx C ecx was 0 or 1, now 2 or 3
! 237: L(finish_not_two):
! 238:
! 239:
! 240: testb $1, %cl
! 241: jnz L(finish_not_one) C ecx is 3, only the last limb remains
! 242:
! 243: mull %ebp
! 244:
! 245: addl %esi, %eax
! 246: movl $0, %esi
! 247:
! 248: adcl %edx, %esi
! 249:
! 250: movl %eax, 8(%edi) C dst[size-2]
! 251: movl 12(%ebx), %eax C src[size-1]
! 252: L(finish_not_one):
! 253:
! 254: C Last limb: the carry is finished in edx and the register pops are interleaved.
! 255: mull %ebp
! 256:
! 257: addl %esi, %eax
! 258: popl %ebp C popl does not change flags, the carry flag stays live for the adcl
! 259:
! 260: adcl $0, %edx C edx = final carry limb
! 261:
! 262: movl %eax, 12(%edi) C dst[size-1]
! 263: popl %edi
! 264:
! 265: popl %ebx
! 266: movl %edx, %eax C return the carry limb
! 267:
! 268: popl %esi
! 269:
! 270: ret
! 271:
! 272: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>