Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium-4 mpn_sqr_basecase -- square an mpn number.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P4: approx 3.5 cycles per crossproduct, or 7 cycles per triangular
! 26: C product, at around 30x30 limbs.
! 27:
! 28:
! 29: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 30: C
! 31: C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
! 32: C lot of function call overheads are avoided, especially when the size is
! 33: C small.
! 34: C
! 35: C On small sizes there's only a small speedup over mpn_mul_basecase;
! 36: C presumably branch mispredictions are a bigger fraction of the work done.
! 37: C It's not clear how to help this.
! 38:
! 39: defframe(PARAM_SIZE,12)
! 40: defframe(PARAM_SRC, 8)
! 41: defframe(PARAM_DST, 4)
! 42:
! 43: TEXT
! 44: ALIGN(8)
! 45: PROLOGUE(mpn_sqr_basecase)
! 46: deflit(`FRAME',0)
! 47: C Square src[0..size-1] into dst[0..2*size-1], dispatching on size first.
! 48: movl PARAM_SIZE, %edx C edx = size
! 49: movl PARAM_SRC, %eax C eax = src
! 50: movl PARAM_DST, %ecx C ecx = dst
! 51:
! 52: cmpl $2, %edx
! 53:
! 54: je L(two_limbs) C size == 2
! 55: ja L(three_or_more) C size > 2, unsigned compare
! 56:
! 57: C -----------------------------------------------------------------------------
! 58: C one limb only (fall through when size < 2)
! 59: C eax src
! 60: C ebx
! 61: C ecx dst
! 62: C edx
! 63:
! 64: movl (%eax), %eax C src[0]
! 65: mull %eax C edx:eax = src[0]^2
! 66:
! 67: movl %eax, (%ecx) C dst[0]
! 68: movl %edx, 4(%ecx) C dst[1]
! 69:
! 70: ret C no MMX register was touched on this path, so no emms
! 71:
! 72: C -----------------------------------------------------------------------------
! 73: L(two_limbs):
! 74: C eax src
! 75: C ebx
! 76: C ecx dst
! 77: C edx size
! 78:
! 79: movd (%eax), %mm1 C src[0]
! 80: movd 4(%eax), %mm0 C src[1]
! 81: pmuludq %mm1, %mm0 C src[0]*src[1]
! 82:
! 83: pmuludq %mm1, %mm1 C src[0]^2
! 84:
! 85: movd 4(%eax), %mm2 C src[1] again
! 86: pmuludq %mm2, %mm2 C src[1]^2
! 87:
! 88: movd %mm1, (%ecx) C dst[0]
! 89: psrlq $32, %mm1 C high(src[0]^2)
! 90:
! 91: pcmpeqd %mm3, %mm3
! 92: psrlq $32, %mm3 C 0x00000000FFFFFFFF
! 93: pand %mm0, %mm3 C low(src[0]*src[1])
! 94: psrlq $32, %mm0 C high(src[0]*src[1])
! 95:
! 96: psllq $1, %mm3 C 2*low(src[0]*src[1])
! 97: paddq %mm3, %mm1 C high(src[0]^2)
! 98: movd %mm1, 4(%ecx) C dst[1]
! 99:
! 100: pcmpeqd %mm4, %mm4
! 101: psrlq $32, %mm4 C 0x00000000FFFFFFFF
! 102: pand %mm2, %mm4 C low(src[1]^2)
! 103: psrlq $32, %mm2 C high(src[1]^2)
! 104:
! 105: psllq $1, %mm0 C 2*high(src[0]*src[1])
! 106: psrlq $32, %mm1 C carry
! 107: paddq %mm1, %mm0 C 2*high(src[0]*src[1]) + carry
! 108: paddq %mm4, %mm0 C low(src[1]^2)
! 109: movd %mm0, 8(%ecx) C dst[2]
! 110:
! 111: psrlq $32, %mm0 C carry
! 112: paddq %mm2, %mm0 C high(src[1]^2)
! 113: movd %mm0, 12(%ecx) C dst[3]
! 114:
! 115: ASSERT(z,`
! 116: psrlq $32, %mm0
! 117: movd %mm0, %eax
! 118: orl %eax, %eax')
! 119:
! 120: emms C reset MMX state before returning
! 121: ret
! 122:
! 123:
! 124: C -----------------------------------------------------------------------------
! 125: L(three_or_more):
! 126:
! 127: C eax src
! 128: C ebx
! 129: C ecx dst
! 130: C edx size
! 131: C esi
! 132: C edi
! 133: C ebp
! 134: C
! 135: C First multiply src[0]*src[1..size-1] and store at dst[1..size-1], with the top limb left as a carry in mm0.
! 136:
! 137: defframe(SAVE_ESI, -4)
! 138: defframe(SAVE_EDI, -8)
! 139: defframe(SAVE_EBP, -12)
! 140: deflit(STACK_SPACE, 12)
! 141:
! 142: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 143: pxor %mm0, %mm0 C initial carry
! 144: movd (%eax), %mm7 C multiplier
! 145:
! 146: movl %esi, SAVE_ESI C esi, edi and ebp are restored before the final ret
! 147: movl %edi, SAVE_EDI
! 148: movl %ebp, SAVE_EBP
! 149:
! 150:
! 151: movl %eax, %esi C esi = src
! 152: movl %ecx, %edi C edi = dst
! 153: subl $1, %edx C counter = size-1
! 154:
! 155: C First multiply src[0]*src[1..size-1] and store at dst[1..size-1], with the top limb left as a carry in mm0.
! 156: L(mul1):
! 157: C eax src, incrementing
! 158: C ebx
! 159: C ecx dst, incrementing
! 160: C edx counter, size-1 iterations
! 161: C esi src
! 162: C edi dst
! 163: C ebp
! 164: C
! 165: C mm0 carry limb
! 166: C mm7 multiplier
! 167:
! 168: movd 4(%eax), %mm1 C next src limb
! 169: addl $4, %eax
! 170: pmuludq %mm7, %mm1 C src[0] * src limb
! 171: paddq %mm1, %mm0 C add incoming carry
! 172: movd %mm0, 4(%ecx) C store low 32 bits
! 173: addl $4, %ecx
! 174: psrlq $32, %mm0 C high 32 bits become the next carry
! 175: subl $1, %edx
! 176: jnz L(mul1)
! 177:
! 178:
! 179: movl PARAM_SIZE, %ebp
! 180: subl $3, %ebp C ebp = size-3, the outer loop count
! 181: jz L(corner) C size == 3, only the corner product remains
! 182:
! 183:
! 184: C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
! 185: C n=1..size-3. The final product src[size-2]*src[size-1], which is
! 186: C the end corner of the product triangle, is handled separately to
! 187: C save looping overhead.
! 188:
! 189: L(outer):
! 190: C eax
! 191: C ebx
! 192: C ecx
! 193: C edx
! 194: C esi src, incrementing
! 195: C edi dst, incrementing
! 196: C ebp size, decrementing
! 197: C
! 198: C mm0 prev carry
! 199:
! 200: movd 4(%esi), %mm7 C multiplier
! 201: movd %mm0, 4(%ecx) C prev carry
! 202:
! 203: leal 8(%esi), %eax C next src
! 204: addl $4, %esi
! 205:
! 206: leal 8(%edi), %ecx C next dst
! 207: addl $8, %edi
! 208:
! 209: leal 1(%ebp), %edx C counter
! 210:
! 211: pxor %mm0, %mm0 C initial carry limb
! 212:
! 213: L(inner):
! 214: C eax src, incrementing
! 215: C ebx
! 216: C ecx dst, incrementing
! 217: C edx counter
! 218: C esi outer src
! 219: C edi outer dst
! 220: C ebp outer size
! 221: C
! 222: C mm0 carry
! 223:
! 224: movd (%eax), %mm1 C src limb
! 225: leal 4(%eax), %eax C advance src
! 226: movd 4(%ecx),%mm2 C dst limb accumulated so far
! 227: pmuludq %mm7, %mm1 C multiplier * src limb
! 228: paddq %mm2, %mm1 C + dst limb
! 229: paddq %mm1, %mm0 C + carry
! 230: subl $1, %edx C sets the flags tested by the jnz below
! 231: movd %mm0, 4(%ecx) C store low 32 bits
! 232: psrlq $32, %mm0 C high 32 bits become the next carry
! 233: leal 4(%ecx), %ecx C advance dst, leal leaves the flags from subl intact
! 234: jnz L(inner)
! 235:
! 236: subl $1, %ebp
! 237: jnz L(outer)
! 238:
! 239:
! 240: L(corner):
! 241: C esi &src[size-3]
! 242: C edi &dst[2*size-6]
! 243: C mm0 carry
! 244: C
! 245: C +-----+-----+--
! 246: C | mm0 | dst so far
! 247: C +-----+-----+--
! 248: C +-----+-----+
! 249: C | | | src[size-2]*src[size-1]
! 250: C +-----+-----+
! 251:
! 252: movd 4(%esi), %mm1 C src[size-2]
! 253: movd 8(%esi), %mm2 C src[size-1]
! 254: pmuludq %mm2, %mm1 C src[size-1]*src[size-2]
! 255:
! 256: movl PARAM_SRC, %eax C eax = src, reloaded for the diagonal pass
! 257: movd (%eax), %mm2 C src[0]
! 258: pmuludq %mm2, %mm2 C src[0]^2
! 259:
! 260: pcmpeqd %mm7, %mm7
! 261: psrlq $32, %mm7 C 0x00000000FFFFFFFF
! 262:
! 263: movl PARAM_DST, %edx C edx = dst, reloaded for the diagonal pass
! 264: movd 4(%edx), %mm3 C dst[1]
! 265:
! 266: paddq %mm1, %mm0 C carry + corner product
! 267: movd %mm0, 12(%edi) C dst[2*size-3]
! 268:
! 269: psrlq $32, %mm0
! 270: movd %mm0, 16(%edi) C dst[2*size-2]
! 271:
! 272: movd %mm2, (%edx) C dst[0]
! 273: psrlq $32, %mm2 C high(src[0]^2)
! 274:
! 275: psllq $1, %mm3 C 2*dst[1]
! 276: paddq %mm3, %mm2 C high(src[0]^2) + 2*dst[1]
! 277: movd %mm2, 4(%edx) C dst[1]
! 278: psrlq $32, %mm2 C carry into the diagonal loop
! 279:
! 280: movl PARAM_SIZE, %ecx
! 281: subl $2, %ecx C counter = size-2
! 282:
! 283: C Now form squares on the diagonal src[0]^2,...,src[size-1]^2, and
! 284: C add to the triangular parts dst[1..2*size-2] with those left
! 285: C shifted by 1 bit.
! 286:
! 287: L(diag):
! 288: C eax src, incrementing
! 289: C ebx
! 290: C ecx counter, size-2 iterations
! 291: C edx dst, incrementing
! 292: C esi
! 293: C edi
! 294: C ebp
! 295: C
! 296: C mm2 carry
! 297: C mm7 0x00000000FFFFFFFF
! 298:
! 299: movd 4(%eax), %mm0 C src limb
! 300: addl $4, %eax
! 301: pmuludq %mm0, %mm0 C src limb squared
! 302: movq %mm7, %mm1 C copy of the low-half mask
! 303: pand %mm0, %mm1 C diagonal low
! 304: psrlq $32, %mm0 C diagonal high
! 305:
! 306: movd 8(%edx), %mm3
! 307: psllq $1, %mm3 C 2*dst[i]
! 308: paddq %mm3, %mm1 C diagonal low + doubled limb
! 309: paddq %mm1, %mm2 C + carry
! 310: movd %mm2, 8(%edx) C store low 32 bits
! 311: psrlq $32, %mm2 C carry
! 312:
! 313: movd 12(%edx), %mm3
! 314: psllq $1, %mm3 C 2*dst[i+1]
! 315: paddq %mm3, %mm0 C diagonal high + doubled limb
! 316: paddq %mm0, %mm2 C + carry
! 317: movd %mm2, 12(%edx) C store low 32 bits
! 318: addl $8, %edx C two dst limbs are finished per iteration
! 319: psrlq $32, %mm2 C carry
! 320:
! 321: subl $1, %ecx
! 322: jnz L(diag)
! 323:
! 324:
! 325: movd 4(%eax), %mm0 C src[size-1]
! 326: pmuludq %mm0, %mm0 C src[size-1]^2
! 327: pand %mm0, %mm7 C diagonal low
! 328: psrlq $32, %mm0 C diagonal high
! 329:
! 330: movd 8(%edx), %mm3 C dst[2*size-2]
! 331: psllq $1, %mm3 C 2*dst[2*size-2]
! 332: paddq %mm3, %mm7 C diagonal low + doubled limb
! 333: paddq %mm7, %mm2 C + carry
! 334: movd %mm2, 8(%edx) C dst[2*size-2]
! 335: psrlq $32, %mm2 C carry
! 336:
! 337: paddq %mm0, %mm2 C carry + diagonal high
! 338: movd %mm2, 12(%edx) C dst[2*size-1]
! 339:
! 340: ASSERT(z,` C no further carry
! 341: psrlq $32, %mm2
! 342: movd %mm2, %eax
! 343: orl %eax, %eax')
! 344:
! 345:
! 346: movl SAVE_ESI, %esi C restore the registers saved on entry to this path
! 347: movl SAVE_EDI, %edi
! 348: movl SAVE_EBP, %ebp
! 349: addl $STACK_SPACE, %esp
! 350: emms C reset MMX state before returning
! 351: ret
! 352:
! 353: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>