Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/lshift.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel P5 mpn_lshift -- mpn left shift.
! 2: dnl
! 3: dnl P5: 1.75 cycles/limb.
! 4:
! 5:
! 6: dnl Copyright (C) 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 30: C unsigned shift);
! 31: C
! 32: C Shift src,size left by shift many bits and store the result in dst,size.
! 33: C Zeros are shifted in at the right. Return the bits shifted out at the
! 34: C left.
! 35: C
! 36: C The comments in mpn_rshift apply here too.
! 37:
! 38: defframe(PARAM_SHIFT,16) C 4th argument of mpn_lshift: shift
! 39: defframe(PARAM_SIZE, 12) C 3rd argument: size, counted in limbs
! 40: defframe(PARAM_SRC, 8) C 2nd argument: src
! 41: defframe(PARAM_DST, 4) C 1st argument: dst
! 42: deflit(`FRAME',0) C NOTE(review): presumably the bytes pushed since entry, used by the PARAM_ macros -- confirm in the m4 definitions
! 43:
! 44: dnl minimum 5, because the unrolled loop can't handle less
! 45: deflit(UNROLL_THRESHOLD, 5) C sizes at or above this take the L(unroll) code
! 46:
! 47: .text
! 48: ALIGN(8)
! 49:
! 50: PROLOGUE(mpn_lshift)
! 51: C Entry: load the four parameters, then pick the unrolled, simple or size==1 code.
! 52: pushl %ebx C popped again on every return path
! 53: pushl %edi C likewise
! 54: deflit(`FRAME',8)
! 55:
! 56: movl PARAM_SIZE, %eax
! 57: movl PARAM_DST, %edx
! 58:
! 59: movl PARAM_SRC, %ebx
! 60: movl PARAM_SHIFT, %ecx
! 61:
! 62: cmp $UNROLL_THRESHOLD, %eax
! 63: jae L(unroll) C unsigned test, size >= UNROLL_THRESHOLD
! 64:
! 65: movl -4(%ebx,%eax,4), %edi C src high limb
! 66: decl %eax C eax = size-1, ZF set when size was 1
! 67:
! 68: jnz L(simple) C size-1 is non-zero, use the simple loop
! 69:
! 70: shldl( %cl, %edi, %eax) C eax was decremented to zero
! 71: C size==1: eax now has the bits shifted out of edi, the return value (assumes shldl() expands to x86 shld)
! 72: shll %cl, %edi
! 73:
! 74: movl %edi, (%edx) C dst low limb
! 75: popl %edi C risk of data cache bank clash
! 76:
! 77: popl %ebx
! 78:
! 79: ret
! 80:
! 81:
! 82: C -----------------------------------------------------------------------------
! 83: L(simple):
! 84: C eax size-1
! 85: C ebx src
! 86: C ecx shift
! 87: C edx dst
! 88: C esi
! 89: C edi
! 90: C ebp
! 91: deflit(`FRAME',8)
! 92: C Simple loop, one dst limb per pass, working from the high limb down.
! 93: movd (%ebx,%eax,4), %mm5 C src high limb
! 94:
! 95: movd %ecx, %mm6 C lshift
! 96: negl %ecx
! 97:
! 98: psllq %mm6, %mm5 C high dword of mm5 = bits shifted out of the high limb
! 99: addl $32, %ecx C ecx = 32-shift
! 100:
! 101: movd %ecx, %mm7
! 102: psrlq $32, %mm5 C retval
! 103:
! 104:
! 105: L(simple_top):
! 106: C eax counter, limbs, counting down from size-1 to 1
! 107: C ebx src
! 108: C ecx
! 109: C edx dst
! 110: C esi
! 111: C edi
! 112: C
! 113: C mm0 scratch
! 114: C mm5 return value
! 115: C mm6 shift
! 116: C mm7 32-shift
! 117:
! 118: movq -4(%ebx,%eax,4), %mm0 C with k = eax: src limbs k-1 (low dword) and k (high dword)
! 119: decl %eax C eax = k-1, ZF set on the last pass
! 120:
! 121: psrlq %mm7, %mm0 C low dword = (limb k << shift) | (limb k-1 >> (32-shift))
! 122:
! 123: C
! 124:
! 125: movd %mm0, 4(%edx,%eax,4) C store dst limb k
! 126: jnz L(simple_top) C ZF is still the one from the decl
! 127:
! 128:
! 129: movd (%ebx), %mm0 C src low limb
! 130:
! 131: movd %mm5, %eax C return value
! 132: psllq %mm6, %mm0
! 133:
! 134: popl %edi
! 135: popl %ebx
! 136:
! 137: movd %mm0, (%edx) C dst low limb, zeros shifted in at the bottom
! 138:
! 139: emms C empty the MMX state before returning
! 140:
! 141: ret
! 142:
! 143:
! 144: C -----------------------------------------------------------------------------
! 145: ALIGN(8)
! 146: L(unroll):
! 147: C eax size
! 148: C ebx src
! 149: C ecx shift
! 150: C edx dst
! 151: C esi
! 152: C edi
! 153: C ebp
! 154: deflit(`FRAME',8)
! 155: C Set-up for the unrolled loop: bring the top of src, then of dst, to 0 mod 8.
! 156: movd -4(%ebx,%eax,4), %mm5 C src high limb
! 157: leal (%ebx,%eax,4), %edi C edi = src + 4*size, one past the high limb
! 158:
! 159: movd %ecx, %mm6 C lshift
! 160: andl $4, %edi C zero when src + 4*size is 0 mod 8 (for limb-aligned src)
! 161:
! 162: psllq %mm6, %mm5
! 163: jz L(start_src_aligned) C flags are from the andl, psllq leaves them alone
! 164:
! 165:
! 166: C src isn't aligned, process high limb separately (marked xxx) to
! 167: C make it so.
! 168: C
! 169: C source -8(ebx,%eax,4)
! 170: C |
! 171: C +-------+-------+-------+--
! 172: C | |
! 173: C +-------+-------+-------+--
! 174: C 0mod8 4mod8 0mod8
! 175: C
! 176: C dest
! 177: C -4(edx,%eax,4)
! 178: C |
! 179: C +-------+-------+--
! 180: C | xxx | |
! 181: C +-------+-------+--
! 182:
! 183: movq -8(%ebx,%eax,4), %mm0 C unaligned load
! 184:
! 185: psllq %mm6, %mm0
! 186: decl %eax C high limb is dealt with here, one limb fewer for the loop
! 187:
! 188: psrlq $32, %mm0 C keep the high dword, the shifted high limb
! 189:
! 190: C
! 191:
! 192: movd %mm0, (%edx,%eax,4) C dst high limb (eax already decremented)
! 193: L(start_src_aligned):
! 194:
! 195: movq -8(%ebx,%eax,4), %mm1 C src high qword
! 196: leal (%edx,%eax,4), %edi C edi = dst + 4*eax, one past the highest limb still to store
! 197:
! 198: andl $4, %edi C zero when that address is 0 mod 8 (for limb-aligned dst)
! 199: psrlq $32, %mm5 C return value
! 200:
! 201: movq -16(%ebx,%eax,4), %mm3 C src second highest qword
! 202: jz L(start_dst_aligned) C flags are from the andl on dst
! 203:
! 204: C dst isn't aligned, subtract 4 to make it so, and pretend the shift
! 205: C is 32 bits extra. High limb of dst (marked xxx) handled here
! 206: C separately.
! 207: C
! 208: C source -8(ebx,%eax,4)
! 209: C |
! 210: C +-------+-------+--
! 211: C | mm1 |
! 212: C +-------+-------+--
! 213: C 0mod8 4mod8
! 214: C
! 215: C dest
! 216: C -4(edx,%eax,4)
! 217: C |
! 218: C +-------+-------+-------+--
! 219: C | xxx | |
! 220: C +-------+-------+-------+--
! 221: C 0mod8 4mod8 0mod8
! 222:
! 223: movq %mm1, %mm0
! 224: addl $32, %ecx C new shift
! 225:
! 226: psllq %mm6, %mm0 C still the original shift count in mm6
! 227:
! 228: movd %ecx, %mm6 C mm6 = shift+32 from here on
! 229: psrlq $32, %mm0
! 230:
! 231: C wasted cycle here waiting for %mm0
! 232:
! 233: movd %mm0, -4(%edx,%eax,4) C dst high limb
! 234: subl $4, %edx C dst-4, as described above
! 235: L(start_dst_aligned):
! 236:
! 237:
! 238: psllq %mm6, %mm1
! 239: negl %ecx C -shift
! 240:
! 241: addl $64, %ecx C 64-shift
! 242: movq %mm3, %mm2 C unshifted copy, its left shift is done later
! 243:
! 244: movd %ecx, %mm7 C mm7 = right shift count, 64 minus the count in mm6
! 245: subl $8, %eax C size-8
! 246:
! 247: psrlq %mm7, %mm3
! 248:
! 249: por %mm1, %mm3 C mm3 ready to store
! 250: jc L(finish) C borrow from the subl: fewer than 8 limbs, skip the loop
! 251:
! 252:
! 253: C The comments in mpn_rshift apply here too.
! 254: C Main loop: each pass stores two dst qwords (four limbs), working downwards.
! 255: ALIGN(8)
! 256: L(unroll_loop):
! 257: C eax counter, limbs
! 258: C ebx src
! 259: C ecx
! 260: C edx dst
! 261: C esi
! 262: C edi
! 263: C
! 264: C mm0
! 265: C mm1
! 266: C mm2 src qword from 16(%ebx,%eax,4)
! 267: C mm3 dst qword ready to store to 24(%edx,%eax,4)
! 268: C
! 269: C mm5 return value
! 270: C mm6 lshift
! 271: C mm7 rshift
! 272:
! 273: movq 8(%ebx,%eax,4), %mm0 C next lower src qword
! 274: psllq %mm6, %mm2 C upper part of the dst qword for 16(%edx,%eax,4)
! 275:
! 276: movq %mm0, %mm1 C keep an unshifted copy for the left shift below
! 277: psrlq %mm7, %mm0 C bits carried up out of the lower qword
! 278:
! 279: movq %mm3, 24(%edx,%eax,4) C prev
! 280: por %mm2, %mm0
! 281:
! 282: movq (%ebx,%eax,4), %mm3 C
! 283: psllq %mm6, %mm1 C
! 284:
! 285: movq %mm0, 16(%edx,%eax,4)
! 286: movq %mm3, %mm2 C
! 287:
! 288: psrlq %mm7, %mm3 C
! 289: subl $4, %eax C four limbs done per pass
! 290:
! 291: por %mm1, %mm3 C
! 292: jnc L(unroll_loop) C repeat until the subl borrows
! 293:
! 294:
! 295:
! 296: L(finish):
! 297: C eax -4 to -1 representing respectively 0 to 3 limbs remaining
! 298: C Tail: one more qword step if 2 or 3 limbs remain, then the 0 or 1 limb cases.
! 299: testb $2, %al C bit 1 is set for -2 and -1, that is 2 or 3 limbs remaining
! 300:
! 301: jz L(finish_no_two)
! 302:
! 303: movq 8(%ebx,%eax,4), %mm0 C next lower src qword
! 304: psllq %mm6, %mm2
! 305:
! 306: movq %mm0, %mm1
! 307: psrlq %mm7, %mm0
! 308:
! 309: movq %mm3, 24(%edx,%eax,4) C prev
! 310: por %mm2, %mm0
! 311:
! 312: movq %mm1, %mm2 C the qword just read becomes the new "src prev qword"
! 313: movq %mm0, %mm3 C and the combined qword is the new pending dst qword
! 314:
! 315: subl $2, %eax C two limbs done, eax is now -4 or -3
! 316: L(finish_no_two):
! 317:
! 318:
! 319: C eax -4 or -3 representing respectively 0 or 1 limbs remaining
! 320: C
! 321: C mm2 src prev qword, from 16(%ebx,%eax,4)
! 322: C mm3 dst qword, for 24(%edx,%eax,4)
! 323:
! 324: testb $1, %al C bit 0 is set for -3, that is 1 limb remaining
! 325: movd %mm5, %eax C retval
! 326:
! 327: popl %edi
! 328: jz L(finish_zero) C flags are from the testb, movd and popl leave them alone
! 329:
! 330:
! 331: C One extra src limb, destination was aligned.
! 332: C
! 333: C source ebx
! 334: C --+---------------+-------+
! 335: C | mm2 | |
! 336: C --+---------------+-------+
! 337: C
! 338: C dest edx+12 edx+4 edx
! 339: C --+---------------+---------------+-------+
! 340: C | mm3 | | |
! 341: C --+---------------+---------------+-------+
! 342: C
! 343: C mm6 = shift
! 344: C mm7 = ecx = 64-shift
! 345:
! 346:
! 347: C One extra src limb, destination was unaligned.
! 348: C
! 349: C source ebx
! 350: C --+---------------+-------+
! 351: C | mm2 | |
! 352: C --+---------------+-------+
! 353: C
! 354: C dest edx+12 edx+4
! 355: C --+---------------+---------------+
! 356: C | mm3 | |
! 357: C --+---------------+---------------+
! 358: C
! 359: C mm6 = shift+32
! 360: C mm7 = ecx = 64-(shift+32)
! 361:
! 362:
! 363: C In both cases there's one extra limb of src to fetch and combine
! 364: C with mm2 to make a qword at 4(%edx), and in the aligned case
! 365: C there's an extra limb of dst to be formed from that extra src limb
! 366: C left shifted.
! 367:
! 368:
! 369: movd (%ebx), %mm0 C the extra (lowest) src limb
! 370: psllq %mm6, %mm2
! 371:
! 372: movq %mm3, 12(%edx)
! 373: psllq $32, %mm0 C move the limb to the high dword, low dword zero
! 374:
! 375: movq %mm0, %mm1
! 376: psrlq %mm7, %mm0
! 377:
! 378: por %mm2, %mm0
! 379: psllq %mm6, %mm1
! 380:
! 381: movq %mm0, 4(%edx)
! 382: psrlq $32, %mm1 C low dword = extra limb << shift (used in the aligned case)
! 383:
! 384: andl $32, %ecx C bit 5: set for 64-shift, clear for 64-(shift+32), assuming 1 <= shift <= 31
! 385: popl %ebx
! 386:
! 387: jz L(finish_one_unaligned) C flags are from the andl
! 388:
! 389: movd %mm1, (%edx) C aligned case only, the dst low limb
! 390: L(finish_one_unaligned):
! 391:
! 392: emms C empty the MMX state before returning
! 393:
! 394: ret
! 395:
! 396:
! 397: L(finish_zero):
! 398: C Tail with no src limbs left to fetch; mm3 is pending and mm2 holds the lowest src qword.
! 399: C No extra src limbs, destination was aligned.
! 400: C
! 401: C source ebx
! 402: C --+---------------+
! 403: C | mm2 |
! 404: C --+---------------+
! 405: C
! 406: C dest edx+8 edx
! 407: C --+---------------+---------------+
! 408: C | mm3 | |
! 409: C --+---------------+---------------+
! 410: C
! 411: C mm6 = shift
! 412: C mm7 = ecx = 64-shift
! 413:
! 414:
! 415: C No extra src limbs, destination was unaligned.
! 416: C
! 417: C source ebx
! 418: C --+---------------+
! 419: C | mm2 |
! 420: C --+---------------+
! 421: C
! 422: C dest edx+8 edx+4
! 423: C --+---------------+-------+
! 424: C | mm3 | |
! 425: C --+---------------+-------+
! 426: C
! 427: C mm6 = shift+32
! 428: C mm7 = ecx = 64-(shift+32)
! 429:
! 430:
! 431: C The movd for the unaligned case writes the same data to 4(%edx)
! 432: C that the movq does for the aligned case.
! 433:
! 434:
! 435: movq %mm3, 8(%edx)
! 436: andl $32, %ecx C bit 5: set for 64-shift, clear for 64-(shift+32), assuming 1 <= shift <= 31
! 437:
! 438: psllq %mm6, %mm2
! 439: jz L(finish_zero_unaligned) C flags are from the andl, psllq leaves them alone
! 440:
! 441: movq %mm2, (%edx) C aligned case only, the low dst qword
! 442: L(finish_zero_unaligned):
! 443:
! 444: psrlq $32, %mm2 C high dword of the shifted qword, for 4(%edx)
! 445: popl %ebx
! 446:
! 447: movd %mm5, %eax C retval; NOTE(review): eax was already loaded from mm5 before the jz to L(finish_zero), so this repeats it
! 448:
! 449: movd %mm2, 4(%edx)
! 450:
! 451: emms C empty the MMX state before returning
! 452:
! 453: ret
! 454:
! 455: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>