version 1.1.1.1, 2000/09/09 14:12:42 |
version 1.1.1.2, 2003/08/25 16:06:29 |
|
|
dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. |
dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. |
dnl |
|
dnl K7: 17.0 cycles/limb. |
|
|
|
|
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. |
|
dnl |
dnl |
dnl This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
dnl |
dnl |
Line 22 dnl License along with the GNU MP Library; see the fi |
|
Line 19 dnl License along with the GNU MP Library; see the fi |
|
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl Suite 330, Boston, MA 02111-1307, USA. |
dnl Suite 330, Boston, MA 02111-1307, USA. |
|
|
|
|
include(`../config.m4') |
include(`../config.m4') |
|
|
|
|
|
C K7: 17.0 cycles/limb. |
|
|
|
|
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, |
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, |
C mp_limb_t carry); |
C mp_limb_t carry); |
|
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, |
|
C mp_limb_t inverse); |
C |
C |
C The code here is the same as mpn_divrem_1, but with the quotient |
C The code here is the same as mpn_divrem_1, but with the quotient |
C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments. |
C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments. |
Line 46 dnl is set to 4 to get the smaller div code used at 3 |
|
Line 47 dnl is set to 4 to get the smaller div code used at 3 |
|
deflit(MUL_THRESHOLD, 4) |
deflit(MUL_THRESHOLD, 4) |
|
|
|
|
defframe(PARAM_CARRY, 16) |
defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1 |
|
defframe(PARAM_CARRY, 16) dnl mpn_mod_1c |
defframe(PARAM_DIVISOR,12) |
defframe(PARAM_DIVISOR,12) |
defframe(PARAM_SIZE, 8) |
defframe(PARAM_SIZE, 8) |
defframe(PARAM_SRC, 4) |
defframe(PARAM_SRC, 4) |
Line 62 defframe(VAR_SRC_STOP,-28) |
|
Line 64 defframe(VAR_SRC_STOP,-28) |
|
|
|
deflit(STACK_SPACE, 28) |
deflit(STACK_SPACE, 28) |
|
|
.text |
TEXT |
|
|
ALIGN(32) |
ALIGN(32) |
|
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size,
C                             mp_limb_t divisor, mp_limb_t inverse);
C
C Entry point taking a caller-supplied inverse.  It sets up the stack
C frame, reduces the high limb against the divisor with a single
C conditional subtract, then dispatches on size: size==1 returns at once
C from L(done_edi), larger sizes jump to L(inverse_one_left),
C L(inverse_two_left) or L(inverse_top), none of which are defined inside
C this PROLOGUE/EPILOGUE pair.
C
C Register state at the point of dispatch:
C   ebx  32            (annotated "32-l"; VAR_NORM is stored as 0 here)
C   ecx  &src[size-4]
C   edx  inverse       (also stored to VAR_INVERSE)
C   esi  src[size-2]
C   edi  high limb after the conditional subtract of the divisor
C   ebp  divisor
C   mm7  32            (only loaded when size>=3)
C
C NOTE(review): storing 0 to VAR_NORM suggests the divisor is expected to
C be already normalized (high bit set) - confirm against the callers.
PROLOGUE(mpn_preinv_mod_1)

deflit(`FRAME',0)

movl PARAM_SRC, %ecx

movl PARAM_SIZE, %eax

subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)



C Saves of the registers used below are interleaved with parameter loads.
movl %ebp, SAVE_EBP

movl PARAM_DIVISOR, %ebp



movl %edi, SAVE_EDI

movl PARAM_INVERSE, %edx



movl %esi, SAVE_ESI

movl -4(%ecx,%eax,4), %edi C src high limb

leal -16(%ecx,%eax,4), %ecx C &src[size-4]



movl %ebx, SAVE_EBX

C Redundant reload: edx already holds PARAM_INVERSE from the load above
C and no instruction in between writes edx.
movl PARAM_INVERSE, %edx



movl $0, VAR_NORM C l==0



C Keep a copy of the high limb in esi so the subtract can be undone.
movl %edi, %esi

subl %ebp, %edi C high-divisor



C The borrow from the subtract means high < divisor: keep high unchanged.
cmovc( %esi, %edi) C restore if underflow

C eax counts the limbs still to process; decl sets ZF for the jz.
decl %eax

jz L(done_edi) C size==1, high-divisor only



C ecx is &src[size-4], so offset 8 addresses src[size-2].
movl 8(%ecx), %esi C src second high limb

movl %edx, VAR_INVERSE



movl $32, %ebx C 32-l

decl %eax

jz L(inverse_one_left) C size==2, one divide



movd %ebx, %mm7 C 32-l

decl %eax

jz L(inverse_two_left) C size==3, two divides



jmp L(inverse_top) C size>=4





C size==1 exit: the remainder is in edi.  ebx is not reloaded here
C because it is not written before the branch to this label.
L(done_edi):

movl SAVE_ESI, %esi

movl SAVE_EBP, %ebp

C Return value goes in eax.
movl %edi, %eax



movl SAVE_EDI, %edi

addl $STACK_SPACE, %esp



ret



EPILOGUE()
|
|
|
|
|
ALIGN(32) |
PROLOGUE(mpn_mod_1c) |
PROLOGUE(mpn_mod_1c) |
deflit(`FRAME',0) |
deflit(`FRAME',0) |
movl PARAM_CARRY, %edx |
movl PARAM_CARRY, %edx |
Line 77 deflit(`FRAME',STACK_SPACE) |
|
Line 135 deflit(`FRAME',STACK_SPACE) |
|
|
|
movl %esi, SAVE_ESI |
movl %esi, SAVE_ESI |
movl PARAM_SRC, %esi |
movl PARAM_SRC, %esi |
jmp LF(mpn_mod_1,start_1c) |
jmp L(start_1c) |
|
|
EPILOGUE() |
EPILOGUE() |
|
|
Line 176 L(mul_by_inverse): |
|
Line 234 L(mul_by_inverse): |
|
bsrl %ebp, %eax C 31-l |
bsrl %ebp, %eax C 31-l |
|
|
movl %ebx, SAVE_EBX |
movl %ebx, SAVE_EBX |
leal -4(%esi), %ebx |
movl %ecx, %ebx C size |
|
|
movl %ebx, VAR_SRC_STOP |
|
movl %edi, SAVE_EDI |
movl %edi, SAVE_EDI |
|
|
movl %ecx, %ebx C size |
|
movl $31, %ecx |
movl $31, %ecx |
|
|
movl %edx, %edi C carry |
movl %edx, %edi C carry |
Line 195 L(mul_by_inverse): |
|
Line 250 L(mul_by_inverse): |
|
shll %cl, %ebp C d normalized |
shll %cl, %ebp C d normalized |
movl %ecx, VAR_NORM |
movl %ecx, VAR_NORM |
|
|
movd %eax, %mm7 |
movd %eax, %mm7 C 32-l |
|
|
movl $-1, %eax |
movl $-1, %eax |
subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 |
subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 |
|
|
C |
C |
|
|
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) |
leal 1(%edi), %ebx C n2+1 |
movl %ebp, %eax C d |
movl %ebp, %eax C d |
|
|
C |
C |
|
|
mull %ebx C (q1+1)*d |
mull %ebx C (q1+1)*d |
|
|
psrlq %mm7, %mm0 |
psrlq %mm7, %mm0 |
leal 0(%ecx), %ecx C dummy |
leal (%ecx), %ecx C dummy |
|
|
C |
C |
|
|
C |
C |
|
|
subl %eax, %esi |
subl %eax, %esi C low n - (q1+1)*d |
movl VAR_SRC_STOP, %eax |
movl PARAM_SRC, %eax |
|
|
C |
C |
|
|
sbbl %edx, %edi C n - (q1+1)*d |
sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1 |
movl %esi, %edi C remainder -> n2 |
movl %esi, %edi C remainder -> n2 |
leal (%ebp,%esi), %edx |
leal (%ebp,%esi), %edx |
|
|
|
|
|
|
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 |
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 |
cmpl %eax, %ecx |
cmpl %eax, %ecx |
jne L(inverse_top) |
jae L(inverse_top) |
|
|
|
|
L(inverse_loop_done): |
L(inverse_loop_done): |
Line 337 L(inverse_two_left): |
|
Line 392 L(inverse_two_left): |
|
C |
C |
|
|
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) |
leal 1(%edi), %ebx C n2+1 |
movl %ebp, %eax C d |
movl %ebp, %eax C d |
|
|
adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 |
adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 |
Line 365 L(inverse_two_left): |
|
Line 420 L(inverse_two_left): |
|
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 |
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 |
|
|
|
|
C One limb left |
L(inverse_one_left): |
|
|
C eax scratch |
C eax scratch |
C ebx scratch (nadj, q1) |
C ebx scratch (nadj, q1) |
C ecx |
C ecx |
|
|
C |
C |
|
|
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag |
leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) |
leal 1(%edi), %ebx C n2+1 |
movl %ebp, %eax C d |
movl %ebp, %eax C d |
|
|
C |
C |
|
|
C edi (n2) |
C edi (n2) |
C ebp divisor |
C ebp divisor |
|
|
movl VAR_SRC_STOP, %edx |
movl PARAM_SRC, %edx |
leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 |
leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 |
psrlq %mm7, %mm0 |
psrlq %mm7, %mm0 |
|
|
movd %mm0, %esi C next n10 |
movd %mm0, %esi C next n10 |
|
|
cmpl %ecx, %edx |
cmpl %edx, %ecx |
jne L(inverse_top) |
jae L(inverse_top) |
jmp L(inverse_loop_done) |
jmp L(inverse_loop_done) |
|
|
EPILOGUE() |
EPILOGUE() |