=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/x86/k7/Attic/sqr_basecase.asm,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.2 diff -u -p -r1.1.1.1 -r1.1.1.2 --- OpenXM_contrib/gmp/mpn/x86/k7/Attic/sqr_basecase.asm 2000/09/09 14:12:42 1.1.1.1 +++ OpenXM_contrib/gmp/mpn/x86/k7/Attic/sqr_basecase.asm 2003/08/25 16:06:29 1.1.1.2 @@ -1,11 +1,6 @@ dnl AMD K7 mpn_sqr_basecase -- square an mpn number. -dnl -dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product -dnl (measured on the speed difference between 25 and 50 limbs, which is -dnl roughly the Karatsuba recursing range). - -dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -24,25 +19,29 @@ dnl License along with the GNU MP Library; see the fi dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. - include(`../config.m4') +C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product +C (measured on the speed difference between 25 and 50 limbs, which is +C roughly the Karatsuba recursing range). + + dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for dnl some comments. -deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) +deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66) -ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', -`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') +ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE', +`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)') -m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') -deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) +m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD') +deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3)) C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); C -C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes, +C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes, C which is quite a bit, but is considered good value since squares big C enough to use most of the code will be spending quite a few cycles in it. @@ -51,7 +50,7 @@ defframe(PARAM_SIZE,12) defframe(PARAM_SRC, 8) defframe(PARAM_DST, 4) - .text + TEXT ALIGN(32) PROLOGUE(mpn_sqr_basecase) deflit(`FRAME',0) @@ -468,8 +467,8 @@ Zdisp( movl, disp_src,(%esi), %eax) mull %ebp -dnl Zdisp( addl %ebx, disp_src,(%edi)) - addl %ebx, disp_dst(%edi) +Zdisp( addl, %ebx, disp_dst,(%edi)) + ifelse(forloop_last,0, ` movl $0, %ebx')