version 1.1.1.1, 2000/09/09 14:12:42 |
version 1.1.1.2, 2003/08/25 16:06:28 |
|
|
dnl AMD K6 mpn_sqr_basecase -- square an mpn number. |
dnl AMD K6 mpn_sqr_basecase -- square an mpn number. |
dnl |
|
dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular |
|
dnl product (measured on the speed difference between 17 and 33 limbs, |
|
dnl which is roughly the Karatsuba recursing range). |
|
|
|
|
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. |
|
dnl |
dnl |
dnl This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
dnl |
dnl |
Line 24 dnl License along with the GNU MP Library; see the file COPYING.LIB. If |
|
Line 19 dnl License along with the GNU MP Library; see the file COPYING.LIB. If |
|
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl Suite 330, Boston, MA 02111-1307, USA. |
dnl Suite 330, Boston, MA 02111-1307, USA. |
|
|
|
|
include(`../config.m4') |
include(`../config.m4') |
|
|
|
|
dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this |
C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular |
|
C product (measured on the speed difference between 17 and 33 limbs, |
|
C which is roughly the Karatsuba recursing range). |
|
|
|
|
|
dnl SQR_KARATSUBA_THRESHOLD_MAX is the maximum SQR_KARATSUBA_THRESHOLD this |
dnl code supports. This value is used only by the tune program to know |
dnl code supports. This value is used only by the tune program to know |
dnl what it can go up to. (An attempt to compile with a bigger value will |
dnl what it can go up to. (An attempt to compile with a bigger value will |
dnl trigger some m4_assert()s in the code, making the build fail.) |
dnl trigger some m4_assert()s in the code, making the build fail.) |
dnl |
dnl |
dnl The value is determined by requiring the displacements in the unrolled |
dnl The value is determined by requiring the displacements in the unrolled |
dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of |
dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of |
dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66. |
dnl 63, giving a maximum SQR_KARATSUBA_THRESHOLD of 66. |
|
|
dnl Largest threshold value this code supports (66), defined under both the
dnl KARATSUBA_SQR_ and the SQR_KARATSUBA_ spelling of the name.
deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) |
deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66) |
|
|
|
|
dnl Allow a value from the tune program to override config.m4. |
dnl Allow a value from the tune program to override config.m4. |
|
|
dnl When a *_THRESHOLD_OVERRIDE macro is defined, redefine the matching
dnl threshold macro to the override value. Both the KARATSUBA_SQR_ and the
dnl SQR_KARATSUBA_ spelling appear here, with their lines interleaved.
ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', |
ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE', |
`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') |
`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)') |
|
|
|
|
dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The |
dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The |
dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since |
dnl number required is determined by SQR_KARATSUBA_THRESHOLD, since |
dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD. |
dnl mpn_sqr_basecase only needs to handle sizes < SQR_KARATSUBA_THRESHOLD. |
dnl |
dnl |
dnl The first addmul is the biggest, and this takes the second least |
dnl The first addmul is the biggest, and this takes the second least |
dnl significant limb and multiplies it by the third least significant and |
dnl significant limb and multiplies it by the third least significant and |
dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1 |
dnl up. Hence for a maximum operand size of SQR_KARATSUBA_THRESHOLD-1 |
dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3. |
dnl limbs, UNROLL_COUNT needs to be SQR_KARATSUBA_THRESHOLD-3. |
|
|
dnl Pass each threshold name to m4_config_gmp_mparam (NOTE(review): presumably
dnl this obtains the configured value -- confirm against that macro), then set
dnl UNROLL_COUNT to the threshold minus 3. Both spellings of the threshold
dnl name are processed, so UNROLL_COUNT is given to deflit twice.
m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') |
m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD') |
deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) |
deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3)) |
|
|
|
|
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); |
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); |
dnl Frame entries for the three arguments of mpn_sqr_basecase: PARAM_DST at 4,
dnl PARAM_SRC at 8 and PARAM_SIZE at 12, each declared twice.
dnl NOTE(review): presumably byte offsets from the stack pointer on entry --
dnl confirm against the definition of defframe.
Line 81 defframe(PARAM_SIZE,12) |
|
Line 80 defframe(PARAM_SIZE,12) |
|
defframe(PARAM_SRC, 8) |
defframe(PARAM_SRC, 8) |
defframe(PARAM_DST, 4) |
defframe(PARAM_DST, 4) |
|
|
.text |
TEXT |
ALIGN(32) |
ALIGN(32) |
PROLOGUE(mpn_sqr_basecase) |
PROLOGUE(mpn_sqr_basecase) |
deflit(`FRAME',0) |
deflit(`FRAME',0) |
|
|
C ----------------------------------------------------------------------------- |
C ----------------------------------------------------------------------------- |
ifdef(`PIC',` |
ifdef(`PIC',` |
L(pic_calc): |
L(pic_calc): |
C See README.family about old gas bugs |
C See mpn/x86/README about old gas bugs |
addl (%esp), %ecx |
addl (%esp), %ecx |
addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx |
addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx |
addl %edx, %ecx |
addl %edx, %ecx |