#8 Enable AVX2
Merged 5 months ago by dcantrell. Opened 9 months ago by tpiepho.
rpms/ tpiepho/fftw enable-avx2  into  rawhide

file modified
+57 -48
@@ -13,7 +13,7 @@ 

  

  Name:           fftw

  Version:        3.3.10

- Release:        5%{?dist}

+ Release:        6%{?dist}

  Summary:        A Fast Fourier Transform library

  License:        GPLv2+

  URL:            http://www.fftw.org
@@ -28,6 +28,16 @@ 

  %global quad 1

  %endif

  

+ # Names of precisions to (maybe) build

+ %global prec_names prec_name[0]=single;prec_name[1]=double;prec_name[2]=long;prec_name[3]=quad

+ # Number of precisions to build; sometimes quad is not possible

+ %global nprec 3

+ %if %{quad}

+ %global nprec 4

+ %endif

+ # Number of precisions to build for MPI

+ %global nmpiprec 3

+ 

  # For check phase

  BuildRequires:  time

  BuildRequires:  perl-interpreter
@@ -277,11 +287,7 @@ 

  BASEFLAGS="--enable-shared --disable-dependency-tracking --enable-threads"

  BASEFLAGS+=" --enable-openmp"

  

- # Precisions to build

- prec_name[0]=single

- prec_name[1]=double

- prec_name[2]=long

- prec_name[3]=quad

+ %prec_names

  

  # Corresponding flags

  prec_flags[0]=--enable-single
@@ -292,31 +298,27 @@ 

  %ifarch x86_64

  # Enable SSE2 and AVX support for x86_64

  for ((i=0; i<2; i++)) ; do

-     prec_flags[i]+=" --enable-sse2 --enable-avx"

+     prec_flags[i]+=" --enable-sse2 --enable-avx --enable-avx2"

  done

  %endif

  

- # No NEON run time detection, not all ARM SoCs have NEON

- #%ifarch %{arm}

- ## Compile support for NEON instructions

- #for ((i=0; i<2; i++)) ; do

- #    prec_flags[i]+=" --enable-neon"

- #done

- #%endif

- 

- #%ifarch ppc ppc64

- ## Compile support for Altivec instructions

- #for ((i=0; i<2; i++)) ; do

- #    prec_flags[i]+=" --enable-altivec"

- #done

- #%endif

+ %ifarch %{arm64}

+ # Compile support for NEON instructions

+ for ((i=0; i<2; i++)) ; do

+     prec_flags[i]+=" --enable-neon"

+ done

+ BASEFLAGS+=" --enable-armv8-cntvct-el0"

+ %endif

  

- # Loop over precisions

- %if %{quad}

- for ((iprec=0; iprec<4; iprec++)) ; do

- %else

- for ((iprec=0; iprec<3; iprec++)) ; do

+ %ifarch ppc ppc64

+ # Compile support for Altivec instructions; only supported for single precision

+ for ((i=0; i<1; i++)) ; do

+     prec_flags[i]+=" --enable-altivec"

+ done

  %endif

+ 

+ # Loop over precisions

+ for ((iprec=0; iprec<%{nprec}; iprec++)) ; do

      mkdir ${prec_name[iprec]}${ver_name[iver]}

      cd ${prec_name[iprec]}${ver_name[iver]}

      ln -s ../configure .
@@ -328,16 +330,19 @@ 

  done

  

  # MPI Builds - this duplicates the non-mpi builds, but oh well

- for mpi in %{mpi_list} ; do

+ for mpi in %{?mpi_list} ; do

      module load mpi/${mpi}-%{_arch}

      # Loop over precisions - no quad precision support with MPI

-     for((iprec=0;iprec<3;iprec++)) ; do

+     for((iprec=0;iprec<%{nmpiprec};iprec++)) ; do

          mkdir ${mpi}-${prec_name[iprec]}${ver_name[iver]}

          cd ${mpi}-${prec_name[iprec]}${ver_name[iver]}

          ln -s ../configure .

          # Force linking the _mpi.so libraries with the mpi libs.  This works because

          # we get rid of all of the non-mpi components of these builds

          export CC=mpicc

+         if [ $mpi = "openmpi" ]; then

+             export MPIRUN="mpirun --oversubscribe"

+         fi

          %{configure} ${BASEFLAGS} ${prec_flags[iprec]} \

              --enable-mpi \

              --libdir=%{_libdir}/$mpi/lib \
@@ -352,23 +357,21 @@ 

  done

  

  %install

+ %prec_names

+ 

  # Explicitly load shell support for the environment-modules package, used

  # below via 'module' pseudo-command.

  source /etc/profile.d/modules.sh

  

- %if %{quad}

- for ver in single double long quad ; do

- %else

- for ver in single double long ; do

- %endif

-     %make_install -C $ver

+ for((iprec=0;iprec<%{nprec};iprec++)) ; do

+     %make_install -C ${prec_name[iprec]}

  done

  

  # MPI

- for mpi in %{mpi_list} ; do

+ for mpi in %{?mpi_list} ; do

      module load mpi/${mpi}-%{_arch}

-     for ver in single double long ; do

-         %make_install -C ${mpi}-${ver}

+     for((iprec=0;iprec<%{nmpiprec};iprec++)) ; do

+         %make_install -C ${mpi}-${prec_name[iprec]}

          # Remove duplicated non-mpi libraries, binaries, and data

          find %{buildroot}%{_libdir}/${mpi}/lib -name libfftw\* -a \! -name \*_mpi.\* -delete

          rm -r %{buildroot}%{_libdir}/${mpi}/{bin,share}
@@ -380,28 +383,25 @@ 

  find %{buildroot} -name \*.la -delete

  

  %check

+ %prec_names

  # Explicitly load shell support for the environment-modules package, used

  # below via 'module' pseudo-command.

  . /etc/profile.d/modules.sh

  

  bdir=$(pwd)

- %if %{quad}

- for ver in single double long quad ; do

- %else

- for ver in single double long ; do

- %endif

-     export LD_LIBRARY_PATH=$bdir/$ver/.libs:$bdir/$ver/threads/.libs

-     %make_build -C $ver check

+ for((iprec=0;iprec<%{nprec};iprec++)) ; do

+     export LD_LIBRARY_PATH=$bdir/${prec_name[iprec]}/.libs:$bdir/${prec_name[iprec]}/threads/.libs

+     %make_build -C ${prec_name[iprec]} check

  done

  

  # MPI

  # Allow oversubscription with openmpi

  export OMPI_MCA_rmaps_base_oversubscribe=1

- for mpi in %{mpi_list} ; do

+ for mpi in %{?mpi_list} ; do

      module load mpi/${mpi}-%{_arch}

-     for ver in single double long ; do

-         export LD_LIBRARY_PATH=$bdir/$ver/.libs:$bdir/$ver/threads/.libs

-         %make_build -C ${mpi}-${ver}/mpi check

+     for((iprec=0;iprec<%{nmpiprec};iprec++)) ; do

+         export LD_LIBRARY_PATH=$bdir/${prec_name[iprec]}/.libs:$bdir/${prec_name[iprec]}/threads/.libs

+         %make_build -C ${mpi}-${prec_name[iprec]}/mpi check

      done

      module unload mpi/${mpi}-%{_arch}

  done
@@ -525,6 +525,15 @@ 

  %endif

  

  %changelog

+ * Mon Mar 27 2023 Trent Piepho <tpiepho@gmail.com> - 3.3.10-6

+ - Enable AVX2 on x86-64

+ - Enable NEON on aarch64

+ - Clean up precision list

+ - Fix for OpenMPI build with < 4 processors

+ - Fix building with no enabled MPI types

+ - Enable single precision Altivec on PPC

+ - Enable CNTVCT_EL0 support on ARMv8

+ 

  * Thu Mar 02 2023 Orion Poplawski <orion@nwra.com> - 3.3.10-5

  - Use make macros

  - Drop openmpi vader workaround

FFTW does runtime detection of CPU features on x86, so this change does
not require AVX2 support to use the libraries, but does allow FFTW to
use the AVX2 code when the CPU supports it.

rebased onto 036b87c

9 months ago

OK, according to https://www.fftw.org/fftw3_doc/Installation-on-Unix.html

--enable-sse (single precision), --enable-sse2 (single, double), --enable-avx (single, double), --enable-avx2 (single, double), --enable-avx512 (single, double), --enable-avx-128-fma, --enable-kcvi (single), --enable-altivec (single), --enable-vsx (single, double), --enable-neon (single, double on aarch64), --enable-generic-simd128, and --enable-generic-simd256:

Enable various SIMD instruction sets. You need a compiler that supports the given SIMD extensions, but FFTW will try to detect at runtime whether the CPU supports these extensions. That is, you can compile with --enable-avx and the code will still run on a CPU without AVX support.

Please enable also the other options for the other architectures

Please enable also the other options for the other architectures

Support for altivec was intentionally turned off in d2d0f9c, apparently since it works in single precision only? I don't know why it couldn't be turned on just for the single precision build, since each size is a separate build of fftw.

NEON for arm was turned off in 310c967. Apparently it lacks runtime detection, or at least did then.

I don't think I have an ARM system that runs Linux without NEON to test with. And I don't have anything with altivec at all.

NEON on arm was supposed to be auto-detected on Linux since 2011.

Don't know why it was disabled on Fedora in 2014.

5 new commits added

  • Enable Altivec for single precision
  • Fix building with no MPI types enabled
  • Allow openmpi tests on systems with less than 4 processors
  • Reduce precision list duplication
  • Enable NEON on aarch64
9 months ago

Metadata Update from @dcantrell:
- Request assigned

9 months ago

I do not know the history of why things were turned off in 2014, but generally I see software like this disable CPU-specifics when there are known limitations in the toolchain. So it may have just been that AVX2 in fftw didn't compile in 2014 and the maintainer disabled it. This is entirely a guess. I know I've done that before in other packages.

I see the Copr build succeeds, but the Fedora CI scratch build is failing on i686. A segfault when mpirun is used in the test suite:

perl -w ../../tests/check.pl --verbose --random --maxsize=10000 -c=10  --mpi "mpirun --oversubscribe -np 2 `pwd`/mpi-bench"
Executing "mpirun --oversubscribe -np 2 /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/mpi-bench --verbose=1   --verify 'obcd1152' --verify 'ibcd1152' --verify 'ofcd1152' --verify 'ifcd1152' --verify 'ok[4e10x8e01x9o11' --verify 'ik[4e10x8e01x9o11' --verify 'ofrd]8x9x9x10' --verify 'ifrd]8x9x9x10' --verify 'obcd]8x9x9x10' --verify 'ibcd]8x9x9x10' --verify 'ofcd]8x9x9x10' --verify 'ifcd]8x9x9x10' --verify 'ok[12hx5e10x8o11v8' --verify 'ik[12hx5e10x8o11v8' --verify 'obr5x2x6x9' --verify 'ibr5x2x6x9' --verify 'ofr5x2x6x9' --verify 'ifr5x2x6x9' --verify 'obc5x2x6x9' --verify 'ibc5x2x6x9' --verify 'ofc5x2x6x9' --verify 'ifc5x2x6x9' --verify 'okd[6e11x2o00x8o11x8b' --verify 'ikd[6e11x2o00x8o11x8b' --verify 'ofr]10x2v2' --verify 'ifr]10x2v2' --verify 'obc]10x2v2' --verify 'ibc]10x2v2' --verify 'ofc]10x2v2' --verify 'ifc]10x2v2' --verify 'ok[20o00x63b' --verify 'ik[20o00x63b' --verify 'okd[7o01x8o01x9bx8b' --verify 'ikd[7o01x8o01x9bx8b' --verify 'obr11x11v21' --verify 'ibr11x11v21' --verify 'ofr11x11v21' --verify 'ifr11x11v21' --verify 'obc11x11v21' --verify 'ibc11x11v21' --verify 'ofc11x11v21' --verify 'ifc11x11v21' --verify 'okd6e01x13e00x4e01v1' --verify 'ikd6e01x13e00x4e01v1'"
[67be17e071c84b9b9b93aae808f19bf5:306533] *** Process received signal ***
[67be17e071c84b9b9b93aae808f19bf5:306533] Signal: Segmentation fault (11)
[67be17e071c84b9b9b93aae808f19bf5:306533] Signal code: Address not mapped (1)
[67be17e071c84b9b9b93aae808f19bf5:306533] Failing at address: 0x6c
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 0] linux-gate.so.1(__kernel_rt_sigreturn+0x0)[0xf7f3f580]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 1] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(+0x129f5)[0xf63799f5]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 2] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_isend+0x2f7)[0xf6377b77]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 3] /usr/lib/openmpi/lib/openmpi/mca_coll_libnbc.so(+0x7e2c)[0xf57e7e2c]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 4] /usr/lib/openmpi/lib/openmpi/mca_coll_libnbc.so(NBC_Start+0x37)[0xf57e8767]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 5] /usr/lib/openmpi/lib/openmpi/mca_coll_libnbc.so(ompi_coll_libnbc_iallreduce+0x4d)[0xf57ebdfd]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 6] /usr/lib/openmpi/lib/libmpi.so.40(+0x212c6)[0xf7c202c6]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 7] /usr/lib/openmpi/lib/libmpi.so.40(+0x26730)[0xf7c25730]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 8] /usr/lib/openmpi/lib/libmpi.so.40(+0x22f5b)[0xf7c21f5b]
[67be17e071c84b9b9b93aae808f19bf5:306533] [ 9] /usr/lib/openmpi/lib/libopen-pal.so.40(opal_progress+0x30)[0xf78ae6d0]
[67be17e071c84b9b9b93aae808f19bf5:306533] [10] /usr/lib/openmpi/lib/libopen-pal.so.40(ompi_sync_wait_mt+0x115)[0xf78bf455]
[67be17e071c84b9b9b93aae808f19bf5:306533] [11] /usr/lib/openmpi/lib/libmpi.so.40(ompi_comm_nextcid+0x1b0)[0xf7c26f40]
[67be17e071c84b9b9b93aae808f19bf5:306533] [12] /usr/lib/openmpi/lib/libmpi.so.40(ompi_comm_dup_with_info+0xd9)[0xf7c294b9]
[67be17e071c84b9b9b93aae808f19bf5:306533] [13] /usr/lib/openmpi/lib/libmpi.so.40(ompi_comm_dup+0x22)[0xf7c295e2]
[67be17e071c84b9b9b93aae808f19bf5:306533] [14] /usr/lib/openmpi/lib/libmpi.so.40(MPI_Comm_dup+0x67)[0xf7c5ec47]
[67be17e071c84b9b9b93aae808f19bf5:306533] [15] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/libfftw3f_mpi.so.3(fftwf_mpi_mkproblem_dft+0xd3)[0xf7f2bce3]
[67be17e071c84b9b9b93aae808f19bf5:306533] [16] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/libfftw3f_mpi.so.3(fftwf_mpi_mkproblem_dft_d+0x37)[0xf7f2bd47]
[67be17e071c84b9b9b93aae808f19bf5:306533] [17] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/libfftw3f_mpi.so.3(fftwf_mpi_plan_guru_dft+0x105)[0xf7f2be65]
[67be17e071c84b9b9b93aae808f19bf5:306533] [18] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/libfftw3f_mpi.so.3(fftwf_mpi_plan_many_dft+0xa2)[0xf7f2bf42]
[67be17e071c84b9b9b93aae808f19bf5:306533] [19] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/mpi-bench(+0xb5fe)[0x566335fe]
[67be17e071c84b9b9b93aae808f19bf5:306533] [20] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/mpi-bench(+0xc46c)[0x5663446c]
[67be17e071c84b9b9b93aae808f19bf5:306533] [21] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/mpi-bench(+0x30ef)[0x5662b0ef]
[67be17e071c84b9b9b93aae808f19bf5:306533] [22] /lib/libc.so.6(+0x23899)[0xf7a34899]
[67be17e071c84b9b9b93aae808f19bf5:306533] [23] /lib/libc.so.6(__libc_start_main+0x8c)[0xf7a3495c]
[67be17e071c84b9b9b93aae808f19bf5:306533] [24] /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/.libs/mpi-bench(+0x3feb)[0x5662bfeb]
[67be17e071c84b9b9b93aae808f19bf5:306533] *** End of error message ***
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 0 on node 67be17e071c84b9b9b93aae808f19bf5 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
FAILED mpirun --oversubscribe -np 2 /builddir/build/BUILD/fftw-3.3.10/openmpi-single/mpi/mpi-bench:  --verify 'obcd1152' --verify 'ibcd1152' --verify 'ofcd1152' --verify 'ifcd1152' --verify 'ok[4e10x8e01x9o11' --verify 'ik[4e10x8e01x9o11' --verify 'ofrd]8x9x9x10' --verify 'ifrd]8x9x9x10' --verify 'obcd]8x9x9x10' --verify 'ibcd]8x9x9x10' --verify 'ofcd]8x9x9x10' --verify 'ifcd]8x9x9x10' --verify 'ok[12hx5e10x8o11v8' --verify 'ik[12hx5e10x8o11v8' --verify 'obr5x2x6x9' --verify 'ibr5x2x6x9' --verify 'ofr5x2x6x9' --verify 'ifr5x2x6x9' --verify 'obc5x2x6x9' --verify 'ibc5x2x6x9' --verify 'ofc5x2x6x9' --verify 'ifc5x2x6x9' --verify 'okd[6e11x2o00x8o11x8b' --verify 'ikd[6e11x2o00x8o11x8b' --verify 'ofr]10x2v2' --verify 'ifr]10x2v2' --verify 'obc]10x2v2' --verify 'ibc]10x2v2' --verify 'ofc]10x2v2' --verify 'ifc]10x2v2' --verify 'ok[20o00x63b' --verify 'ik[20o00x63b' --verify 'okd[7o01x8o01x9bx8b' --verify 'ikd[7o01x8o01x9bx8b' --verify 'obr11x11v21' --verify 'ibr11x11v21' --verify 'ofr11x11v21' --verify 'ifr11x11v21' --verify 'obc11x11v21' --verify 'ibc11x11v21' --verify 'ofc11x11v21' --verify 'ifc11x11v21' --verify 'okd6e01x13e00x4e01v1' --verify 'ikd6e01x13e00x4e01v1'

Copr doesn't build i686, but Fedora does so this will need to work there too.

Copr doesn't build i686, but Fedora does so this will need to work there too.

The first build done, when only x86_64 was modified, failed on i686 in the same way. The i686 build is exactly the same as before. This change shouldn't have anything to do with this failure. Only x86_64, arm64, and ppc have been modified.

Is there a way to see a build of the current fftw against f38 on i686? I can't find one.

1 new commit added

  • Enable cycle counter on arm64
9 months ago

@dcantrell The build failure on i686 isn't related to this PR. It's something else that's changed in rawhide.

openmpi could be disabled on i686? Do you have any other suggestions?

Without a 32-bit system, I don't think I can debug the unrelated i686 problem.

Let me try and see what I can determine.

Any news on this?

Can this be merged and a ticket created for the i686 problem? It's not related to the PR. A fix for it would be out of scope for the PR anyway.

You completely erased my authorship! Signed-off-by, changelog, commit, everything. That is not cool.

I did not mean to do that deliberately. Let me fix that up.

Alternatively I will revert what I did and you can rebase your PR so it can apply. The spec file now uses the make macros, so that at least was preventing it from applying. Let me know what you want to do.

OK, I reverted my rollup commit. Rebase your PR against rawhide so it applies. Release number increments are in conflict, and changelog dates need to be updated to be in order. Once it applies, I can merge it.

rebased onto eb1985b

5 months ago

Thanks. I've rebased and updated the release number and changelog dates. Looks like everything passes now.

I had hoped to get this into F38.

I will merge it here and then apply it to F38 and do an update for that release.

Pull-Request has been merged by dcantrell

5 months ago
Metadata