Mock Version: 5.5 Mock Version: 5.5 Mock Version: 5.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-383670-71809/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1718755200 Wrote: /builddir/build/SRPMS/libnccl-2.20.5-1.cuda12.1.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-383670-71809/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1718755200 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.UGtIeY + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf nccl-2.20.5-1 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/nccl-2.20.5-1.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd nccl-2.20.5-1 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + /usr/lib/rpm/rpmuncompress /builddir/build/SOURCES/1000-fix-lib-path-in-nccl.pc.patch + /usr/bin/patch -p1 -s --fuzz=0 --no-backup-if-mismatch -f + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.KzBklW + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.20.5-1 + export LD_LIBRARY_PATH=/usr/local/cuda-12-1/lib64 + LD_LIBRARY_PATH=/usr/local/cuda-12-1/lib64 + export 'CFLAGS=usr/local/cuda-12-1/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + CFLAGS='usr/local/cuda-12-1/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export PREFIX=/usr + PREFIX=/usr + /usr/bin/make -O -j80 V=1 VERBOSE=1 /usr/bin/make -C src build BUILDDIR=/builddir/build/BUILD/nccl-2.20.5-1/build make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' mkdir -p /builddir/build/BUILD/nccl-2.20.5-1/build/include Generating nccl.h.in > /builddir/build/BUILD/nccl-2.20.5-1/build/include/nccl.h sed -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/20/g" \ -e "s/\${nccl:Patch}/5/g" \ -e "s/\${nccl:Suffix}//g" \ -e "s/\${nccl:Version}/22005/g" \ nccl.h.in > /builddir/build/BUILD/nccl-2.20.5-1/build/include/nccl.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Grabbing include/nccl_net.h > /builddir/build/BUILD/nccl-2.20.5-1/build/include/nccl_net.h mkdir -p /builddir/build/BUILD/nccl-2.20.5-1/build/include install -m 644 include/nccl_net.h /builddir/build/BUILD/nccl-2.20.5-1/build/include/nccl_net.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' mkdir -p /builddir/build/BUILD/nccl-2.20.5-1/build/lib/pkgconfig Generating nccl.pc.in > /builddir/build/BUILD/nccl-2.20.5-1/build/lib/pkgconfig/nccl.pc sed -e 's|${nccl:Prefix}|\/usr|g' \ -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/20/g" \ -e "s/\${nccl:Patch}/5/g" \ nccl.pc.in > /builddir/build/BUILD/nccl-2.20.5-1/build/lib/pkgconfig/nccl.pc make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' /usr/bin/make -C ./device make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling enhcompat.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enhcompat.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enhcompat.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enhcompat.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enhcompat.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' (which python3 >/dev/null || \ (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \ printf "\n${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/gensrc "" make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/trees.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/trees.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/trees.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/trees.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/trees.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/tuner.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/tuner.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/tuner.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/tuner.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/tuner.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/param.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/param.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/param.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/param.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/param.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/cudawrap.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/cudawrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/cudawrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/cudawrap.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/cudawrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' In file included from /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/gensrc/host_table.cc:1: ../include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': ../include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ ../include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': ../include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/ipcsocket.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ipcsocket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ipcsocket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ipcsocket.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ipcsocket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/strongstream.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/strongstream.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/strongstream.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/strongstream.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/strongstream.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/nvmlwrap.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/nvmlwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/nvmlwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/nvmlwrap.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/nvmlwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/socket.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/socket.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/reduce_scatter.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/all_reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies build/obj/device/gensrc/reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Dependencies src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling init_nvtx.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init_nvtx.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init_nvtx.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init_nvtx.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init_nvtx.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from init_nvtx.cc:2: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] 10 | }; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/profiler.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/profiler.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/profiler.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/profiler.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/profiler.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/proxy.h:10, from include/profiler.h:10, from misc/profiler.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/proxy.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ misc/profiler.cc: In function 'ncclResult_t ncclProfilingRecord(ncclProxyArgs*, int, int, int)': misc/profiler.cc:113:56: warning: unused parameter 'args' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~~~~~~~~~~~~~~~~~~~^~~~ misc/profiler.cc:113:66: warning: unused parameter 'sub' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~ misc/profiler.cc:113:75: warning: unused parameter 'step' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~ misc/profiler.cc:113:85: warning: unused parameter 'state' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/rings.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/rings.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/rings.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/rings.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/rings.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/rings.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/rings.cc: In function 'ncclResult_t ncclBuildRings(int, int*, int, int, int*, int*)': graph/rings.cc:22:80: warning: unused parameter 'prev' [-Wunused-parameter] 22 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { | ~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/argcheck.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/argcheck.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/argcheck.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/argcheck.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/argcheck.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from include/argcheck.h:10, from misc/argcheck.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/ibvsymbols.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvsymbols.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvsymbols.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvsymbols.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvsymbols.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from misc/ibvsymbols.cc:67: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling debug.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/debug.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/debug.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c debug.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/debug.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from debug.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/shmutils.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/shmutils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/shmutils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/shmutils.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/shmutils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from misc/shmutils.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/ibvwrap.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvwrap.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from include/ibvwrap.h:21, from misc/ibvwrap.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/utils.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/utils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/utils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/utils.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/utils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from misc/utils.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling misc/gdrwrap.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/gdrwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/gdrwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/gdrwrap.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/gdrwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from misc/gdrwrap.cc:10: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling register.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/register.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/register.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c register.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/register.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from include/argcheck.h:10, from register.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': register.cc:49:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclReg; size_t = long unsigned int]': register.cc:118:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling group.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/group.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/group.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c group.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/group.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/group.h:11, from group.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPreconnectJob; size_t = long unsigned int]': group.cc:280:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling net.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c net.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ net.cc: In function 'ncclResult_t ncclNetCheckDeviceVersion(ncclComm*, ncclNet_t*, int)': net.cc:436:57: warning: unused parameter 'comm' [-Wunused-parameter] 436 | ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { | ~~~~~~~~~~~~~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling channel.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/channel.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/channel.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c channel.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/channel.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from channel.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclChannelPeer; size_t = long unsigned int]': channel.cc:29:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclDevChannelPeer*; size_t = long unsigned int]': channel.cc:45:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/net_socket.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_socket.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net_socket.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketInit(ncclDebugLogger_t)': transport/net_socket.cc:38:50: warning: unused parameter 'logFunction' [-Wunused-parameter] 38 | ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketRegMr(void*, void*, size_t, int, void**)': transport/net_socket.cc:538:39: warning: unused parameter 'comm' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:538:51: warning: unused parameter 'data' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:538:64: warning: unused parameter 'size' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~~^~~~ transport/net_socket.cc:538:87: warning: unused parameter 'mhandle' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketDeregMr(void*, void*)': transport/net_socket.cc:541:41: warning: unused parameter 'comm' [-Wunused-parameter] 541 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~ transport/net_socket.cc:541:53: warning: unused parameter 'mhandle' [-Wunused-parameter] 541 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIsend(void*, void*, int, int, void*, void**)': transport/net_socket.cc:543:75: warning: unused parameter 'tag' [-Wunused-parameter] 543 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~^~~ transport/net_socket.cc:543:86: warning: unused parameter 'mhandle' [-Wunused-parameter] 543 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIrecv(void*, int, void**, int*, int*, void**, void**)': transport/net_socket.cc:549:86: warning: unused parameter 'tags' [-Wunused-parameter] 549 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~^~~~ transport/net_socket.cc:549:99: warning: unused parameter 'mhandles' [-Wunused-parameter] 549 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIflush(void*, int, void**, int*, void**, void**)': transport/net_socket.cc:556:40: warning: unused parameter 'recvComm' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~^~~~~~~~ transport/net_socket.cc:556:54: warning: unused parameter 'n' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~^ transport/net_socket.cc:556:64: warning: unused parameter 'data' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~ transport/net_socket.cc:556:75: warning: unused parameter 'sizes' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~^~~~~ transport/net_socket.cc:556:89: warning: unused parameter 'mhandles' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc:556:106: warning: unused parameter 'request' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketListenComm; size_t = long unsigned int]': transport/net_socket.cc:294:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketComm; size_t = long unsigned int]': transport/net_socket.cc:323:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': transport/net_socket.cc:373:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketTask; size_t = long unsigned int]': transport/net_socket.cc:435:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect*; size_t = long unsigned int]': transport.cc:82:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect; size_t = long unsigned int]': transport.cc:106:31: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTransportCollNetSetup(ncclComm*, ncclTopoGraph*, ncclChannel*, int, int, int, int)::; size_t = long unsigned int]': transport.cc:318:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/shm.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/shm.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/shm.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/shm.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/shm.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/shm.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/shm.cc: In function 'ncclResult_t shmCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/shm.cc:50:96: warning: unused parameter 'graph' [-Wunused-parameter] 50 | static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc: In function 'ncclResult_t shmSendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:76:79: warning: unused parameter 'graph' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:76:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:99:79: warning: unused parameter 'graph' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:99:107: warning: unused parameter 'myInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~ transport/shm.cc:99:136: warning: unused parameter 'peerInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/shm.cc:99:211: warning: unused parameter 'channelId' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc:99:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 161 | struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; | ^ transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:135:96: warning: unused parameter 'nranks' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/shm.cc:135:108: warning: unused parameter 'rank' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 195 | struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; | ^ transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:174:96: warning: unused parameter 'nranks' [-Wunused-parameter] 174 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/shm.cc:174:108: warning: unused parameter 'rank' [-Wunused-parameter] 174 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:227:179: warning: unused parameter 'done' [-Wunused-parameter] 227 | static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:245:179: warning: unused parameter 'done' [-Wunused-parameter] 245 | static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:263:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 263 | static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:278:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 278 | static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmSendResources; size_t = long unsigned int]': transport/shm.cc:78:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmRecvResources; size_t = long unsigned int]': transport/shm.cc:101:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmProxyInfo; size_t = long unsigned int]': transport/shm.cc:229:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling collectives.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/collectives.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/collectives.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/collectives.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from include/argcheck.h:10, from collectives.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ collectives.cc: In function 'ncclResult_t ncclAllGather(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t)': collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 19 | }; | ^ collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 25 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; | ^ collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclAllReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 42 | }; | ^ collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 48 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; | ^ collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 63 | }; | ^ collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 69 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; | ^ collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, cudaStream_t)': collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 94 | }; | ^ collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 100 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; | ^ collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 116 | }; | ^ collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 122 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; | ^ collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: At global scope: collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 133 | }; | ^ collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 144 | 1, 1 }; | ^ collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 161 | 1, 1 }; | ^ collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/p2p.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/p2p.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/p2p.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/p2p.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/p2p.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/p2p.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/p2p.cc: In function 'ncclResult_t p2pCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/p2p.cc:103:89: warning: unused parameter 'graph' [-Wunused-parameter] 103 | ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/p2p.cc: In function 'ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc*)': transport/p2p.cc:228:54: warning: unused parameter 'ipcDesc' [-Wunused-parameter] 228 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { | ~~~~~~~~~~~~~^~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pMap(ncclComm*, ncclProxyConnector*, ncclPeerInfo*, ncclPeerInfo*, ncclP2pBuff*, void**, void**)': transport/p2p.cc:297:78: warning: unused parameter 'proxyConn' [-Wunused-parameter] 297 | static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/p2p.cc:402:71: warning: unused parameter 'channelId' [-Wunused-parameter] 402 | struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:452:96: warning: unused parameter 'nranks' [-Wunused-parameter] 452 | static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:490:89: warning: unused parameter 'nranks' [-Wunused-parameter] 490 | ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:609:102: warning: unused parameter 'proxyState' [-Wunused-parameter] 609 | static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:629:104: warning: unused parameter 'proxyState' [-Wunused-parameter] 629 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc:629:150: warning: unused parameter 'respBuff' [-Wunused-parameter] 629 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/p2p.cc:629:164: warning: unused parameter 'respSize' [-Wunused-parameter] 629 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/p2p.cc:629:179: warning: unused parameter 'done' [-Wunused-parameter] 629 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:643:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 643 | static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:675:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 675 | static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pResources; size_t = long unsigned int]': transport/p2p.cc:338:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pShmProxyInfo; size_t = long unsigned int]': transport/p2p.cc:571:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pCuMemProxyInfo; size_t = long unsigned int]': transport/p2p.cc:598:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/connect.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/connect.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/connect.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/connect.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/connect.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from graph/connect.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/connect.cc: In function 'ncclResult_t connectTrees(ncclComm*, int*, int*, int*, int*)': graph/connect.cc:133:119: warning: unused parameter 'treePatterns' [-Wunused-parameter] 133 | static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { | ~~~~~^~~~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/connect.cc:177:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling bootstrap.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/bootstrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/bootstrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c bootstrap.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/bootstrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from bootstrap.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/bootstrap.h:11, from bootstrap.cc:10: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ bootstrap.cc: In function 'ncclResult_t bootstrapCreateRoot(ncclBootstrapHandle*, bool)': bootstrap.cc:173:75: warning: unused parameter 'idFromEnv' [-Wunused-parameter] 173 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) { | ~~~~~^~~~~~~~~ bootstrap.cc: In function 'ncclResult_t bootstrapInit(ncclBootstrapHandle*, ncclComm*)': bootstrap.cc:240:29: warning: missing initializer for member 'extInfo::nranks' [-Wmissing-field-initializers] 240 | struct extInfo info = { 0 }; | ^ bootstrap.cc:240:29: warning: missing initializer for member 'extInfo::extAddressListenRoot' [-Wmissing-field-initializers] bootstrap.cc:240:29: warning: missing initializer for member 'extInfo::extAddressListen' [-Wmissing-field-initializers] In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocketAddress; size_t = long unsigned int]': bootstrap.cc:111:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': bootstrap.cc:178:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapRootArgs; size_t = long unsigned int]': bootstrap.cc:183:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapState; size_t = long unsigned int]': bootstrap.cc:242:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': bootstrap.cc:299:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unexConn; size_t = long unsigned int]': bootstrap.cc:495:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/nvls.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/nvls.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/nvls.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/nvls.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/nvls.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/nvls.cc:9: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/nvls.cc: In function 'ncclResult_t nvlsCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/nvls.cc:28:62: warning: unused parameter 'topo' [-Wunused-parameter] 28 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:28:90: warning: unused parameter 'graph' [-Wunused-parameter] 28 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:28:118: warning: unused parameter 'info1' [-Wunused-parameter] 28 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:28:146: warning: unused parameter 'info2' [-Wunused-parameter] 28 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc: In function 'ncclResult_t nvlsSendFree(ncclConnector*)': transport/nvls.cc:34:49: warning: unused parameter 'send' [-Wunused-parameter] 34 | ncclResult_t nvlsSendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsRecvFree(ncclConnector*)': transport/nvls.cc:38:49: warning: unused parameter 'recv' [-Wunused-parameter] 38 | ncclResult_t nvlsRecvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGetProperties(ncclComm*, ncclNvlsSharedRes*, int, int, size_t)': transport/nvls.cc:49:49: warning: unused parameter 'comm' [-Wunused-parameter] 49 | ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupCreate(ncclComm*, CUmulticastObjectProp*, int, unsigned int, CUmemGenericAllocationHandle*, char*)': transport/nvls.cc:72:47: warning: unused parameter 'comm' [-Wunused-parameter] 72 | ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupAddDevice(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:93:50: warning: unused parameter 'comm' [-Wunused-parameter] 93 | ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupDisconnect(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:125:51: warning: unused parameter 'comm' [-Wunused-parameter] 125 | ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:125:83: warning: unused parameter 'resources' [-Wunused-parameter] 125 | ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupBindMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:129:48: warning: unused parameter 'comm' [-Wunused-parameter] 129 | ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupMapMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:184:47: warning: unused parameter 'comm' [-Wunused-parameter] 184 | ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupUnmapMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:202:49: warning: unused parameter 'comm' [-Wunused-parameter] 202 | ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNvlsSharedRes; size_t = long unsigned int]': transport/nvls.cc:294:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = localRegData; size_t = long unsigned int]': transport/nvls.cc:434:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bool; size_t = long unsigned int]': transport/nvls.cc:646:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = graphRegData; size_t = long unsigned int]': transport/nvls.cc:647:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/paths.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/paths.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/paths.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/paths.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/paths.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/paths.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/paths.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/paths.cc: In function 'ncclResult_t ncclTopoCheckMNNVL(ncclTopoSystem*, ncclPeerInfo*, ncclPeerInfo*, int*)': graph/paths.cc:345:56: warning: unused parameter 'system' [-Wunused-parameter] 345 | ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoLinkList; size_t = long unsigned int]': graph/paths.cc:36:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/paths.cc:639:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/paths.cc:640:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/coll_net.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/coll_net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/coll_net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/coll_net.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/coll_net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/coll_net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/coll_net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/coll_net.cc:134:65: warning: unused parameter 'topo' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc:134:93: warning: unused parameter 'graph' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:121: warning: unused parameter 'info1' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:149: warning: unused parameter 'info2' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 151 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:150:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc:150:163: warning: unused parameter 'connectInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 171 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:170:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 170 | static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t sendFree(ncclConnector*)': transport/coll_net.cc:293:52: warning: unused parameter 'send' [-Wunused-parameter] 293 | static ncclResult_t sendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvFree(ncclConnector*)': transport/coll_net.cc:297:52: warning: unused parameter 'recv' [-Wunused-parameter] 297 | static ncclResult_t recvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:301:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 301 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/coll_net.cc:301:159: warning: unused parameter 'respSize' [-Wunused-parameter] 301 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/coll_net.cc:301:174: warning: unused parameter 'done' [-Wunused-parameter] 301 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:411:174: warning: unused parameter 'done' [-Wunused-parameter] 411 | static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:436:176: warning: unused parameter 'done' [-Wunused-parameter] 436 | static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:510:176: warning: unused parameter 'done' [-Wunused-parameter] 510 | static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'constexpr int calcStepsPerGroup(int)': transport/coll_net.cc:666:44: warning: unused parameter 'nGroups' [-Wunused-parameter] 666 | static constexpr int calcStepsPerGroup(int nGroups) { | ~~~~^~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendResources; size_t = long unsigned int]': transport/coll_net.cc:306:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sharedResources; size_t = long unsigned int]': transport/coll_net.cc:329:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char (*)[128]; size_t = long unsigned int]': transport/coll_net.cc:342:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvResources; size_t = long unsigned int]': transport/coll_net.cc:416:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/coll_net.cc:469:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling proxy.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/proxy.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/proxy.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c proxy.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/proxy.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from proxy.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ proxy.cc: In function 'void ncclDumpProxyState(int)': proxy.cc:802:29: warning: unused parameter 'signal' [-Wunused-parameter] 802 | void ncclDumpProxyState(int signal) { | ~~~~^~~~~~ proxy.cc: In function 'ncclResult_t ncclProxyConnect(ncclComm*, int, int, int, ncclProxyConnector*)': proxy.cc:1054:35: warning: missing initializer for member 'ncclProxyInitReq::send' [-Wmissing-field-initializers] 1054 | struct ncclProxyInitReq req = {0}; | ^ proxy.cc:1054:35: warning: missing initializer for member 'ncclProxyInitReq::tpLocalRank' [-Wmissing-field-initializers] proxy.cc:1054:35: warning: missing initializer for member 'ncclProxyInitReq::tpRank' [-Wmissing-field-initializers] proxy.cc:1054:35: warning: missing initializer for member 'ncclProxyInitReq::sameProcess' [-Wmissing-field-initializers] proxy.cc:1061:37: warning: missing initializer for member 'ncclProxyInitResp::devShmPath' [-Wmissing-field-initializers] 1061 | struct ncclProxyInitResp resp = {0}; | ^ proxy.cc: In function 'ncclResult_t ncclProxyCallBlockingUDS(ncclComm*, int, int, void*, int, void*, int, int*)': proxy.cc:1085:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1085 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1085:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t ncclPollProxyResponse(ncclComm*, ncclProxyConnector*, void*, void*)': proxy.cc:1182:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::res' [-Wmissing-field-initializers] 1182 | ncclProxyRpcResponseHeader resp = {0}; | ^ proxy.cc:1182:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::respSize' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyGetFd(ncclProxyState*, int, void*, uint64_t)': proxy.cc:1329:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1329 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1329:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyUDSRecvReq(ncclProxyState*, int)': proxy.cc:1600:76: warning: unused parameter 'reqFd' [-Wunused-parameter] 1600 | static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) { | ~~~~^~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclExpectedProxyResponse; size_t = long unsigned int]': proxy.cc:84:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPool; size_t = long unsigned int]': proxy.cc:198:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyConnection; size_t = long unsigned int]': proxy.cc:960:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': proxy.cc:1038:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyOps; size_t = long unsigned int]': proxy.cc:1039:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': proxy.cc:1040:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyAsyncOp; size_t = long unsigned int]': proxy.cc:1405:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': proxy.cc:1413:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyState; size_t = long unsigned int]': proxy.cc:1657:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/topo.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/topo.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/topo.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/topo.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/topo.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/topo.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/topo.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/topo.cc: In function 'ncclResult_t pciPathToInt64(char*, int, int, int64_t*)': graph/topo.cc:31:57: warning: unused parameter 'minOffset' [-Wunused-parameter] 31 | ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { | ~~~~^~~~~~~~~ graph/topo.cc: In function 'ncclResult_t ncclTopoAddGpu(ncclXmlNode*, ncclTopoSystem*, ncclTopoNode*)': graph/topo.cc:371:80: warning: unused parameter 'system' [-Wunused-parameter] 371 | ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/topo.cc:196:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoSystem; size_t = long unsigned int]': graph/topo.cc:582:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/topo.cc:636:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/topo.cc:725:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/net.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/net.cc:144:93: warning: unused parameter 'graph' [-Wunused-parameter] 144 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:174:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 174 | struct setupReq req = { 0 }; | ^ transport/net.cc:174:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:211:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 211 | struct setupReq req = { 0 }; | ^ transport/net.cc:211:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t sendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:284:93: warning: unused parameter 'nranks' [-Wunused-parameter] 284 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/net.cc:284:105: warning: unused parameter 'rank' [-Wunused-parameter] 284 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t recvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:384:93: warning: unused parameter 'nranks' [-Wunused-parameter] 384 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/net.cc:384:105: warning: unused parameter 'rank' [-Wunused-parameter] 384 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/net.cc:560:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 560 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = connectMap; size_t = long unsigned int]': transport/net.cc:292:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer*; size_t = long unsigned int]': transport/net.cc:487:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer; size_t = long unsigned int]': transport/net.cc:491:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendNetResources; size_t = long unsigned int]': transport/net.cc:565:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvNetResources; size_t = long unsigned int]': transport/net.cc:598:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetDeviceHandle_v7_t; size_t = long unsigned int]': transport/net.cc:637:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedNetComms; size_t = long unsigned int]': transport/net.cc:668:9: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/net.cc:748:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/xml.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/xml.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/xml.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/xml.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/xml.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/xml.cc:13: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/xml.cc: In function 'ncclResult_t ncclTopoGetXmlFromCpu(ncclXmlNode*, ncclXml*)': graph/xml.cc:378:81: warning: unused parameter 'xml' [-Wunused-parameter] 378 | ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { | ~~~~~~~~~~~~~~~~^~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling transport/net_ib.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_ib.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_ib.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_ib.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_ib.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from transport/net_ib.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from transport/net_ib.cc:10: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net_ib.cc: In function 'ncclResult_t ncclIbInit(ncclDebugLogger_t)': transport/net_ib.cc:184:43: warning: unused parameter 'logFunction' [-Wunused-parameter] 184 | ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase*, void*, size_t, int, uint64_t, int, ibv_mr**)': transport/net_ib.cc:1182:97: warning: unused parameter 'type' [-Wunused-parameter] 1182 | ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, size_t size, int type, uint64_t offset, int fd, ibv_mr** mhandle) { | ~~~~^~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclIbListenComm; size_t = long unsigned int]': transport/net_ib.cc:744:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/search.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/search.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/search.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/search.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/search.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/search.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/search.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/search.cc: In function 'float getTotalBw(ncclTopoSystem*, ncclTopoNode*)': graph/search.cc:27:48: warning: unused parameter 'system' [-Wunused-parameter] 27 | static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/search.cc:400:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/search.cc:871:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling graph/tuning.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/tuning.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/tuning.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/tuning.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/tuning.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/core.h:39, from graph/tuning.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from graph/tuning.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling enqueue.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enqueue.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enqueue.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enqueue.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enqueue.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from enqueue.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ enqueue.cc: In function 'ncclResult_t ncclInitKernelsForDevice(int, size_t*)': enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::constSizeBytes' [-Wmissing-field-initializers] 46 | cudaFuncAttributes attr = {0}; | ^ enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::localSizeBytes' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::maxThreadsPerBlock' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::numRegs' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::ptxVersion' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::binaryVersion' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::cacheModeCA' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::maxDynamicSharedSizeBytes' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::preferredShmemCarveout' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::clusterDimMustBeSet' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterWidth' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterHeight' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterDepth' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::clusterSchedulingPolicyPreference' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::nonPortableClusterSizeAllowed' [-Wmissing-field-initializers] enqueue.cc:46:35: warning: missing initializer for member 'cudaFuncAttributes::reserved' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t addP2pToPlan(ncclComm*, ncclKernelPlan*, int*, bool, int, int, void*, size_t, bool)': enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 520 | }; | ^ enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::aggnBytes' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::workBytes' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::stepSize' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::chunkCount' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::workFuncIndex' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::regBufType' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::regBufSend' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::regBufRecv' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::userTuned' [-Wmissing-field-initializers] enqueue.cc:520:3: warning: missing initializer for member 'ncclInfo::next' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::proto' [-Wmissing-field-initializers] 542 | struct ncclWorkElemP2p elem = {0}; | ^ enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::p2pType' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::reg' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::nWarps' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::warpStart' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::ngroups' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::buffHi32' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::buffLo32' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::countHi32' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::countLo32' [-Wmissing-field-initializers] enqueue.cc:542:35: warning: missing initializer for member 'ncclWorkElemP2p::chunkSize' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t ncclLaunchKernel(ncclComm*, ncclKernelPlan*)': enqueue.cc:1331:41: warning: missing initializer for member 'cudaLaunchConfig_st::blockDim' [-Wmissing-field-initializers] 1331 | cudaLaunchConfig_t launchConfig = {0}; | ^ enqueue.cc:1331:41: warning: missing initializer for member 'cudaLaunchConfig_st::dynamicSmemBytes' [-Wmissing-field-initializers] enqueue.cc:1331:41: warning: missing initializer for member 'cudaLaunchConfig_st::stream' [-Wmissing-field-initializers] enqueue.cc:1331:41: warning: missing initializer for member 'cudaLaunchConfig_st::attrs' [-Wmissing-field-initializers] enqueue.cc:1331:41: warning: missing initializer for member 'cudaLaunchConfig_st::numAttrs' [-Wmissing-field-initializers] make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Compiling init.cc > /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init.o` g++ -I. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init.cc -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from init.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:370:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 370 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:371:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 371 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:116, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init.cc: In function 'ncclResult_t commGetSplitInfo(ncclComm*, ncclComm*, int, int, int*, int*, int*)': init.cc:1414:55: warning: unused parameter 'comm' [-Wunused-parameter] 1414 | static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc: At global scope: init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1770 | }; | ^ init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1770:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'ncclResult_t ncclCommInitAll(ncclComm**, int, const int*)': init.cc:1797:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1797 | }; | ^ init.cc:1797:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1797:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1797:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1797:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'const char* ncclGetLastError(ncclComm_t)': init.cc:2221:41: warning: unused parameter 'comm' [-Wunused-parameter] 2221 | const char* ncclGetLastError(ncclComm_t comm) { | ~~~~~~~~~~~^~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': init.cc:360:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedResources; size_t = long unsigned int]': init.cc:368:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': init.cc:372:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = collNetTrySetup(ncclComm_t, ncclComm_t, ncclTopoGraph*)::collnetShareInfo; size_t = long unsigned int]': init.cc:623:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCollNetSharedRes; size_t = long unsigned int]': init.cc:679:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned char [4][10]; size_t = long unsigned int]': init.cc:729:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPeerInfo; size_t = long unsigned int]': init.cc:864:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = initTransportsRank(ncclComm*, ncclComm*)::allGatherInfo; size_t = long unsigned int]': init.cc:1058:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNodeRanks; size_t = long unsigned int]': init.cc:1092:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoRanks*; size_t = long unsigned int]': init.cc:1129:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclComm; size_t = long unsigned int]': init.cc:1731:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned int; size_t = long unsigned int]': init.cc:1733:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommInitRankAsyncJob; size_t = long unsigned int]': init.cc:1740:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommFinalizeAsyncJob; size_t = long unsigned int]': init.cc:1947:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = ncclWork; size_t = long unsigned int]' init.cc:424:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_i64_NVLS_SIMPLEv 376 bytes stack frame, 680 bytes spill stores, 1124 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_i32_NVLS_SIMPLEv 384 bytes stack frame, 716 bytes spill stores, 1052 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 55 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 55 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 57 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '__nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__a1af3aae_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__a1af3aae_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ../include/utils.h:46:13: warning: 'long int log2i(long int)' defined but not used [-Wunused-function] 46 | static long log2i(long n) { | ^~~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 472 bytes stack frame, 332 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 360 bytes stack frame, 444 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 464 bytes stack frame, 324 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 440 bytes stack frame, 296 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 416 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 416 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 472 bytes stack frame, 332 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 456 bytes stack frame, 296 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 224 bytes stack frame, 220 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 360 bytes stack frame, 428 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 256 bytes stack frame, 284 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 248 bytes stack frame, 260 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 248 bytes stack frame, 260 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 248 bytes stack frame, 260 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 272 bytes stack frame, 288 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 384 bytes stack frame, 580 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 448 bytes stack frame, 312 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 232 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 232 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 232 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 264 bytes stack frame, 280 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 464 bytes stack frame, 332 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 368 bytes stack frame, 544 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 464 bytes stack frame, 324 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 448 bytes stack frame, 292 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 368 bytes stack frame, 552 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 300 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 496 bytes stack frame, 368 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 480 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 416 bytes stack frame, 264 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 416 bytes stack frame, 264 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 416 bytes stack frame, 264 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 464 bytes stack frame, 316 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 456 bytes stack frame, 792 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 456 bytes stack frame, 296 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 592 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 416 bytes stack frame, 260 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_i64_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1252 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_i64_NVLS_SIMPLEv 600 bytes stack frame, 1868 bytes spill stores, 3440 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 344 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 344 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 344 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 400 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 448 bytes stack frame, 712 bytes spill stores, 1384 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 392 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 392 bytes stack frame, 696 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_i32_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1268 bytes spill stores, 2128 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_i32_NVLS_SIMPLEv 600 bytes stack frame, 1868 bytes spill stores, 3440 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 128 bytes stack frame, 192 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 256 bytes stack frame, 260 bytes spill stores, 304 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 128 bytes stack frame, 192 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 256 bytes stack frame, 260 bytes spill stores, 304 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 128 bytes stack frame, 192 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 256 bytes stack frame, 260 bytes spill stores, 304 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 128 bytes stack frame, 204 bytes spill stores, 264 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 288 bytes stack frame, 332 bytes spill stores, 404 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 256 bytes stack frame, 724 bytes spill stores, 1104 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 400 bytes stack frame, 804 bytes spill stores, 1160 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 176 bytes stack frame, 632 bytes spill stores, 844 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 408 bytes stack frame, 836 bytes spill stores, 1192 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 248 bytes stack frame, 264 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 480 bytes stack frame, 356 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 480 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 504 bytes stack frame, 812 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 384 bytes stack frame, 580 bytes spill stores, 952 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 424 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 504 bytes stack frame, 812 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 384 bytes stack frame, 580 bytes spill stores, 952 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 424 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 384 bytes stack frame, 428 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 536 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 772 bytes spill stores, 1012 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 372 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 816 bytes spill stores, 1068 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 440 bytes stack frame, 640 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 648 bytes spill stores, 860 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 692 bytes spill stores, 892 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 332 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 504 bytes stack frame, 356 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 484 bytes spill stores, 592 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 332 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 504 bytes stack frame, 356 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 484 bytes spill stores, 592 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 332 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 504 bytes stack frame, 356 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 484 bytes spill stores, 592 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 560 bytes spill stores, 728 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 400 bytes stack frame, 556 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 756 bytes spill stores, 988 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 784 bytes spill stores, 1004 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 464 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 464 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 464 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 432 bytes stack frame, 640 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 644 bytes spill stores, 856 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 688 bytes spill stores, 860 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 348 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 600 bytes spill stores, 736 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 348 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 600 bytes spill stores, 736 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 348 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 600 bytes spill stores, 736 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 716 bytes spill stores, 920 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 440 bytes stack frame, 668 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 804 bytes spill stores, 1088 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 384 bytes stack frame, 592 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 868 bytes spill stores, 1172 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 432 bytes spill stores, 492 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 432 bytes spill stores, 492 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 432 bytes spill stores, 492 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 496 bytes spill stores, 600 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 432 bytes stack frame, 656 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 660 bytes spill stores, 864 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 976 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 692 bytes spill stores, 840 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 492 bytes spill stores, 612 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 520 bytes stack frame, 384 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 492 bytes spill stores, 612 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 520 bytes stack frame, 384 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 492 bytes spill stores, 612 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 728 bytes spill stores, 940 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 432 bytes stack frame, 644 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 528 bytes stack frame, 396 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 768 bytes spill stores, 1068 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 384 bytes stack frame, 528 bytes spill stores, 1024 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 792 bytes spill stores, 1060 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 604 bytes spill stores, 736 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 432 bytes stack frame, 656 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 660 bytes spill stores, 872 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 976 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 692 bytes spill stores, 876 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 464 bytes stack frame, 648 bytes spill stores, 1172 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 608 bytes spill stores, 812 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 376 bytes stack frame, 528 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 620 bytes spill stores, 752 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 392 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 392 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 392 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 376 bytes stack frame, 408 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 712 bytes spill stores, 928 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 440 bytes stack frame, 692 bytes spill stores, 1232 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 452 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 812 bytes spill stores, 1100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 424 bytes stack frame, 588 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 392 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 824 bytes spill stores, 1116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 712 bytes spill stores, 928 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 432 bytes stack frame, 672 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 796 bytes spill stores, 1084 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 416 bytes stack frame, 624 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 392 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 816 bytes spill stores, 1108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 472 bytes spill stores, 584 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 472 bytes spill stores, 584 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 524 bytes spill stores, 648 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 456 bytes stack frame, 696 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 416 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 732 bytes spill stores, 968 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 400 bytes stack frame, 636 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 764 bytes spill stores, 964 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 544 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 544 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 544 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 440 bytes stack frame, 672 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 636 bytes spill stores, 840 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 384 bytes stack frame, 540 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 648 bytes spill stores, 780 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 344 bytes stack frame, 440 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 560 bytes stack frame, 464 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 500 bytes spill stores, 628 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 400 bytes stack frame, 536 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 776 bytes spill stores, 1024 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 808 bytes spill stores, 1012 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 512 bytes stack frame, 340 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 572 bytes spill stores, 724 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 512 bytes stack frame, 348 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 572 bytes spill stores, 724 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 512 bytes stack frame, 348 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 572 bytes spill stores, 724 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 636 bytes spill stores, 840 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 440 bytes stack frame, 676 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 652 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 384 bytes stack frame, 596 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_NVLS_SIMPLEv 376 bytes stack frame, 680 bytes spill stores, 1124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 696 bytes spill stores, 856 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 640 bytes spill stores, 784 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 712 bytes spill stores, 928 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 432 bytes stack frame, 636 bytes spill stores, 1184 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 776 bytes spill stores, 1064 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 376 bytes stack frame, 528 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 504 bytes stack frame, 396 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_NVLS_SIMPLEv 376 bytes stack frame, 692 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 800 bytes spill stores, 1092 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 344 bytes stack frame, 456 bytes spill stores, 568 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 344 bytes stack frame, 456 bytes spill stores, 568 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 344 bytes stack frame, 456 bytes spill stores, 568 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 496 bytes spill stores, 664 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 416 bytes stack frame, 632 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 776 bytes spill stores, 1028 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 360 bytes stack frame, 452 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 464 bytes stack frame, 812 bytes spill stores, 1024 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 504 bytes spill stores, 624 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 384 bytes stack frame, 428 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 536 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 772 bytes spill stores, 1012 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 372 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_NVLS_SIMPLEv 376 bytes stack frame, 692 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 816 bytes spill stores, 1068 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 392 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 488 bytes stack frame, 328 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 392 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 488 bytes stack frame, 328 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 392 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 488 bytes stack frame, 328 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 520 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 628 bytes spill stores, 852 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 512 bytes stack frame, 780 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1080 bytes spill stores, 1476 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 472 bytes stack frame, 792 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1128 bytes spill stores, 1456 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 472 bytes spill stores, 536 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 472 bytes spill stores, 536 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 472 bytes spill stores, 536 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 528 bytes spill stores, 632 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 432 bytes stack frame, 640 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 596 bytes spill stores, 804 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_NVLS_SIMPLEv 384 bytes stack frame, 716 bytes spill stores, 1052 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 600 bytes spill stores, 736 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 504 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 452 bytes spill stores, 496 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 504 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 452 bytes spill stores, 496 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 504 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 452 bytes spill stores, 496 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 524 bytes spill stores, 636 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 432 bytes stack frame, 644 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 656 bytes spill stores, 880 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 368 bytes stack frame, 512 bytes spill stores, 948 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_NVLS_SIMPLEv 384 bytes stack frame, 716 bytes spill stores, 1052 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 700 bytes spill stores, 908 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 440 bytes stack frame, 640 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 648 bytes spill stores, 860 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_NVLS_SIMPLEv 376 bytes stack frame, 732 bytes spill stores, 1052 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 692 bytes spill stores, 892 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 344 bytes stack frame, 388 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 496 bytes stack frame, 336 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 344 bytes stack frame, 388 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 496 bytes stack frame, 336 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 344 bytes stack frame, 388 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 496 bytes stack frame, 336 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 608 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 632 bytes spill stores, 852 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 504 bytes stack frame, 796 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1084 bytes spill stores, 1480 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 480 bytes stack frame, 816 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1136 bytes spill stores, 1460 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 604 bytes spill stores, 648 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 336 bytes stack frame, 436 bytes spill stores, 500 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 428 bytes spill stores, 488 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 544 bytes stack frame, 424 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 504 bytes spill stores, 632 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 424 bytes stack frame, 644 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 660 bytes spill stores, 856 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 692 bytes spill stores, 884 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 552 bytes spill stores, 1300 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 608 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 552 bytes spill stores, 1300 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 608 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 552 bytes spill stores, 1300 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 608 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 424 bytes stack frame, 660 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 648 bytes spill stores, 860 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 712 bytes stack frame, 1128 bytes spill stores, 1840 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 528 bytes stack frame, 392 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1096 bytes spill stores, 1492 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 496 bytes stack frame, 828 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 664 bytes stack frame, 1294 bytes spill stores, 1668 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 464 bytes stack frame, 304 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 456 bytes stack frame, 296 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 456 bytes stack frame, 296 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 376 bytes stack frame, 600 bytes spill stores, 1168 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 296 bytes stack frame, 364 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 432 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 248 bytes stack frame, 264 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 480 bytes stack frame, 364 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 344 bytes stack frame, 412 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 480 bytes stack frame, 360 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 456 bytes stack frame, 288 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 504 bytes stack frame, 356 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 488 bytes stack frame, 352 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 480 bytes stack frame, 348 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 344 bytes stack frame, 440 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 524 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 372 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 560 bytes stack frame, 464 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 500 bytes spill stores, 628 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 400 bytes stack frame, 536 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 776 bytes spill stores, 1024 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_NVLS_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 808 bytes spill stores, 1012 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 464 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 464 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 248 bytes stack frame, 264 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 384 bytes stack frame, 640 bytes spill stores, 1232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 396 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 576 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 368 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 576 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 368 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 576 bytes spill stores, 776 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 368 bytes stack frame, 428 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 424 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 652 bytes spill stores, 932 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 488 bytes stack frame, 928 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 424 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1024 bytes spill stores, 1400 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 440 bytes stack frame, 844 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 384 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_AllGather_NVLS_SIMPLEv 488 bytes stack frame, 1504 bytes spill stores, 2476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1380 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 400 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 480 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 616 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 400 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 480 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 616 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 400 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 480 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 616 bytes spill stores, 760 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 368 bytes stack frame, 408 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 652 bytes spill stores, 864 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 512 bytes stack frame, 820 bytes spill stores, 1548 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1084 bytes spill stores, 1472 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 488 bytes stack frame, 820 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1128 bytes spill stores, 1444 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 516 bytes spill stores, 640 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 400 bytes stack frame, 536 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 776 bytes spill stores, 1024 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 808 bytes spill stores, 1012 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 336 bytes stack frame, 428 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 480 bytes stack frame, 328 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 640 bytes stack frame, 1140 bytes spill stores, 1784 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 352 bytes stack frame, 640 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 448 bytes stack frame, 304 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 472 bytes stack frame, 332 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 328 bytes stack frame, 388 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 232 bytes stack frame, 228 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 448 bytes stack frame, 288 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 360 bytes stack frame, 436 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 488 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 344 bytes stack frame, 384 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 488 bytes stack frame, 348 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 240 bytes stack frame, 268 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 452 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 452 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 328 bytes stack frame, 452 bytes spill stores, 532 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 552 bytes stack frame, 440 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 508 bytes spill stores, 632 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 472 bytes stack frame, 728 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 840 bytes spill stores, 1108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 416 bytes stack frame, 668 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 480 bytes stack frame, 356 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_NVLS_SIMPLEv 432 bytes stack frame, 964 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 868 bytes spill stores, 1100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 384 bytes stack frame, 640 bytes spill stores, 1232 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 396 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 436 bytes spill stores, 516 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 516 bytes spill stores, 640 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 400 bytes stack frame, 536 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 776 bytes spill stores, 1024 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_NVLS_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 808 bytes spill stores, 1012 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 360 bytes stack frame, 436 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 488 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 456 bytes stack frame, 312 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 368 bytes stack frame, 548 bytes spill stores, 1004 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 288 bytes stack frame, 352 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 368 bytes stack frame, 476 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 368 bytes stack frame, 476 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 368 bytes stack frame, 476 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 472 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 544 bytes stack frame, 432 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 708 bytes spill stores, 996 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 544 bytes stack frame, 800 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1152 bytes spill stores, 1544 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 496 bytes stack frame, 804 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1176 bytes spill stores, 1496 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 416 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 416 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 472 bytes stack frame, 332 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 456 bytes stack frame, 296 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 224 bytes stack frame, 220 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 676 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 400 bytes stack frame, 472 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 544 bytes stack frame, 432 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 668 bytes spill stores, 892 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 552 bytes stack frame, 800 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1124 bytes spill stores, 1512 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 496 bytes stack frame, 804 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1176 bytes spill stores, 1496 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 472 bytes stack frame, 340 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 376 bytes stack frame, 448 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 456 bytes stack frame, 312 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 416 bytes stack frame, 252 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 496 bytes stack frame, 368 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 480 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 432 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 456 bytes stack frame, 808 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 532 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 456 bytes stack frame, 760 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 488 bytes stack frame, 352 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 624 bytes spill stores, 976 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 440 bytes stack frame, 304 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 248 bytes stack frame, 264 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 480 bytes stack frame, 356 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 480 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 248 bytes stack frame, 264 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 472 bytes stack frame, 348 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 384 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 348 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 240 bytes stack frame, 268 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 336 bytes stack frame, 456 bytes spill stores, 560 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 336 bytes stack frame, 456 bytes spill stores, 560 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 336 bytes stack frame, 456 bytes spill stores, 560 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 516 bytes spill stores, 672 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 472 bytes stack frame, 728 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 840 bytes spill stores, 1108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 416 bytes stack frame, 668 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 480 bytes stack frame, 356 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_NVLS_SIMPLEv 432 bytes stack frame, 964 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 868 bytes spill stores, 1100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 416 bytes spill stores, 464 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 416 bytes spill stores, 464 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 320 bytes stack frame, 416 bytes spill stores, 464 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 352 bytes stack frame, 380 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 504 bytes spill stores, 628 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 424 bytes stack frame, 644 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 656 bytes spill stores, 852 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 692 bytes spill stores, 884 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 400 bytes stack frame, 780 bytes spill stores, 1568 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 344 bytes stack frame, 500 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1224 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 544 bytes stack frame, 1200 bytes spill stores, 1956 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 360 bytes stack frame, 512 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 384 bytes stack frame, 472 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1756 bytes spill stores, 2784 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 772 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 528 bytes stack frame, 1228 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 336 bytes stack frame, 496 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 736 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1836 bytes spill stores, 2716 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 480 bytes stack frame, 328 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1108 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 276 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 480 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1108 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 276 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 480 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1108 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 276 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 400 bytes stack frame, 788 bytes spill stores, 1576 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 560 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1236 bytes spill stores, 2260 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 544 bytes stack frame, 1240 bytes spill stores, 2008 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 560 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 368 bytes stack frame, 520 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 424 bytes stack frame, 640 bytes spill stores, 1568 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1864 bytes spill stores, 2864 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 1228 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 536 bytes stack frame, 1288 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 488 bytes stack frame, 288 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 344 bytes stack frame, 504 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 344 bytes stack frame, 412 bytes spill stores, 1008 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 308 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 744 bytes stack frame, 2108 bytes spill stores, 2996 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 432 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 360 bytes stack frame, 672 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1048 bytes spill stores, 1608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 360 bytes stack frame, 672 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 496 bytes stack frame, 332 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1048 bytes spill stores, 1608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 360 bytes stack frame, 672 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 496 bytes stack frame, 332 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1048 bytes spill stores, 1608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 400 bytes stack frame, 736 bytes spill stores, 1444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 344 bytes stack frame, 504 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1332 bytes spill stores, 2300 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 544 bytes stack frame, 1224 bytes spill stores, 2048 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 360 bytes stack frame, 516 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 432 bytes stack frame, 764 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1604 bytes spill stores, 2804 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 1532 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 496 bytes stack frame, 1248 bytes spill stores, 1904 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 504 bytes stack frame, 316 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 336 bytes stack frame, 496 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 384 bytes stack frame, 496 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1644 bytes spill stores, 2568 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 392 bytes stack frame, 848 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 472 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1132 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 392 bytes stack frame, 848 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 472 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1132 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 392 bytes stack frame, 848 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 472 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1132 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 416 bytes stack frame, 860 bytes spill stores, 1552 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 360 bytes stack frame, 544 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1208 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 536 bytes stack frame, 1272 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 368 bytes stack frame, 556 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 424 bytes stack frame, 744 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1364 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 464 bytes spill stores, 1540 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 504 bytes stack frame, 1204 bytes spill stores, 2168 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 472 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1520 bytes spill stores, 2284 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 360 bytes stack frame, 712 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1216 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 280 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 360 bytes stack frame, 712 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1216 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 280 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 360 bytes stack frame, 712 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1216 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 280 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 400 bytes stack frame, 708 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1368 bytes spill stores, 2328 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 568 bytes stack frame, 1280 bytes spill stores, 2076 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 552 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 368 bytes stack frame, 520 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 456 bytes stack frame, 804 bytes spill stores, 1860 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1644 bytes spill stores, 2816 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 472 bytes spill stores, 1548 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 528 bytes stack frame, 1332 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 488 bytes stack frame, 292 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 344 bytes stack frame, 504 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 376 bytes stack frame, 696 bytes spill stores, 1504 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 316 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1712 bytes spill stores, 2604 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 416 bytes stack frame, 860 bytes spill stores, 1552 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 360 bytes stack frame, 536 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1220 bytes spill stores, 1856 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 552 bytes stack frame, 1276 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 424 bytes stack frame, 744 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1484 bytes spill stores, 2380 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 464 bytes spill stores, 1540 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 504 bytes stack frame, 1184 bytes spill stores, 2088 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 480 bytes stack frame, 292 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1600 bytes spill stores, 2356 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1064 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 496 bytes stack frame, 348 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1064 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 496 bytes stack frame, 348 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1064 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 416 bytes stack frame, 868 bytes spill stores, 1560 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1228 bytes spill stores, 2108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 536 bytes stack frame, 1292 bytes spill stores, 2108 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 376 bytes stack frame, 560 bytes spill stores, 904 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 424 bytes stack frame, 748 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 528 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1476 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 1532 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 496 bytes stack frame, 1224 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 488 bytes stack frame, 304 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 360 bytes stack frame, 424 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1592 bytes spill stores, 2356 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 352 bytes spill stores, 460 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 392 bytes stack frame, 864 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1116 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 392 bytes stack frame, 864 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1116 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 392 bytes stack frame, 864 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1116 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 408 bytes stack frame, 864 bytes spill stores, 1548 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1204 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 552 bytes stack frame, 1300 bytes spill stores, 2116 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 376 bytes stack frame, 560 bytes spill stores, 904 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 424 bytes stack frame, 748 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 528 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1392 bytes spill stores, 2400 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 1400 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 504 bytes stack frame, 1276 bytes spill stores, 2216 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 360 bytes stack frame, 428 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1452 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 360 bytes spill stores, 468 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 384 bytes stack frame, 796 bytes spill stores, 1364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 504 bytes stack frame, 332 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 500 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 312 bytes stack frame, 360 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1208 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 384 bytes stack frame, 796 bytes spill stores, 1364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 504 bytes stack frame, 332 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 500 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 312 bytes stack frame, 360 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1208 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 384 bytes stack frame, 796 bytes spill stores, 1364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 504 bytes stack frame, 332 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 500 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 312 bytes stack frame, 360 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1208 bytes spill stores, 1628 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 416 bytes stack frame, 908 bytes spill stores, 1548 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 568 bytes stack frame, 384 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 344 bytes stack frame, 488 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 352 bytes stack frame, 420 bytes spill stores, 928 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 520 bytes stack frame, 384 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1256 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 496 bytes stack frame, 1308 bytes spill stores, 2204 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 568 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 360 bytes stack frame, 504 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 472 bytes stack frame, 864 bytes spill stores, 2184 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1688 bytes spill stores, 2924 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 400 bytes stack frame, 736 bytes spill stores, 2296 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 480 bytes stack frame, 1300 bytes spill stores, 2156 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 488 bytes stack frame, 284 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 336 bytes stack frame, 488 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 392 bytes stack frame, 712 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 464 bytes stack frame, 308 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 664 bytes stack frame, 1784 bytes spill stores, 2828 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 400 bytes spill stores, 1032 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 772 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 300 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1060 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 772 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 300 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1060 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 772 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 340 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 524 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 300 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1060 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 400 bytes stack frame, 836 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 560 bytes stack frame, 404 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 352 bytes stack frame, 508 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1164 bytes spill stores, 1708 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 536 bytes stack frame, 1300 bytes spill stores, 2088 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 560 bytes stack frame, 404 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 368 bytes stack frame, 520 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 432 bytes stack frame, 780 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1396 bytes spill stores, 2376 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 472 bytes spill stores, 1544 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 528 bytes stack frame, 1284 bytes spill stores, 2016 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 320 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 344 bytes stack frame, 504 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 376 bytes stack frame, 564 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_u64_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1252 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_NVLS_SIMPLEv 600 bytes stack frame, 1868 bytes spill stores, 3440 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1460 bytes spill stores, 2304 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 536 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 432 bytes stack frame, 1048 bytes spill stores, 1824 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 568 bytes stack frame, 436 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1064 bytes spill stores, 1624 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 368 bytes stack frame, 776 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 312 bytes stack frame, 500 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1156 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 432 bytes stack frame, 1048 bytes spill stores, 1824 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 568 bytes stack frame, 464 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1064 bytes spill stores, 1624 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 400 bytes stack frame, 860 bytes spill stores, 1552 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 500 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 560 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1324 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 552 bytes stack frame, 1448 bytes spill stores, 2336 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 424 bytes stack frame, 564 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 520 bytes stack frame, 1296 bytes spill stores, 2264 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 376 bytes stack frame, 480 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1748 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 392 bytes stack frame, 892 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 456 bytes stack frame, 308 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1192 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 392 bytes stack frame, 892 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 456 bytes stack frame, 308 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1192 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 392 bytes stack frame, 892 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 456 bytes stack frame, 308 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1192 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 416 bytes stack frame, 948 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 560 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 360 bytes stack frame, 544 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 932 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1236 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 552 bytes stack frame, 1364 bytes spill stores, 2268 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 560 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 368 bytes stack frame, 556 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 448 bytes stack frame, 804 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 512 bytes stack frame, 364 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1628 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 400 bytes stack frame, 728 bytes spill stores, 2288 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 528 bytes stack frame, 1368 bytes spill stores, 2220 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 536 bytes stack frame, 364 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 352 bytes stack frame, 544 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 368 bytes stack frame, 472 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 456 bytes stack frame, 320 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1652 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 384 bytes spill stores, 1116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 384 bytes stack frame, 792 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 320 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1116 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 276 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 400 bytes stack frame, 780 bytes spill stores, 1568 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 344 bytes stack frame, 500 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1224 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 272 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 544 bytes stack frame, 1200 bytes spill stores, 1956 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 360 bytes stack frame, 512 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 384 bytes stack frame, 472 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1756 bytes spill stores, 2784 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 772 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 252 bytes spill stores, 448 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 528 bytes stack frame, 1228 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 336 bytes stack frame, 496 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 328 bytes stack frame, 372 bytes spill stores, 736 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f64_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1224 bytes spill stores, 2116 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_NVLS_SIMPLEv 600 bytes stack frame, 1860 bytes spill stores, 3432 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1836 bytes spill stores, 2716 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 360 bytes stack frame, 688 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 504 bytes stack frame, 376 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1220 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 360 bytes stack frame, 688 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 504 bytes stack frame, 376 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1220 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 360 bytes stack frame, 688 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 320 bytes stack frame, 516 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 504 bytes stack frame, 376 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1220 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 276 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 400 bytes stack frame, 824 bytes spill stores, 1532 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 344 bytes stack frame, 512 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 544 bytes stack frame, 428 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1308 bytes spill stores, 2256 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 272 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 568 bytes stack frame, 1304 bytes spill stores, 2128 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 424 bytes stack frame, 752 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 552 bytes stack frame, 436 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1756 bytes spill stores, 2924 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 460 bytes spill stores, 1400 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 252 bytes spill stores, 448 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 520 bytes stack frame, 1356 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 488 bytes stack frame, 300 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 336 bytes stack frame, 496 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 360 bytes stack frame, 472 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 488 bytes stack frame, 364 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_u64_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1224 bytes spill stores, 2116 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_NVLS_SIMPLEv 600 bytes stack frame, 1860 bytes spill stores, 3432 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1896 bytes spill stores, 2788 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 384 bytes stack frame, 848 bytes spill stores, 1448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 480 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 472 bytes stack frame, 296 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1112 bytes spill stores, 1532 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 384 bytes stack frame, 848 bytes spill stores, 1448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 480 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 472 bytes stack frame, 304 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1112 bytes spill stores, 1532 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 384 bytes stack frame, 848 bytes spill stores, 1448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 480 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 472 bytes stack frame, 304 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1112 bytes spill stores, 1532 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 568 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 360 bytes stack frame, 540 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1204 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 544 bytes stack frame, 1300 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 568 bytes stack frame, 392 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 376 bytes stack frame, 552 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 432 bytes stack frame, 736 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 520 bytes stack frame, 356 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1356 bytes spill stores, 2344 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 360 bytes stack frame, 456 bytes spill stores, 1392 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 520 bytes stack frame, 1312 bytes spill stores, 2244 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 464 bytes stack frame, 268 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 360 bytes stack frame, 452 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 300 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_u32_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1268 bytes spill stores, 2128 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_NVLS_SIMPLEv 600 bytes stack frame, 1868 bytes spill stores, 3440 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1488 bytes spill stores, 2304 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 384 bytes stack frame, 864 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1120 bytes spill stores, 1640 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 384 bytes stack frame, 864 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1120 bytes spill stores, 1640 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 384 bytes stack frame, 864 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1120 bytes spill stores, 1640 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 416 bytes stack frame, 920 bytes spill stores, 1596 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 368 bytes stack frame, 540 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1196 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 308 bytes spill stores, 404 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 544 bytes stack frame, 1288 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 376 bytes stack frame, 552 bytes spill stores, 892 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 440 bytes stack frame, 784 bytes spill stores, 1820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1544 bytes spill stores, 2484 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 500 bytes spill stores, 1576 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 496 bytes stack frame, 1200 bytes spill stores, 2136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 472 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 892 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 376 bytes stack frame, 528 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1608 bytes spill stores, 2424 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 396 bytes spill stores, 676 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 824 bytes spill stores, 1396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1112 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 824 bytes spill stores, 1396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1112 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 824 bytes spill stores, 1396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1112 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 296 bytes spill stores, 476 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 416 bytes stack frame, 860 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 360 bytes stack frame, 544 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1196 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 296 bytes spill stores, 476 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 544 bytes stack frame, 1284 bytes spill stores, 2096 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 368 bytes stack frame, 556 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 424 bytes stack frame, 744 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1344 bytes spill stores, 2348 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 464 bytes spill stores, 1544 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 440 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 496 bytes stack frame, 1188 bytes spill stores, 2144 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 472 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_u32_NVLS_TREE_SIMPLEv 488 bytes stack frame, 1308 bytes spill stores, 2260 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_NVLS_SIMPLEv 600 bytes stack frame, 1860 bytes spill stores, 3432 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1516 bytes spill stores, 2264 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 264 bytes spill stores, 464 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 392 bytes stack frame, 844 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 512 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 480 bytes stack frame, 324 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1096 bytes spill stores, 1616 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 296 bytes spill stores, 476 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 416 bytes stack frame, 860 bytes spill stores, 1552 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 360 bytes stack frame, 536 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1220 bytes spill stores, 1856 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 296 bytes spill stores, 476 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 552 bytes stack frame, 1276 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 424 bytes stack frame, 744 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1484 bytes spill stores, 2380 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 464 bytes spill stores, 1540 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 268 bytes spill stores, 440 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 504 bytes stack frame, 1184 bytes spill stores, 2088 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 480 bytes stack frame, 292 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f32_NVLS_TREE_SIMPLEv 584 bytes stack frame, 1448 bytes spill stores, 2140 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_NVLS_SIMPLEv 608 bytes stack frame, 1936 bytes spill stores, 3408 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1600 bytes spill stores, 2356 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 384 bytes stack frame, 932 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1128 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 384 bytes stack frame, 932 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1128 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 384 bytes stack frame, 932 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 480 bytes stack frame, 308 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 328 bytes stack frame, 544 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 472 bytes stack frame, 316 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1128 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 416 bytes stack frame, 920 bytes spill stores, 1596 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 368 bytes stack frame, 540 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1196 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 304 bytes spill stores, 400 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 544 bytes stack frame, 1272 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 376 bytes stack frame, 552 bytes spill stores, 892 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 440 bytes stack frame, 776 bytes spill stores, 1816 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1520 bytes spill stores, 2416 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 360 bytes stack frame, 496 bytes spill stores, 1432 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 496 bytes stack frame, 1200 bytes spill stores, 2136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 472 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 892 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 360 bytes stack frame, 496 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1596 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 368 bytes spill stores, 1052 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 352 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 504 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1872 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 300 bytes spill stores, 456 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 352 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 504 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1872 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 300 bytes spill stores, 456 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 352 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 504 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1872 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 300 bytes spill stores, 456 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 400 bytes stack frame, 780 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 504 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 360 bytes stack frame, 428 bytes spill stores, 940 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 528 bytes stack frame, 412 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1312 bytes spill stores, 2260 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 308 bytes spill stores, 404 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 568 bytes stack frame, 1360 bytes spill stores, 2208 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 376 bytes stack frame, 516 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 456 bytes stack frame, 828 bytes spill stores, 1856 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 536 bytes stack frame, 420 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1964 bytes spill stores, 3136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 500 bytes spill stores, 1440 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 528 bytes stack frame, 1388 bytes spill stores, 2076 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 512 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 376 bytes stack frame, 548 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 392 bytes stack frame, 524 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1940 bytes spill stores, 3040 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 480 bytes spill stores, 1328 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 360 bytes stack frame, 788 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 344 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 492 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 456 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1328 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 256 bytes stack frame, 284 bytes spill stores, 396 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 360 bytes stack frame, 788 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 492 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 456 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1328 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 256 bytes stack frame, 284 bytes spill stores, 396 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 360 bytes stack frame, 788 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 492 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 456 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1328 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 256 bytes stack frame, 284 bytes spill stores, 396 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 400 bytes stack frame, 828 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 536 bytes stack frame, 396 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 392 bytes stack frame, 528 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 376 bytes stack frame, 452 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 528 bytes stack frame, 412 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1452 bytes spill stores, 2348 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 568 bytes stack frame, 1352 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 536 bytes stack frame, 376 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 392 bytes stack frame, 532 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 488 bytes stack frame, 892 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 536 bytes stack frame, 420 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 2048 bytes spill stores, 3272 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 572 bytes spill stores, 1536 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 528 bytes stack frame, 1420 bytes spill stores, 2136 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 512 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 408 bytes stack frame, 556 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 416 bytes stack frame, 628 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 2020 bytes spill stores, 3172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 412 bytes spill stores, 1172 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 432 bytes stack frame, 1064 bytes spill stores, 1884 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 576 bytes stack frame, 452 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 448 bytes stack frame, 308 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1254 bytes spill stores, 1670 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 408 bytes stack frame, 840 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 320 bytes stack frame, 500 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1164 bytes spill stores, 1560 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 432 bytes stack frame, 1064 bytes spill stores, 1884 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 576 bytes stack frame, 468 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 448 bytes stack frame, 308 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 480 bytes stack frame, 1164 bytes spill stores, 1612 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 440 bytes stack frame, 880 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1296 bytes spill stores, 1800 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 592 bytes stack frame, 1608 bytes spill stores, 2564 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 456 bytes stack frame, 808 bytes spill stores, 1772 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1648 bytes spill stores, 2624 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 1228 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 552 bytes stack frame, 1496 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 376 bytes stack frame, 484 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1732 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1168 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 512 bytes stack frame, 348 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 1952 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1540 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 428 bytes spill stores, 996 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1168 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 512 bytes stack frame, 348 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 1952 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1540 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 428 bytes spill stores, 996 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1168 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 512 bytes stack frame, 348 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 1952 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1540 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 428 bytes spill stores, 996 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 504 bytes stack frame, 1300 bytes spill stores, 2140 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 544 bytes stack frame, 380 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 416 bytes stack frame, 752 bytes spill stores, 1676 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 488 bytes stack frame, 344 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1632 bytes spill stores, 2392 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 508 bytes spill stores, 1228 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1064 bytes stack frame, 2996 bytes spill stores, 4200 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 544 bytes stack frame, 380 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 712 bytes stack frame, 1232 bytes spill stores, 2628 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 496 bytes stack frame, 348 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 784 bytes stack frame, 3348 bytes spill stores, 5696 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 664 bytes stack frame, 1376 bytes spill stores, 3064 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1120 bytes stack frame, 3516 bytes spill stores, 4536 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 480 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 892 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 584 bytes stack frame, 1144 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 440 bytes stack frame, 304 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 928 bytes stack frame, 4084 bytes spill stores, 6812 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 424 bytes stack frame, 1304 bytes spill stores, 2436 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 424 bytes stack frame, 1004 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 436 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1656 bytes spill stores, 2240 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 308 bytes spill stores, 448 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 424 bytes stack frame, 1004 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 436 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1656 bytes spill stores, 2240 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 308 bytes spill stores, 448 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 424 bytes stack frame, 1004 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 488 bytes stack frame, 324 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 436 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 448 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1656 bytes spill stores, 2240 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 308 bytes spill stores, 448 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1896 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 536 bytes stack frame, 372 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 360 bytes stack frame, 512 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 384 bytes stack frame, 488 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 488 bytes stack frame, 344 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1636 bytes spill stores, 2320 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 992 bytes stack frame, 2948 bytes spill stores, 4232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 536 bytes stack frame, 368 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 592 bytes stack frame, 1176 bytes spill stores, 2536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 3440 bytes spill stores, 5768 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 480 bytes stack frame, 1232 bytes spill stores, 2924 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 1000 bytes stack frame, 2984 bytes spill stores, 4060 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 488 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 352 bytes stack frame, 520 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 536 bytes stack frame, 1096 bytes spill stores, 1964 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 880 bytes stack frame, 3852 bytes spill stores, 6436 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 424 bytes stack frame, 1252 bytes spill stores, 2632 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 312 bytes spill stores, 496 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 544 bytes stack frame, 1316 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 472 bytes stack frame, 284 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 460 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 440 bytes stack frame, 276 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1556 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 312 bytes spill stores, 496 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 544 bytes stack frame, 1316 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 472 bytes stack frame, 284 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 460 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 432 bytes stack frame, 272 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1556 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 312 bytes spill stores, 496 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 544 bytes stack frame, 1316 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 472 bytes stack frame, 284 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 460 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 432 bytes stack frame, 272 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1556 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 296 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 584 bytes stack frame, 1440 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 384 bytes stack frame, 484 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 480 bytes stack frame, 328 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1564 bytes spill stores, 2232 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 660 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 308 bytes spill stores, 524 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 1000 bytes stack frame, 2864 bytes spill stores, 4156 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 544 bytes stack frame, 376 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 592 bytes stack frame, 1140 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 856 bytes stack frame, 3412 bytes spill stores, 5736 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1240 bytes spill stores, 2736 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 276 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 984 bytes stack frame, 3044 bytes spill stores, 4096 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 464 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 352 bytes stack frame, 520 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 528 bytes stack frame, 1080 bytes spill stores, 1964 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 432 bytes stack frame, 288 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 3848 bytes spill stores, 6400 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 424 bytes stack frame, 1288 bytes spill stores, 2428 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 448 bytes stack frame, 912 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 704 bytes stack frame, 288 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1320 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 736 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 552 bytes stack frame, 1448 bytes spill stores, 2320 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 424 bytes stack frame, 564 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 520 bytes stack frame, 1296 bytes spill stores, 2264 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 376 bytes stack frame, 480 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1748 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 300 bytes spill stores, 508 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 432 bytes stack frame, 1048 bytes spill stores, 1824 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 568 bytes stack frame, 436 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1064 bytes spill stores, 1624 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 272 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 368 bytes stack frame, 776 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 496 bytes stack frame, 336 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 312 bytes stack frame, 500 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1156 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 300 bytes spill stores, 508 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 432 bytes stack frame, 1048 bytes spill stores, 1824 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 568 bytes stack frame, 464 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 488 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1064 bytes spill stores, 1624 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 280 bytes spill stores, 484 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 400 bytes stack frame, 860 bytes spill stores, 1552 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 500 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 560 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1324 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 388 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 288 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 552 bytes stack frame, 1448 bytes spill stores, 2336 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 424 bytes stack frame, 564 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 128 bytes stack frame, 252 bytes spill stores, 448 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 520 bytes stack frame, 1296 bytes spill stores, 2264 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 376 bytes stack frame, 480 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f16_NVLS_TREE_SIMPLEv 608 bytes stack frame, 1548 bytes spill stores, 2268 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_NVLS_SIMPLEv 640 bytes stack frame, 2020 bytes spill stores, 3480 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1748 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 544 bytes stack frame, 1460 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 816 bytes stack frame, 404 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 440 bytes stack frame, 276 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1596 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 308 bytes spill stores, 444 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 544 bytes stack frame, 1460 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 816 bytes stack frame, 404 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1596 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 308 bytes spill stores, 444 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 544 bytes stack frame, 1460 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 816 bytes stack frame, 404 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 456 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 268 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1596 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 272 bytes stack frame, 308 bytes spill stores, 444 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 600 bytes stack frame, 1576 bytes spill stores, 2508 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 840 bytes stack frame, 436 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 496 bytes stack frame, 344 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1648 bytes spill stores, 2684 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1296 bytes stack frame, 3876 bytes spill stores, 5032 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 848 bytes stack frame, 444 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 584 bytes stack frame, 1184 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 504 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 3284 bytes spill stores, 5512 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1252 bytes spill stores, 2744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1280 bytes stack frame, 4004 bytes spill stores, 4956 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 800 bytes stack frame, 388 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 504 bytes stack frame, 1132 bytes spill stores, 2008 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 744 bytes stack frame, 3644 bytes spill stores, 6236 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 424 bytes stack frame, 1300 bytes spill stores, 2460 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 968 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 568 bytes stack frame, 436 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 472 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1240 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 976 bytes spill stores, 1668 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 568 bytes stack frame, 464 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 464 bytes stack frame, 332 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1240 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 980 bytes spill stores, 1676 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 568 bytes stack frame, 464 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 472 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1240 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 448 bytes stack frame, 1048 bytes spill stores, 1804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 616 bytes stack frame, 484 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1332 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 736 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 528 bytes stack frame, 1500 bytes spill stores, 2448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 360 bytes stack frame, 516 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 480 bytes stack frame, 932 bytes spill stores, 2244 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 512 bytes stack frame, 372 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1940 bytes spill stores, 2960 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 408 bytes stack frame, 812 bytes spill stores, 2396 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 552 bytes stack frame, 1552 bytes spill stores, 2484 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 472 bytes stack frame, 276 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 416 bytes stack frame, 796 bytes spill stores, 1640 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_f16_NVLS_TREE_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2352 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_NVLS_SIMPLEv 656 bytes stack frame, 2076 bytes spill stores, 3476 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 2072 bytes spill stores, 3064 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 428 bytes spill stores, 1196 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 544 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 544 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 544 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 884 bytes spill stores, 1572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 672 bytes stack frame, 248 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1124 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 168 bytes stack frame, 304 bytes spill stores, 508 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 448 bytes stack frame, 912 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 704 bytes stack frame, 288 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1320 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 736 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 288 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 552 bytes stack frame, 1448 bytes spill stores, 2320 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 424 bytes stack frame, 564 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 128 bytes stack frame, 252 bytes spill stores, 448 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 520 bytes stack frame, 1296 bytes spill stores, 2264 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 376 bytes stack frame, 480 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_Sum_bf16_NVLS_TREE_SIMPLEv 608 bytes stack frame, 1548 bytes spill stores, 2268 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_NVLS_SIMPLEv 640 bytes stack frame, 2020 bytes spill stores, 3480 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1748 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 672 bytes stack frame, 1688 bytes spill stores, 2640 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 400 bytes stack frame, 572 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1676 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 380 bytes spill stores, 700 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 672 bytes stack frame, 1688 bytes spill stores, 2640 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 400 bytes stack frame, 572 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1676 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 380 bytes spill stores, 700 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 672 bytes stack frame, 1688 bytes spill stores, 2640 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 400 bytes stack frame, 572 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1676 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 380 bytes spill stores, 700 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 640 bytes stack frame, 1804 bytes spill stores, 2724 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 840 bytes stack frame, 456 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 376 bytes stack frame, 532 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 424 bytes stack frame, 560 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1668 bytes spill stores, 2328 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 440 bytes spill stores, 792 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1144 bytes stack frame, 3800 bytes spill stores, 5052 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 848 bytes stack frame, 464 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 600 bytes stack frame, 1128 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 504 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 864 bytes stack frame, 3676 bytes spill stores, 5924 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1340 bytes spill stores, 3024 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1112 bytes stack frame, 3880 bytes spill stores, 4992 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 784 bytes stack frame, 376 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 536 bytes stack frame, 1172 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 464 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 3992 bytes spill stores, 6572 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 464 bytes stack frame, 1392 bytes spill stores, 2736 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 656 bytes stack frame, 1696 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 400 bytes stack frame, 564 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1672 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 376 bytes spill stores, 624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 656 bytes stack frame, 1696 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 400 bytes stack frame, 564 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1672 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 376 bytes spill stores, 624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 656 bytes stack frame, 1696 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 808 bytes stack frame, 400 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 540 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 400 bytes stack frame, 564 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 296 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1672 bytes spill stores, 2296 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 376 bytes spill stores, 624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 632 bytes stack frame, 1784 bytes spill stores, 2704 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 840 bytes stack frame, 456 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 376 bytes stack frame, 532 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 424 bytes stack frame, 560 bytes spill stores, 1124 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 504 bytes stack frame, 360 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1648 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 440 bytes spill stores, 792 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1144 bytes stack frame, 3796 bytes spill stores, 5036 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 848 bytes stack frame, 464 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 600 bytes stack frame, 1128 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 504 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 864 bytes stack frame, 3608 bytes spill stores, 5872 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1372 bytes spill stores, 2832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1120 bytes stack frame, 3884 bytes spill stores, 5028 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 784 bytes stack frame, 376 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 536 bytes stack frame, 1184 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 464 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 3988 bytes spill stores, 6568 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 480 bytes stack frame, 1388 bytes spill stores, 2732 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 864 bytes spill stores, 1540 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 672 bytes stack frame, 256 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 328 bytes stack frame, 404 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 448 bytes stack frame, 292 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1332 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 644 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 864 bytes spill stores, 1540 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 672 bytes stack frame, 256 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 328 bytes stack frame, 404 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 448 bytes stack frame, 292 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1332 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 644 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 864 bytes spill stores, 1540 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 672 bytes stack frame, 256 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 328 bytes stack frame, 404 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 448 bytes stack frame, 292 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1332 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 644 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 448 bytes stack frame, 940 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 704 bytes stack frame, 292 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 352 bytes stack frame, 420 bytes spill stores, 940 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1428 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 592 bytes stack frame, 1608 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 456 bytes stack frame, 808 bytes spill stores, 1772 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1712 bytes spill stores, 2704 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 1228 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 552 bytes stack frame, 1496 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 480 bytes stack frame, 288 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 376 bytes stack frame, 484 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 464 bytes stack frame, 320 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1732 bytes spill stores, 2580 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 340 bytes spill stores, 464 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 844 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 240 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1204 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 844 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 240 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1204 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 844 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 240 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 528 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1204 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 640 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 448 bytes stack frame, 904 bytes spill stores, 1656 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 704 bytes stack frame, 292 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 496 bytes stack frame, 352 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1340 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 400 bytes spill stores, 736 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 528 bytes stack frame, 1500 bytes spill stores, 2448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 360 bytes stack frame, 516 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 480 bytes stack frame, 932 bytes spill stores, 2244 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 512 bytes stack frame, 372 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1940 bytes spill stores, 2960 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 408 bytes stack frame, 812 bytes spill stores, 2396 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 552 bytes stack frame, 1552 bytes spill stores, 2484 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 472 bytes stack frame, 276 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 336 bytes stack frame, 492 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 416 bytes stack frame, 796 bytes spill stores, 1640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_MinMax_bf16_NVLS_TREE_SIMPLEv 584 bytes stack frame, 1608 bytes spill stores, 2352 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_NVLS_SIMPLEv 656 bytes stack frame, 2076 bytes spill stores, 3476 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 2072 bytes spill stores, 3064 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 428 bytes spill stores, 1196 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.20.5-1/build/include -I../include --compiler-options "-fPIC -fvisibility=hidden" -dlink /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/common.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/onerank.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_gather.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/all_reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/broadcast.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_prod_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/sendrecv.cu.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/host_table.cc.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/genobj/device_table.cu.o -o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/device_glue.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Archiving libnccl_static.a > /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl_static.a mkdir -p /builddir/build/BUILD/nccl-2.20.5-1/build/lib ar cr /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl_static.a /builddir/build/BUILD/nccl-2.20.5-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/group.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/register.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/manifest) make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.20.5-1/src' Linking libnccl.so.2.20.5 > /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl.so.2.20.5 mkdir -p /builddir/build/BUILD/nccl-2.20.5-1/build/lib g++ -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -shared -Wl,--no-as-needed -Wl,-soname,libnccl.so.2 -o /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl.so.2.20.5 /builddir/build/BUILD/nccl-2.20.5-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/group.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/register.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.20.5-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.20.5-1/build/obj/device/manifest) -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -L/usr/local/cuda/lib64 -lcudart_static -lpthread -lrt -ldl ln -sf libnccl.so.2 /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl.so ln -sf libnccl.so.2.20.5 /builddir/build/BUILD/nccl-2.20.5-1/build/lib/libnccl.so.2 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.20.5-1/src' + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.5apFI6 + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 ++ dirname /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.20.5-1 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/include + mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/lib64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/lib64/pkgconfig + cp -d build/lib/libnccl.so build/lib/libnccl.so.2 build/lib/libnccl.so.2.20.5 build/lib/libnccl_static.a /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/lib64 + cp build/include/nccl.h build/include/nccl_net.h /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/include + cp build/lib/pkgconfig/nccl.pc /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/lib64/pkgconfig/ + /usr/bin/find-debuginfo -j80 --strict-build-id -m -i --build-id-seed 2.20.5-1.cuda12.1.an23 --unique-debug-suffix -2.20.5-1.cuda12.1.an23.aarch64 --unique-debug-src-base libnccl-2.20.5-1.cuda12.1.an23.aarch64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 50000000 -S debugsourcefiles.list /builddir/build/BUILD/nccl-2.20.5-1 extracting debug info from /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/lib64/libnccl.so.2.20.5 original debug info size: 8124kB, size after compression: 6900kB /usr/bin/sepdebugcrcfix: Updated 1 CRC32s, 0 CRC32s did match. 3173 blocks + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/anolis/brp-ldconfig + COMPRESS='zstd -f --rm -19 -T0' + COMPRESS_EXT=.zst + /usr/lib/rpm/brp-compress + /usr/lib/rpm/anolis/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/check-rpaths + /usr/lib/rpm/brp-remove-la-files + /usr/lib/rpm/anolis/clean_perl + /usr/lib/rpm/anolis/check_elf_files + /usr/lib/rpm/anolis/brp-mangle-shebangs + /usr/lib/rpm/anolis/remove-info-dir + /usr/lib/rpm/anolis/check-desktop-files + /usr/lib/rpm/anolis/brp-python-bytecompile '' 1 0 + /usr/lib/rpm/anolis/brp-python-hardlink Processing files: libnccl-2.20.5-1.cuda12.1.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.jD6FvV + umask 022 + cd /builddir/build/BUILD + cd nccl-2.20.5-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl = 2.20.5-1.cuda12.1.an23 libnccl(aarch-64) = 2.20.5-1.cuda12.1.an23 libnccl.so.2()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: ld-linux-aarch64.so.1()(64bit) ld-linux-aarch64.so.1(GLIBC_2.17)(64bit) libc.so.6()(64bit) libc.so.6(GLIBC_2.17)(64bit) libc.so.6(GLIBC_2.33)(64bit) libc.so.6(GLIBC_2.34)(64bit) libgcc_s.so.1()(64bit) libgcc_s.so.1(GCC_3.0)(64bit) libstdc++.so.6()(64bit) libstdc++.so.6(CXXABI_1.3)(64bit) libstdc++.so.6(GLIBCXX_3.4)(64bit) libstdc++.so.6(GLIBCXX_3.4.11)(64bit) libstdc++.so.6(GLIBCXX_3.4.19)(64bit) rtld(GNU_HASH) Processing files: libnccl-devel-2.20.5-1.cuda12.1.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.r4sECc + umask 022 + cd /builddir/build/BUILD + cd nccl-2.20.5-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-devel + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-devel + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-devel + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-devel = 2.20.5-1.cuda12.1.an23 libnccl-devel(aarch-64) = 2.20.5-1.cuda12.1.an23 pkgconfig(nccl) = 2.20.5 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: /usr/bin/pkg-config libnccl.so.2()(64bit) Processing files: libnccl-static-2.20.5-1.cuda12.1.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.c4b5d3 + umask 022 + cd /builddir/build/BUILD + cd nccl-2.20.5-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-static + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-static + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64/usr/share/licenses/libnccl-static + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-static = 2.20.5-1.cuda12.1.an23 libnccl-static(aarch-64) = 2.20.5-1.cuda12.1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debugsource-2.20.5-1.cuda12.1.an23.aarch64 Provides: libnccl-debugsource = 2.20.5-1.cuda12.1.an23 libnccl-debugsource(aarch-64) = 2.20.5-1.cuda12.1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debuginfo-2.20.5-1.cuda12.1.an23.aarch64 Provides: debuginfo(build-id) = b95ec84a58296885cbc5c29dab4a548779b7d8a5 libnccl-debuginfo = 2.20.5-1.cuda12.1.an23 libnccl-debuginfo(aarch-64) = 2.20.5-1.cuda12.1.an23 libnccl.so.2.20.5-2.20.5-1.cuda12.1.an23.aarch64.debug()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Recommends: libnccl-debugsource(aarch-64) = 2.20.5-1.cuda12.1.an23 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/libnccl-2.20.5-1.cuda12.1.an23.aarch64 Wrote: /builddir/build/RPMS/libnccl-devel-2.20.5-1.cuda12.1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-debugsource-2.20.5-1.cuda12.1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-debuginfo-2.20.5-1.cuda12.1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-static-2.20.5-1.cuda12.1.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-2.20.5-1.cuda12.1.an23.aarch64.rpm Child return code was: 0