Mock Version: 3.5 Mock Version: 3.5 Mock Version: 3.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl-cuda-11.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-357027-71129/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl-cuda-11.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1709596800 Wrote: /builddir/build/SRPMS/libnccl-cuda-11-2.19.3-2.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl-cuda-11.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-357027-71129/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/libnccl-cuda-11.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1709596800 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.G0xYVy + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf nccl-2.19.3-1 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/nccl-2.19.3-1.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd nccl-2.19.3-1 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + /usr/lib/rpm/rpmuncompress /builddir/build/SOURCES/1000-fix-lib-path-in-nccl.pc.patch + /usr/bin/patch -p1 -s --fuzz=0 --no-backup-if-mismatch -f + /usr/lib/rpm/rpmuncompress /builddir/build/SOURCES/1001-add-allow-unsupported-compiler-to-compat-gcc12.patch + /usr/bin/patch -p1 -s --fuzz=0 --no-backup-if-mismatch -f + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.GmOR83 + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.19.3-1 + export LD_LIBRARY_PATH=/usr/local/cuda-11-8/lib64 + LD_LIBRARY_PATH=/usr/local/cuda-11-8/lib64 + export 'CFLAGS=usr/local/cuda-11-8/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + CFLAGS='usr/local/cuda-11-8/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export PREFIX=/usr + PREFIX=/usr + /usr/bin/make -O -j80 V=1 VERBOSE=1 /usr/bin/make -C src build BUILDDIR=/builddir/build/BUILD/nccl-2.19.3-1/build make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Grabbing include/nccl_net.h > /builddir/build/BUILD/nccl-2.19.3-1/build/include/nccl_net.h mkdir -p /builddir/build/BUILD/nccl-2.19.3-1/build/include install -m 644 include/nccl_net.h /builddir/build/BUILD/nccl-2.19.3-1/build/include/nccl_net.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.19.3-1/build/include Generating nccl.h.in > /builddir/build/BUILD/nccl-2.19.3-1/build/include/nccl.h sed -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/19/g" \ -e "s/\${nccl:Patch}/3/g" \ -e "s/\${nccl:Suffix}//g" \ -e "s/\${nccl:Version}/21903/g" \ nccl.h.in > /builddir/build/BUILD/nccl-2.19.3-1/build/include/nccl.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.19.3-1/build/lib/pkgconfig Generating nccl.pc.in > /builddir/build/BUILD/nccl-2.19.3-1/build/lib/pkgconfig/nccl.pc sed -e 's|${nccl:Prefix}|\/usr|g' \ -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/19/g" \ -e "s/\${nccl:Patch}/3/g" \ nccl.pc.in > /builddir/build/BUILD/nccl-2.19.3-1/build/lib/pkgconfig/nccl.pc make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' /usr/bin/make -C ./device make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling enhcompat.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enhcompat.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enhcompat.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enhcompat.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enhcompat.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' (which python3 >/dev/null || \ (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \ printf "\n${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/gensrc "" make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/trees.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/trees.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/trees.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/trees.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/trees.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/tuner.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/tuner.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/tuner.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/tuner.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/tuner.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/param.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/param.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/param.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/param.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/param.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/cudawrap.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/cudawrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/cudawrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/cudawrap.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/cudawrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' In file included from /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/gensrc/host_table.cc:1: ../include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': ../include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ ../include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': ../include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/ipcsocket.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ipcsocket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ipcsocket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ipcsocket.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ipcsocket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/strongstream.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/strongstream.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/strongstream.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/strongstream.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/strongstream.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/nvmlwrap.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/nvmlwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/nvmlwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/nvmlwrap.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/nvmlwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/socket.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/socket.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/reduce_scatter.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies build/obj/device/gensrc/all_reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling init_nvtx.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init_nvtx.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init_nvtx.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init_nvtx.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init_nvtx.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from init_nvtx.cc:2: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] 10 | }; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/profiler.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/profiler.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/profiler.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/profiler.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/profiler.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/proxy.h:10, from include/profiler.h:10, from misc/profiler.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/proxy.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ misc/profiler.cc: In function 'ncclResult_t ncclProfilingRecord(ncclProxyArgs*, int, int, int)': misc/profiler.cc:113:56: warning: unused parameter 'args' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~~~~~~~~~~~~~~~~~~~^~~~ misc/profiler.cc:113:66: warning: unused parameter 'sub' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~ misc/profiler.cc:113:75: warning: unused parameter 'step' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~ misc/profiler.cc:113:85: warning: unused parameter 'state' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/rings.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/rings.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/rings.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/rings.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/rings.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/rings.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/rings.cc: In function 'ncclResult_t ncclBuildRings(int, int*, int, int, int*, int*)': graph/rings.cc:22:80: warning: unused parameter 'prev' [-Wunused-parameter] 22 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { | ~~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Dependencies src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/argcheck.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/argcheck.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/argcheck.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/argcheck.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/argcheck.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from include/argcheck.h:10, from misc/argcheck.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/nvls.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/nvls.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/nvls.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/nvls.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/nvls.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/nvls.cc:9: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/nvls.cc: In function 'ncclResult_t ncclNvlsSetup(ncclComm*, ncclComm*)': transport/nvls.cc:805:45: warning: unused parameter 'comm' [-Wunused-parameter] 805 | ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:805:68: warning: unused parameter 'parent' [-Wunused-parameter] 805 | ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { | ~~~~~~~~~~~~~~~~~^~~~~~ transport/nvls.cc: In function 'ncclResult_t ncclNvlsFree(ncclComm*)': transport/nvls.cc:809:44: warning: unused parameter 'comm' [-Wunused-parameter] 809 | ncclResult_t ncclNvlsFree(struct ncclComm* comm) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t ncclNvlsGraphRegisterBuffer(ncclComm*, ncclKernelPlan*, const void*, void*, size_t, size_t, bool*, void**, void**)': transport/nvls.cc:813:59: warning: unused parameter 'comm' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:813:88: warning: unused parameter 'plan' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:813:106: warning: unused parameter 'sendbuff' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~~~~~~^~~~~~~~ transport/nvls.cc:813:122: warning: unused parameter 'recvbuff' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~^~~~~~~~ transport/nvls.cc:813:139: warning: unused parameter 'sendbuffSize' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~ transport/nvls.cc:813:160: warning: unused parameter 'recvbuffSize' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~ transport/nvls.cc:813:202: warning: unused parameter 'outRegBufSend' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~~ transport/nvls.cc:813:224: warning: unused parameter 'outRegBufRecv' [-Wunused-parameter] 813 | ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~~ transport/nvls.cc: In function 'ncclResult_t ncclNvlsLocalRegisterBuffer(ncclComm*, const void*, void*, size_t, size_t, bool*, void**, void**)': transport/nvls.cc:818:59: warning: unused parameter 'comm' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:818:77: warning: unused parameter 'sendbuff' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~~~~~~^~~~~~~~ transport/nvls.cc:818:93: warning: unused parameter 'recvbuff' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~^~~~~~~~ transport/nvls.cc:818:110: warning: unused parameter 'sendbuffSize' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~ transport/nvls.cc:818:131: warning: unused parameter 'recvbuffSize' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~ transport/nvls.cc:818:173: warning: unused parameter 'outRegBufSend' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~~ transport/nvls.cc:818:195: warning: unused parameter 'outRegBufRecv' [-Wunused-parameter] 818 | ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { | ~~~~~~~^~~~~~~~~~~~~ transport/nvls.cc: In function 'ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle*, CUdeviceptr, int, size_t)': transport/nvls.cc:823:64: warning: unused parameter 'mcHandler' [-Wunused-parameter] 823 | ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~ transport/nvls.cc:823:87: warning: unused parameter 'ptr' [-Wunused-parameter] 823 | ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { | ~~~~~~~~~~~~^~~ transport/nvls.cc:823:96: warning: unused parameter 'dev' [-Wunused-parameter] 823 | ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { | ~~~~^~~ transport/nvls.cc:823:108: warning: unused parameter 'size' [-Wunused-parameter] 823 | ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { | ~~~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/utils.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/utils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/utils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/utils.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/utils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from misc/utils.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/gdrwrap.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/gdrwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/gdrwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/gdrwrap.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/gdrwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from misc/gdrwrap.cc:10: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/shmutils.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/shmutils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/shmutils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/shmutils.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/shmutils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from misc/shmutils.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling debug.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/debug.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/debug.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c debug.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/debug.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from debug.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/ibvsymbols.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvsymbols.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvsymbols.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvsymbols.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvsymbols.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from misc/ibvsymbols.cc:67: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling misc/ibvwrap.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvwrap.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from include/ibvwrap.h:21, from misc/ibvwrap.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling net.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c net.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ net.cc: In function 'ncclResult_t ncclNetCheckDeviceVersion(ncclComm*, ncclNet_t*, int)': net.cc:292:57: warning: unused parameter 'comm' [-Wunused-parameter] 292 | ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { | ~~~~~~~~~~~~~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect; size_t = long unsigned int]': transport.cc:270:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTransportCollNetSetup(ncclComm*, ncclTopoGraph*, ncclChannel*, int, int, int, int)::; size_t = long unsigned int]': transport.cc:273:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling channel.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/channel.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/channel.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c channel.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/channel.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from channel.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclChannelPeer; size_t = long unsigned int]': channel.cc:29:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclDevChannelPeer*; size_t = long unsigned int]': channel.cc:45:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling group.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/group.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/group.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c group.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/group.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/group.h:11, from group.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPreconnectJob; size_t = long unsigned int]': group.cc:280:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/shm.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/shm.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/shm.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/shm.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/shm.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/shm.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/shm.cc: In function 'ncclResult_t shmCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/shm.cc:50:96: warning: unused parameter 'graph' [-Wunused-parameter] 50 | static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc: In function 'ncclResult_t shmSendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:76:79: warning: unused parameter 'graph' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:76:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:99:79: warning: unused parameter 'graph' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:99:107: warning: unused parameter 'myInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~ transport/shm.cc:99:136: warning: unused parameter 'peerInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/shm.cc:99:211: warning: unused parameter 'channelId' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc:99:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 161 | struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; | ^ transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:135:96: warning: unused parameter 'nranks' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/shm.cc:135:108: warning: unused parameter 'rank' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 195 | struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; | ^ transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:195:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:174:96: warning: unused parameter 'nranks' [-Wunused-parameter] 174 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/shm.cc:174:108: warning: unused parameter 'rank' [-Wunused-parameter] 174 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:227:179: warning: unused parameter 'done' [-Wunused-parameter] 227 | static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:245:179: warning: unused parameter 'done' [-Wunused-parameter] 245 | static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:263:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 263 | static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:278:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 278 | static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmSendResources; size_t = long unsigned int]': transport/shm.cc:78:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmRecvResources; size_t = long unsigned int]': transport/shm.cc:101:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmProxyInfo; size_t = long unsigned int]': transport/shm.cc:229:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/net_socket.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_socket.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net_socket.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketInit(ncclDebugLogger_t)': transport/net_socket.cc:38:50: warning: unused parameter 'logFunction' [-Wunused-parameter] 38 | ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketRegMr(void*, void*, int, int, void**)': transport/net_socket.cc:537:39: warning: unused parameter 'comm' [-Wunused-parameter] 537 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:537:51: warning: unused parameter 'data' [-Wunused-parameter] 537 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:537:61: warning: unused parameter 'size' [-Wunused-parameter] 537 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~^~~~ transport/net_socket.cc:537:84: warning: unused parameter 'mhandle' [-Wunused-parameter] 537 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketDeregMr(void*, void*)': transport/net_socket.cc:540:41: warning: unused parameter 'comm' [-Wunused-parameter] 540 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~ transport/net_socket.cc:540:53: warning: unused parameter 'mhandle' [-Wunused-parameter] 540 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIsend(void*, void*, int, int, void*, void**)': transport/net_socket.cc:542:75: warning: unused parameter 'tag' [-Wunused-parameter] 542 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~^~~ transport/net_socket.cc:542:86: warning: unused parameter 'mhandle' [-Wunused-parameter] 542 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIrecv(void*, int, void**, int*, int*, void**, void**)': transport/net_socket.cc:548:86: warning: unused parameter 'tags' [-Wunused-parameter] 548 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~^~~~ transport/net_socket.cc:548:99: warning: unused parameter 'mhandles' [-Wunused-parameter] 548 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIflush(void*, int, void**, int*, void**, void**)': transport/net_socket.cc:555:40: warning: unused parameter 'recvComm' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~^~~~~~~~ transport/net_socket.cc:555:54: warning: unused parameter 'n' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~^ transport/net_socket.cc:555:64: warning: unused parameter 'data' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~ transport/net_socket.cc:555:75: warning: unused parameter 'sizes' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~^~~~~ transport/net_socket.cc:555:89: warning: unused parameter 'mhandles' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc:555:106: warning: unused parameter 'request' [-Wunused-parameter] 555 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketListenComm; size_t = long unsigned int]': transport/net_socket.cc:293:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketComm; size_t = long unsigned int]': transport/net_socket.cc:322:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': transport/net_socket.cc:372:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketTask; size_t = long unsigned int]': transport/net_socket.cc:434:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/connect.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/connect.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/connect.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/connect.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/connect.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from graph/connect.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/connect.cc: In function 'ncclResult_t connectTrees(ncclComm*, int*, int*, int*, int*)': graph/connect.cc:130:119: warning: unused parameter 'treePatterns' [-Wunused-parameter] 130 | static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { | ~~~~~^~~~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/connect.cc:174:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/tuning.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/tuning.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/tuning.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/tuning.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/tuning.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/tuning.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from graph/tuning.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling collectives.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/collectives.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/collectives.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/collectives.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from include/argcheck.h:10, from collectives.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ collectives.cc: In function 'ncclResult_t ncclAllGather(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t)': collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 19 | }; | ^ collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:19:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 25 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; | ^ collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:25:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclAllReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 42 | }; | ^ collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:42:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 48 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; | ^ collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:48:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 63 | }; | ^ collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:63:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 69 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; | ^ collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:69:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, cudaStream_t)': collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 94 | }; | ^ collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:94:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 100 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; | ^ collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:100:42: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 116 | }; | ^ collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:116:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 122 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; | ^ collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:122:56: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: At global scope: collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 133 | }; | ^ collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives.cc:133:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 144 | 1, 1 }; | ^ collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:144:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 161 | 1, 1 }; | ^ collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives.cc:161:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling bootstrap.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/bootstrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/bootstrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c bootstrap.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/bootstrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from bootstrap.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/bootstrap.h:11, from bootstrap.cc:10: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ bootstrap.cc: In function 'ncclResult_t bootstrapCreateRoot(ncclBootstrapHandle*, bool)': bootstrap.cc:173:75: warning: unused parameter 'idFromEnv' [-Wunused-parameter] 173 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) { | ~~~~~^~~~~~~~~ bootstrap.cc: In function 'ncclResult_t bootstrapInit(ncclBootstrapHandle*, ncclComm*)': bootstrap.cc:239:29: warning: missing initializer for member 'extInfo::nranks' [-Wmissing-field-initializers] 239 | struct extInfo info = { 0 }; | ^ bootstrap.cc:239:29: warning: missing initializer for member 'extInfo::extAddressListenRoot' [-Wmissing-field-initializers] bootstrap.cc:239:29: warning: missing initializer for member 'extInfo::extAddressListen' [-Wmissing-field-initializers] In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocketAddress; size_t = long unsigned int]': bootstrap.cc:111:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': bootstrap.cc:178:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapRootArgs; size_t = long unsigned int]': bootstrap.cc:183:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapState; size_t = long unsigned int]': bootstrap.cc:241:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unexConn; size_t = long unsigned int]': bootstrap.cc:482:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/p2p.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/p2p.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/p2p.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/p2p.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/p2p.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/p2p.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/p2p.cc: In function 'ncclResult_t p2pCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/p2p.cc:103:89: warning: unused parameter 'graph' [-Wunused-parameter] 103 | ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/p2p.cc: In function 'ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc*)': transport/p2p.cc:221:54: warning: unused parameter 'ipcDesc' [-Wunused-parameter] 221 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { | ~~~~~~~~~~~~~^~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pMap(ncclComm*, ncclProxyConnector*, ncclPeerInfo*, ncclPeerInfo*, ncclP2pBuff*, void**, void**)': transport/p2p.cc:290:78: warning: unused parameter 'proxyConn' [-Wunused-parameter] 290 | static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/p2p.cc:400:71: warning: unused parameter 'channelId' [-Wunused-parameter] 400 | struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:450:96: warning: unused parameter 'nranks' [-Wunused-parameter] 450 | static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:488:89: warning: unused parameter 'nranks' [-Wunused-parameter] 488 | ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:607:102: warning: unused parameter 'proxyState' [-Wunused-parameter] 607 | static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:627:104: warning: unused parameter 'proxyState' [-Wunused-parameter] 627 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc:627:150: warning: unused parameter 'respBuff' [-Wunused-parameter] 627 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/p2p.cc:627:164: warning: unused parameter 'respSize' [-Wunused-parameter] 627 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/p2p.cc:627:179: warning: unused parameter 'done' [-Wunused-parameter] 627 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:641:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 641 | static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:673:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 673 | static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pResources; size_t = long unsigned int]': transport/p2p.cc:337:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pShmProxyInfo; size_t = long unsigned int]': transport/p2p.cc:569:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pCuMemProxyInfo; size_t = long unsigned int]': transport/p2p.cc:596:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/paths.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/paths.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/paths.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/paths.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/paths.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/paths.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/paths.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoLinkList; size_t = long unsigned int]': graph/paths.cc:36:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/paths.cc:622:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/paths.cc:623:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/coll_net.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/coll_net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/coll_net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/coll_net.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/coll_net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/coll_net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/coll_net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/coll_net.cc:134:65: warning: unused parameter 'topo' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc:134:93: warning: unused parameter 'graph' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:121: warning: unused parameter 'info1' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:149: warning: unused parameter 'info2' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 151 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:150:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc:150:163: warning: unused parameter 'connectInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 171 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:170:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 170 | static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t sendFree(ncclConnector*)': transport/coll_net.cc:288:52: warning: unused parameter 'send' [-Wunused-parameter] 288 | static ncclResult_t sendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvFree(ncclConnector*)': transport/coll_net.cc:292:52: warning: unused parameter 'recv' [-Wunused-parameter] 292 | static ncclResult_t recvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:296:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 296 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/coll_net.cc:296:159: warning: unused parameter 'respSize' [-Wunused-parameter] 296 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/coll_net.cc:296:174: warning: unused parameter 'done' [-Wunused-parameter] 296 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:404:174: warning: unused parameter 'done' [-Wunused-parameter] 404 | static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:429:176: warning: unused parameter 'done' [-Wunused-parameter] 429 | static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:503:176: warning: unused parameter 'done' [-Wunused-parameter] 503 | static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendResources; size_t = long unsigned int]': transport/coll_net.cc:301:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sharedResources; size_t = long unsigned int]': transport/coll_net.cc:324:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char (*)[128]; size_t = long unsigned int]': transport/coll_net.cc:337:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvResources; size_t = long unsigned int]': transport/coll_net.cc:409:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/coll_net.cc:462:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/net_ib.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_ib.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_ib.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_ib.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_ib.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from transport/net_ib.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from transport/net_ib.cc:10: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net_ib.cc: In function 'ncclResult_t ncclIbInit(ncclDebugLogger_t)': transport/net_ib.cc:159:43: warning: unused parameter 'logFunction' [-Wunused-parameter] 159 | ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbGdrSupport(int)': transport/net_ib.cc:281:35: warning: unused parameter 'ibDev' [-Wunused-parameter] 281 | ncclResult_t ncclIbGdrSupport(int ibDev) { | ~~~~^~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbRegMrDmaBuf(void*, void*, size_t, int, uint64_t, int, void**)': transport/net_ib.cc:917:73: warning: unused parameter 'type' [-Wunused-parameter] 917 | ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { | ~~~~^~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclIbListenComm; size_t = long unsigned int]': transport/net_ib.cc:604:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling proxy.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/proxy.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/proxy.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c proxy.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/proxy.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from proxy.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ proxy.cc: In function 'void ncclDumpProxyState(int)': proxy.cc:800:29: warning: unused parameter 'signal' [-Wunused-parameter] 800 | void ncclDumpProxyState(int signal) { | ~~~~^~~~~~ proxy.cc: In function 'ncclResult_t ncclProxyConnect(ncclComm*, int, int, int, ncclProxyConnector*)': proxy.cc:1066:35: warning: missing initializer for member 'ncclProxyInitReq::send' [-Wmissing-field-initializers] 1066 | struct ncclProxyInitReq req = {0}; | ^ proxy.cc:1066:35: warning: missing initializer for member 'ncclProxyInitReq::tpLocalRank' [-Wmissing-field-initializers] proxy.cc:1066:35: warning: missing initializer for member 'ncclProxyInitReq::tpRank' [-Wmissing-field-initializers] proxy.cc:1066:35: warning: missing initializer for member 'ncclProxyInitReq::sameProcess' [-Wmissing-field-initializers] proxy.cc:1073:37: warning: missing initializer for member 'ncclProxyInitResp::devShmPath' [-Wmissing-field-initializers] 1073 | struct ncclProxyInitResp resp = {0}; | ^ proxy.cc: In function 'ncclResult_t ncclProxyClientGetFdBlocking(ncclComm*, ncclProxyConnector*, void*, int*)': proxy.cc:1099:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1099 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1099:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t ncclPollProxyResponse(ncclComm*, ncclProxyConnector*, void*, void*)': proxy.cc:1169:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::res' [-Wmissing-field-initializers] 1169 | ncclProxyRpcResponseHeader resp = {0}; | ^ proxy.cc:1169:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::respSize' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyGetFd(ncclProxyLocalPeer*, void*, ncclProxyState*, uint64_t)': proxy.cc:1316:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1316 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1316:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclExpectedProxyResponse; size_t = long unsigned int]': proxy.cc:92:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPool; size_t = long unsigned int]': proxy.cc:206:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyConnection; size_t = long unsigned int]': proxy.cc:972:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': proxy.cc:1050:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyOps; size_t = long unsigned int]': proxy.cc:1051:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': proxy.cc:1052:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyAsyncOp; size_t = long unsigned int]': proxy.cc:1395:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': proxy.cc:1403:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyState; size_t = long unsigned int]': proxy.cc:1603:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/topo.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/topo.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/topo.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/topo.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/topo.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/topo.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/topo.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/topo.cc: In function 'ncclResult_t pciPathToInt64(char*, int, int, int64_t*)': graph/topo.cc:31:57: warning: unused parameter 'minOffset' [-Wunused-parameter] 31 | ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { | ~~~~^~~~~~~~~ graph/topo.cc: In function 'ncclResult_t ncclTopoAddGpu(ncclXmlNode*, ncclTopoSystem*, ncclTopoNode*)': graph/topo.cc:366:80: warning: unused parameter 'system' [-Wunused-parameter] 366 | ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/topo.cc:191:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoSystem; size_t = long unsigned int]': graph/topo.cc:577:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/topo.cc:631:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/topo.cc:720:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling transport/net.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/net.cc:144:93: warning: unused parameter 'graph' [-Wunused-parameter] 144 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:174:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 174 | struct setupReq req = { 0 }; | ^ transport/net.cc:174:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:174:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:211:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 211 | struct setupReq req = { 0 }; | ^ transport/net.cc:211:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:211:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t sendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:284:93: warning: unused parameter 'nranks' [-Wunused-parameter] 284 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/net.cc:284:105: warning: unused parameter 'rank' [-Wunused-parameter] 284 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t recvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:381:93: warning: unused parameter 'nranks' [-Wunused-parameter] 381 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/net.cc:381:105: warning: unused parameter 'rank' [-Wunused-parameter] 381 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/net.cc:554:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 554 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = connectMap; size_t = long unsigned int]': transport/net.cc:292:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer*; size_t = long unsigned int]': transport/net.cc:482:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer; size_t = long unsigned int]': transport/net.cc:486:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendNetResources; size_t = long unsigned int]': transport/net.cc:559:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvNetResources; size_t = long unsigned int]': transport/net.cc:592:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetDeviceHandle_v7_t; size_t = long unsigned int]': transport/net.cc:631:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedNetComms; size_t = long unsigned int]': transport/net.cc:662:9: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/net.cc:742:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/xml.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/xml.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/xml.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/xml.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/xml.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/xml.cc:12: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/xml.cc: In function 'ncclResult_t ncclTopoGetXmlFromCpu(ncclXmlNode*, ncclXml*)': graph/xml.cc:374:81: warning: unused parameter 'xml' [-Wunused-parameter] 374 | ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { | ~~~~~~~~~~~~~~~~^~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling graph/search.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/search.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/search.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/search.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/search.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/core.h:39, from graph/search.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/search.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/search.cc: In function 'float getTotalBw(ncclTopoSystem*, ncclTopoNode*)': graph/search.cc:27:48: warning: unused parameter 'system' [-Wunused-parameter] 27 | static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/search.cc:401:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/search.cc:870:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling enqueue.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enqueue.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enqueue.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enqueue.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enqueue.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from enqueue.cc:7: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ enqueue.cc: In function 'ncclResult_t ncclInitKernelsForDevice(int, size_t*)': enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::constSizeBytes' [-Wmissing-field-initializers] 42 | cudaFuncAttributes attr = {0}; | ^ enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::localSizeBytes' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::maxThreadsPerBlock' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::numRegs' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::ptxVersion' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::binaryVersion' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::cacheModeCA' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::maxDynamicSharedSizeBytes' [-Wmissing-field-initializers] enqueue.cc:42:35: warning: missing initializer for member 'cudaFuncAttributes::preferredShmemCarveout' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t addP2pToPlan(ncclComm*, ncclKernelPlan*, int*, bool, int, int, void*, size_t, bool)': enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 298 | }; | ^ enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::sendbuffSize' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::recvbuffSize' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] enqueue.cc:298:3: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::proto' [-Wmissing-field-initializers] 312 | struct ncclWorkElemP2p elem = {0}; | ^ enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::p2pType' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::nWarps' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::warpStart' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::ngroups' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::buffHi32' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::buffLo32' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::countHi32' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::countLo32' [-Wmissing-field-initializers] enqueue.cc:312:35: warning: missing initializer for member 'ncclWorkElemP2p::chunkSize' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t ncclLaunchKernel(ncclComm*, ncclKernelPlan*)': enqueue.cc:1050:41: warning: missing initializer for member 'cudaLaunchConfig_st::blockDim' [-Wmissing-field-initializers] 1050 | cudaLaunchConfig_t launchConfig = {0}; | ^ enqueue.cc:1050:41: warning: missing initializer for member 'cudaLaunchConfig_st::dynamicSmemBytes' [-Wmissing-field-initializers] enqueue.cc:1050:41: warning: missing initializer for member 'cudaLaunchConfig_st::stream' [-Wmissing-field-initializers] enqueue.cc:1050:41: warning: missing initializer for member 'cudaLaunchConfig_st::attrs' [-Wmissing-field-initializers] enqueue.cc:1050:41: warning: missing initializer for member 'cudaLaunchConfig_st::numAttrs' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t ncclLaunchPrepare(ncclComm*)': enqueue.cc:677:35: warning: 'fuseOk' may be used uninitialized [-Wmaybe-uninitialized] 677 | NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk)); | ^ enqueue.cc:630:8: note: 'fuseOk' was declared here 630 | bool fuseOk; | ^~~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Compiling init.cc > /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init.o` g++ -I. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init.cc -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from init.cc:8: include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:356:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 356 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:357:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 357 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:39, from include/info.h:13, from include/graph.h:113, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init.cc: In function 'ncclResult_t commGetSplitInfo(ncclComm*, ncclComm*, int, int, int*, int*, int*)': init.cc:1309:55: warning: unused parameter 'comm' [-Wunused-parameter] 1309 | static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc: At global scope: init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1665 | }; | ^ init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1665:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'ncclResult_t ncclCommInitAll(ncclComm**, int, const int*)': init.cc:1692:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1692 | }; | ^ init.cc:1692:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1692:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1692:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1692:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'const char* ncclGetLastError(ncclComm_t)': init.cc:2116:41: warning: unused parameter 'comm' [-Wunused-parameter] 2116 | const char* ncclGetLastError(ncclComm_t comm) { | ~~~~~~~~~~~^~~~ init.cc: In function 'ncclResult_t ncclCommRegister(ncclComm_t, void*, size_t, void**)': init.cc:2173:48: warning: unused parameter 'comm' [-Wunused-parameter] 2173 | ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc:2173:60: warning: unused parameter 'buff' [-Wunused-parameter] 2173 | ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { | ~~~~~~^~~~ init.cc:2173:73: warning: unused parameter 'size' [-Wunused-parameter] 2173 | ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { | ~~~~~~~^~~~ init.cc:2173:86: warning: unused parameter 'handle' [-Wunused-parameter] 2173 | ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { | ~~~~~~~^~~~~~ init.cc: In function 'ncclResult_t ncclCommDeregister(ncclComm_t, void*)': init.cc:2224:50: warning: unused parameter 'comm' [-Wunused-parameter] 2224 | ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc:2224:62: warning: unused parameter 'handle' [-Wunused-parameter] 2224 | ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { | ~~~~~~^~~~~~ In file included from include/core.h:36: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': init.cc:374:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedResources; size_t = long unsigned int]': init.cc:382:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': init.cc:386:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = collNetTrySetup(ncclComm_t, ncclComm_t, ncclTopoGraph*)::collnetShareInfo; size_t = long unsigned int]': init.cc:601:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCollNetSharedRes; size_t = long unsigned int]': init.cc:657:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned char [4][10]; size_t = long unsigned int]': init.cc:687:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPeerInfo; size_t = long unsigned int]': init.cc:818:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = initTransportsRank(ncclComm*, ncclComm*)::allGatherInfo; size_t = long unsigned int]': init.cc:961:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNodeRanks; size_t = long unsigned int]': init.cc:995:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoRanks*; size_t = long unsigned int]': init.cc:1029:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclComm; size_t = long unsigned int]': init.cc:1626:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned int; size_t = long unsigned int]': init.cc:1628:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommInitRankAsyncJob; size_t = long unsigned int]': init.cc:1635:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommFinalizeAsyncJob; size_t = long unsigned int]': init.cc:1842:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = ncclWork; size_t = long unsigned int]' init.cc:436:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 4168 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z21ncclDevKernel_GenericP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, 353 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 58 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_35' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 60 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 65 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 60 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 60 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '__nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__64a49ce4_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__64a49ce4_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ../include/utils.h:41:13: warning: 'long int log2i(long int)' defined but not used [-Wunused-function] 41 | static long log2i(long n) { | ^~~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 520 bytes stack frame, 400 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 496 bytes stack frame, 336 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 560 bytes stack frame, 456 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 424 bytes stack frame, 496 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 328 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 528 bytes stack frame, 356 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 488 bytes stack frame, 320 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 560 bytes stack frame, 448 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 352 bytes stack frame, 376 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 240 bytes stack frame, 236 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 520 bytes stack frame, 352 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 536 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 592 bytes stack frame, 444 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 416 bytes stack frame, 576 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 592 bytes stack frame, 464 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 324 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 528 bytes stack frame, 408 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 520 bytes stack frame, 396 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 520 bytes stack frame, 396 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 408 bytes stack frame, 464 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 296 bytes stack frame, 332 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 264 bytes stack frame, 280 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 520 bytes stack frame, 360 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 584 bytes stack frame, 460 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 408 bytes stack frame, 588 bytes spill stores, 1008 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 576 bytes stack frame, 452 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 552 bytes stack frame, 416 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 512 bytes stack frame, 336 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 576 bytes stack frame, 456 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 576 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 248 bytes stack frame, 244 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 584 bytes stack frame, 444 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 400 bytes stack frame, 464 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 584 bytes stack frame, 460 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 296 bytes stack frame, 324 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 408 bytes stack frame, 464 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 296 bytes stack frame, 332 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 536 bytes stack frame, 356 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 592 bytes stack frame, 464 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 400 bytes stack frame, 616 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 600 bytes stack frame, 448 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 552 bytes stack frame, 384 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 528 bytes stack frame, 400 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 576 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 408 bytes stack frame, 468 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 584 bytes stack frame, 476 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 332 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 528 bytes stack frame, 424 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 512 bytes stack frame, 368 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 576 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 408 bytes stack frame, 468 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 584 bytes stack frame, 476 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 296 bytes stack frame, 328 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 528 bytes stack frame, 400 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 272 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 496 bytes stack frame, 336 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 560 bytes stack frame, 456 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 416 bytes stack frame, 488 bytes spill stores, 904 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 560 bytes stack frame, 440 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 296 bytes stack frame, 332 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 536 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 408 bytes stack frame, 464 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 296 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 488 bytes stack frame, 320 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 560 bytes stack frame, 448 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 352 bytes stack frame, 376 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 240 bytes stack frame, 236 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 520 bytes stack frame, 352 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 528 bytes stack frame, 396 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 504 bytes stack frame, 336 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 1032 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 560 bytes stack frame, 432 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 296 bytes stack frame, 296 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 536 bytes stack frame, 376 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 264 bytes stack frame, 284 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 536 bytes stack frame, 376 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 256 bytes stack frame, 272 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 528 bytes stack frame, 352 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 576 bytes stack frame, 436 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 368 bytes stack frame, 424 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 288 bytes stack frame, 304 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 560 bytes stack frame, 408 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 528 bytes stack frame, 396 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 520 bytes stack frame, 376 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 520 bytes stack frame, 376 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 504 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 320 bytes stack frame, 332 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 560 bytes stack frame, 448 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 432 bytes stack frame, 620 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 320 bytes stack frame, 348 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 520 bytes stack frame, 392 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 504 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 408 bytes stack frame, 464 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 296 bytes stack frame, 332 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 528 bytes stack frame, 392 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 512 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 408 bytes stack frame, 580 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 296 bytes stack frame, 340 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 528 bytes stack frame, 392 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 512 bytes stack frame, 352 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 576 bytes stack frame, 460 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 416 bytes stack frame, 600 bytes spill stores, 1040 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 560 bytes stack frame, 444 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 400 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 288 bytes stack frame, 308 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 288 bytes stack frame, 308 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 288 bytes stack frame, 308 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 280 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 496 bytes stack frame, 348 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 328 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 568 bytes stack frame, 440 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 496 bytes stack frame, 812 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 368 bytes stack frame, 476 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 528 bytes stack frame, 368 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 544 bytes stack frame, 416 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 528 bytes stack frame, 400 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 272 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 536 bytes stack frame, 412 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 288 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 552 bytes stack frame, 428 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 520 bytes stack frame, 384 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 400 bytes stack frame, 460 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 408 bytes stack frame, 452 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 632 bytes stack frame, 528 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 352 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 552 bytes stack frame, 420 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 504 bytes stack frame, 368 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 280 bytes stack frame, 296 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 488 bytes stack frame, 324 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 328 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 496 bytes stack frame, 780 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 368 bytes stack frame, 480 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 520 bytes stack frame, 364 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 456 bytes stack frame, 672 bytes spill stores, 1144 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 408 bytes stack frame, 480 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 352 bytes stack frame, 424 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 352 bytes stack frame, 424 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 352 bytes stack frame, 424 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 344 bytes stack frame, 400 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 528 bytes stack frame, 400 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 392 bytes stack frame, 472 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 568 bytes stack frame, 440 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 712 bytes stack frame, 1132 bytes spill stores, 1684 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 568 bytes stack frame, 456 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 600 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 368 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 368 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 368 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 600 bytes stack frame, 448 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 472 bytes stack frame, 696 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 600 bytes stack frame, 436 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 408 bytes stack frame, 596 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 552 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 544 bytes stack frame, 420 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 552 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 528 bytes stack frame, 404 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 272 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 584 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 408 bytes stack frame, 644 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 608 bytes stack frame, 480 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 568 bytes stack frame, 460 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 552 bytes stack frame, 400 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 552 bytes stack frame, 400 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 552 bytes stack frame, 400 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 456 bytes stack frame, 676 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 348 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 368 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 600 bytes stack frame, 440 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 424 bytes stack frame, 580 bytes spill stores, 988 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 608 bytes stack frame, 452 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 368 bytes stack frame, 416 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 552 bytes stack frame, 360 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 552 bytes stack frame, 360 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 552 bytes stack frame, 360 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 616 bytes stack frame, 456 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 464 bytes stack frame, 704 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 600 bytes stack frame, 436 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 408 bytes stack frame, 592 bytes spill stores, 980 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 552 bytes stack frame, 396 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 464 bytes stack frame, 676 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 136 bytes stack frame, 244 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 272 bytes stack frame, 292 bytes spill stores, 336 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 136 bytes stack frame, 244 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 272 bytes stack frame, 292 bytes spill stores, 336 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 136 bytes stack frame, 244 bytes spill stores, 268 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 272 bytes stack frame, 292 bytes spill stores, 336 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 120 bytes stack frame, 188 bytes spill stores, 228 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 28 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 264 bytes stack frame, 260 bytes spill stores, 292 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 136 bytes stack frame, 216 bytes spill stores, 284 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 304 bytes stack frame, 352 bytes spill stores, 428 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 264 bytes stack frame, 712 bytes spill stores, 1024 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 400 bytes stack frame, 784 bytes spill stores, 1056 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z22ncclDevKernel_SendRecvP11ncclDevCommmP8ncclWork 192 bytes stack frame, 608 bytes spill stores, 780 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 408 bytes stack frame, 792 bytes spill stores, 1028 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 464 bytes stack frame, 672 bytes spill stores, 1144 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 616 bytes stack frame, 504 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 392 bytes stack frame, 508 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 464 bytes stack frame, 700 bytes spill stores, 1172 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 400 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 312 bytes stack frame, 348 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 552 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 616 bytes stack frame, 456 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 488 bytes stack frame, 740 bytes spill stores, 1288 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 616 bytes stack frame, 460 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 432 bytes stack frame, 652 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 464 bytes stack frame, 692 bytes spill stores, 1168 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 400 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 544 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 544 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 544 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 536 bytes stack frame, 352 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 592 bytes stack frame, 436 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 456 bytes stack frame, 680 bytes spill stores, 1152 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 592 bytes stack frame, 428 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 392 bytes stack frame, 464 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 408 bytes stack frame, 452 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 632 bytes stack frame, 528 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 352 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 552 bytes stack frame, 420 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 288 bytes stack frame, 312 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 496 bytes stack frame, 348 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 328 bytes stack frame, 360 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 568 bytes stack frame, 464 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 512 bytes stack frame, 788 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 584 bytes stack frame, 460 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 368 bytes stack frame, 596 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 544 bytes stack frame, 408 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 404 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 404 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 352 bytes stack frame, 404 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 384 bytes stack frame, 436 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 472 bytes stack frame, 720 bytes spill stores, 1180 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 616 bytes stack frame, 496 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 424 bytes stack frame, 568 bytes spill stores, 940 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 368 bytes stack frame, 416 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 464 bytes stack frame, 700 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 616 bytes stack frame, 504 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 408 bytes stack frame, 540 bytes spill stores, 904 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 464 bytes stack frame, 676 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 456 bytes stack frame, 676 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 592 bytes stack frame, 444 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 368 bytes stack frame, 408 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 496 bytes stack frame, 684 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 592 bytes stack frame, 444 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 400 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 272 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 584 bytes stack frame, 464 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 264 bytes stack frame, 272 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 368 bytes stack frame, 408 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 472 bytes stack frame, 708 bytes spill stores, 1132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 592 bytes stack frame, 444 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 392 bytes stack frame, 536 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 552 bytes stack frame, 368 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 600 bytes stack frame, 448 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 440 bytes stack frame, 668 bytes spill stores, 1096 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 608 bytes stack frame, 452 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 384 bytes stack frame, 436 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 560 bytes stack frame, 416 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 632 bytes stack frame, 512 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 432 bytes stack frame, 568 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 74 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 272 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 584 bytes stack frame, 464 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 264 bytes stack frame, 272 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 528 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 528 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 528 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 328 bytes stack frame, 388 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 392 bytes stack frame, 448 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 592 bytes stack frame, 448 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 536 bytes stack frame, 808 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 576 bytes stack frame, 428 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 496 bytes stack frame, 776 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 560 bytes stack frame, 416 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 632 bytes stack frame, 512 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 432 bytes stack frame, 568 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 328 bytes stack frame, 392 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 384 bytes stack frame, 436 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 592 bytes stack frame, 448 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 536 bytes stack frame, 824 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 584 bytes stack frame, 436 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 488 bytes stack frame, 808 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 560 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 560 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 560 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 392 bytes stack frame, 536 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 448 bytes stack frame, 684 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 600 bytes stack frame, 456 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 760 bytes stack frame, 1148 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 600 bytes stack frame, 452 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 504 bytes stack frame, 804 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 568 bytes stack frame, 416 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 560 bytes stack frame, 400 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 568 bytes stack frame, 416 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 552 bytes stack frame, 400 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 616 bytes stack frame, 480 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 456 bytes stack frame, 680 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 624 bytes stack frame, 484 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 376 bytes stack frame, 428 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 568 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 320 bytes stack frame, 356 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 320 bytes stack frame, 356 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 320 bytes stack frame, 356 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 328 bytes stack frame, 348 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 376 bytes stack frame, 408 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 560 bytes stack frame, 840 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 408 bytes stack frame, 536 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 424 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 328 bytes stack frame, 348 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 368 bytes stack frame, 400 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 552 bytes stack frame, 840 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 592 bytes stack frame, 480 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 408 bytes stack frame, 536 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 576 bytes stack frame, 436 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 408 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 512 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 584 bytes stack frame, 464 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 408 bytes stack frame, 644 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 608 bytes stack frame, 480 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 568 bytes stack frame, 460 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 616 bytes stack frame, 480 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 488 bytes stack frame, 776 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 624 bytes stack frame, 492 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 440 bytes stack frame, 700 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 600 bytes stack frame, 456 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 536 bytes stack frame, 848 bytes spill stores, 1492 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 512 bytes stack frame, 820 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 552 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 296 bytes stack frame, 336 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 432 bytes stack frame, 568 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 376 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 336 bytes stack frame, 388 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 352 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 384 bytes stack frame, 436 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 448 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 504 bytes stack frame, 956 bytes spill stores, 1880 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 440 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 448 bytes stack frame, 868 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 400 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 536 bytes stack frame, 424 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 536 bytes stack frame, 424 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 536 bytes stack frame, 424 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 232 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 272 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 576 bytes stack frame, 496 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 400 bytes stack frame, 460 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 264 bytes stack frame, 276 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 560 bytes stack frame, 436 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 560 bytes stack frame, 420 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 296 bytes stack frame, 336 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 360 bytes stack frame, 400 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 432 bytes stack frame, 568 bytes spill stores, 964 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 608 bytes stack frame, 472 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 368 bytes stack frame, 420 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 560 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 424 bytes stack frame, 572 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 416 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 416 bytes stack frame, 496 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 624 bytes stack frame, 488 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 568 bytes stack frame, 824 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 616 bytes stack frame, 476 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 512 bytes stack frame, 784 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 512 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 472 bytes stack frame, 612 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 416 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 416 bytes stack frame, 496 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 616 bytes stack frame, 480 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 576 bytes stack frame, 824 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 616 bytes stack frame, 476 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 512 bytes stack frame, 784 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 568 bytes stack frame, 428 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 360 bytes stack frame, 404 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 608 bytes stack frame, 468 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 488 bytes stack frame, 776 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 624 bytes stack frame, 492 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 440 bytes stack frame, 700 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 320 bytes stack frame, 344 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 376 bytes stack frame, 392 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 432 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 496 bytes stack frame, 748 bytes spill stores, 1380 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 424 bytes stack frame, 240 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 424 bytes stack frame, 696 bytes spill stores, 1276 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 400 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 560 bytes stack frame, 404 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 552 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 360 bytes stack frame, 408 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 616 bytes stack frame, 476 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 456 bytes stack frame, 680 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 624 bytes stack frame, 484 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 376 bytes stack frame, 428 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 568 bytes stack frame, 412 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 416 bytes stack frame, 776 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 544 bytes stack frame, 392 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 364 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1336 bytes spill stores, 1720 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 416 bytes stack frame, 776 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 364 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1336 bytes spill stores, 1720 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 416 bytes stack frame, 776 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 336 bytes stack frame, 536 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 364 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1336 bytes spill stores, 1720 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 400 bytes stack frame, 772 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 520 bytes stack frame, 328 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 312 bytes stack frame, 348 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 544 bytes stack frame, 356 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1360 bytes spill stores, 1904 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 432 bytes stack frame, 876 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 584 bytes stack frame, 420 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 600 bytes stack frame, 428 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1372 bytes spill stores, 1920 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 592 bytes stack frame, 1404 bytes spill stores, 2056 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 584 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 376 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 488 bytes stack frame, 840 bytes spill stores, 1904 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 600 bytes stack frame, 428 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 200 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1676 bytes spill stores, 2560 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 400 bytes stack frame, 516 bytes spill stores, 1508 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 600 bytes stack frame, 1396 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 528 bytes stack frame, 352 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 352 bytes stack frame, 516 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 424 bytes stack frame, 756 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 520 bytes stack frame, 356 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 688 bytes stack frame, 1620 bytes spill stores, 2356 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 404 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 416 bytes stack frame, 852 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1244 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 416 bytes stack frame, 852 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1244 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 416 bytes stack frame, 852 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1244 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 408 bytes stack frame, 812 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 352 bytes stack frame, 504 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1228 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 440 bytes stack frame, 944 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 584 bytes stack frame, 428 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1352 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 592 bytes stack frame, 1380 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 752 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 464 bytes stack frame, 820 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 576 bytes stack frame, 416 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1600 bytes spill stores, 2512 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 500 bytes spill stores, 1336 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 592 bytes stack frame, 1360 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 496 bytes stack frame, 296 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 368 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 416 bytes stack frame, 656 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 520 bytes stack frame, 356 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 688 bytes stack frame, 1600 bytes spill stores, 2340 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 372 bytes spill stores, 784 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 408 bytes stack frame, 820 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 336 bytes stack frame, 524 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1396 bytes spill stores, 1808 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 324 bytes spill stores, 660 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 408 bytes stack frame, 820 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 336 bytes stack frame, 524 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1396 bytes spill stores, 1808 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 324 bytes spill stores, 660 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 408 bytes stack frame, 820 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 544 bytes stack frame, 384 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 336 bytes stack frame, 524 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1396 bytes spill stores, 1808 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 324 bytes spill stores, 660 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 400 bytes stack frame, 752 bytes spill stores, 1060 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 520 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 336 bytes stack frame, 476 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 312 bytes stack frame, 348 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 544 bytes stack frame, 368 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1352 bytes spill stores, 1860 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 440 bytes stack frame, 924 bytes spill stores, 1384 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 600 bytes stack frame, 436 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 384 bytes stack frame, 436 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 600 bytes stack frame, 448 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1436 bytes spill stores, 2092 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 544 bytes stack frame, 1400 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 608 bytes stack frame, 444 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 528 bytes stack frame, 948 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 600 bytes stack frame, 452 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 760 bytes stack frame, 1884 bytes spill stores, 3068 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 416 bytes stack frame, 768 bytes spill stores, 2272 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 536 bytes stack frame, 1348 bytes spill stores, 1932 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 504 bytes stack frame, 300 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 352 bytes stack frame, 508 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 480 bytes stack frame, 880 bytes spill stores, 1936 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 752 bytes stack frame, 1824 bytes spill stores, 2848 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 1008 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 424 bytes stack frame, 860 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 336 bytes stack frame, 468 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1392 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 600 bytes stack frame, 472 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1496 bytes spill stores, 2408 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 592 bytes stack frame, 1328 bytes spill stores, 1928 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 432 bytes stack frame, 512 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 576 bytes stack frame, 444 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 1960 bytes spill stores, 2956 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 1044 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 592 bytes stack frame, 1340 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 496 bytes stack frame, 316 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 368 bytes stack frame, 428 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 520 bytes stack frame, 364 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 1908 bytes spill stores, 2756 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1408 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1408 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1408 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 424 bytes stack frame, 860 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 512 bytes stack frame, 320 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 340 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1384 bytes spill stores, 2068 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 608 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 360 bytes stack frame, 516 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 592 bytes stack frame, 440 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1524 bytes spill stores, 2428 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 592 bytes stack frame, 1356 bytes spill stores, 1980 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 608 bytes stack frame, 468 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 456 bytes stack frame, 704 bytes spill stores, 1484 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 600 bytes stack frame, 444 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 200 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 824 bytes stack frame, 2048 bytes spill stores, 3028 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 416 bytes spill stores, 1132 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 592 bytes stack frame, 1428 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 504 bytes stack frame, 316 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 352 bytes stack frame, 516 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 408 bytes stack frame, 612 bytes spill stores, 1408 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 512 bytes stack frame, 352 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 832 bytes stack frame, 2196 bytes spill stores, 3028 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 756 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 560 bytes stack frame, 420 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 300 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 560 bytes stack frame, 420 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 300 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 560 bytes stack frame, 420 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 544 bytes stack frame, 376 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 300 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 400 bytes stack frame, 804 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 552 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1288 bytes spill stores, 1816 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 232 bytes stack frame, 260 bytes spill stores, 312 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 440 bytes stack frame, 964 bytes spill stores, 1488 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 368 bytes stack frame, 540 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 384 bytes stack frame, 444 bytes spill stores, 928 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 592 bytes stack frame, 440 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1412 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 436 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 592 bytes stack frame, 1456 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 608 bytes stack frame, 452 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 384 bytes stack frame, 556 bytes spill stores, 752 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 480 bytes stack frame, 876 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 592 bytes stack frame, 444 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 736 bytes stack frame, 1840 bytes spill stores, 2828 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 416 bytes stack frame, 760 bytes spill stores, 2264 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 584 bytes stack frame, 1432 bytes spill stores, 1964 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 536 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 360 bytes stack frame, 548 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 448 bytes stack frame, 824 bytes spill stores, 1608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 744 bytes stack frame, 1760 bytes spill stores, 2572 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 964 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 408 bytes stack frame, 752 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1332 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 408 bytes stack frame, 752 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1332 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 408 bytes stack frame, 752 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1332 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 400 bytes stack frame, 756 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 336 bytes stack frame, 476 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1340 bytes spill stores, 1848 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 440 bytes stack frame, 864 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 568 bytes stack frame, 412 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1616 bytes spill stores, 2512 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 592 bytes stack frame, 1356 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 576 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 472 bytes stack frame, 832 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 584 bytes stack frame, 460 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 784 bytes stack frame, 1904 bytes spill stores, 3000 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 392 bytes stack frame, 516 bytes spill stores, 1508 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 584 bytes stack frame, 1336 bytes spill stores, 1864 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 432 bytes stack frame, 592 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 760 bytes stack frame, 1800 bytes spill stores, 2708 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 780 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 408 bytes stack frame, 764 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2060 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 408 bytes stack frame, 764 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2060 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 408 bytes stack frame, 764 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2060 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 400 bytes stack frame, 744 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 528 bytes stack frame, 368 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 544 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1368 bytes spill stores, 1924 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 432 bytes stack frame, 860 bytes spill stores, 1332 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 576 bytes stack frame, 420 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 360 bytes stack frame, 520 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 600 bytes stack frame, 448 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1664 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 616 bytes stack frame, 1416 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 376 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 496 bytes stack frame, 888 bytes spill stores, 1948 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 584 bytes stack frame, 424 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 200 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 1920 bytes spill stores, 2980 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 408 bytes stack frame, 516 bytes spill stores, 1508 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 608 bytes stack frame, 1432 bytes spill stores, 1960 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 520 bytes stack frame, 356 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 352 bytes stack frame, 516 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 448 bytes stack frame, 828 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 340 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 1856 bytes spill stores, 2756 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 380 bytes spill stores, 900 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1148 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1324 bytes spill stores, 1840 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 440 bytes stack frame, 948 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 576 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1392 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 632 bytes stack frame, 1540 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 472 bytes stack frame, 812 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 592 bytes stack frame, 472 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1648 bytes spill stores, 2532 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 468 bytes spill stores, 1204 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 616 bytes stack frame, 1504 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 496 bytes stack frame, 292 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 360 bytes stack frame, 548 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 392 bytes stack frame, 568 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1608 bytes spill stores, 2364 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 360 bytes spill stores, 760 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 416 bytes stack frame, 792 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 552 bytes stack frame, 408 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1276 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 416 bytes stack frame, 792 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 552 bytes stack frame, 408 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1276 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 416 bytes stack frame, 792 bytes spill stores, 1284 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 552 bytes stack frame, 408 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1276 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 400 bytes stack frame, 772 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 496 bytes stack frame, 312 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1260 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 440 bytes stack frame, 948 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1448 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 592 bytes stack frame, 1452 bytes spill stores, 1964 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 472 bytes stack frame, 812 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 592 bytes stack frame, 472 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1636 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 468 bytes spill stores, 1204 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 568 bytes stack frame, 1392 bytes spill stores, 1880 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 496 bytes stack frame, 304 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 360 bytes stack frame, 548 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 400 bytes stack frame, 576 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1632 bytes spill stores, 2436 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 360 bytes spill stores, 760 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 408 bytes stack frame, 756 bytes spill stores, 1240 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 408 bytes stack frame, 756 bytes spill stores, 1240 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 408 bytes stack frame, 756 bytes spill stores, 1240 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 328 bytes stack frame, 532 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 544 bytes stack frame, 412 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 256 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 400 bytes stack frame, 752 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 336 bytes stack frame, 476 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 312 bytes stack frame, 348 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1340 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 304 bytes spill stores, 512 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 432 bytes stack frame, 876 bytes spill stores, 1408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 568 bytes stack frame, 412 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 600 bytes stack frame, 464 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1600 bytes spill stores, 2476 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 300 bytes spill stores, 520 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 616 bytes stack frame, 1428 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 576 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 472 bytes stack frame, 820 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 592 bytes stack frame, 468 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 2044 bytes spill stores, 3108 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 392 bytes stack frame, 516 bytes spill stores, 1356 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 248 bytes spill stores, 460 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 608 bytes stack frame, 1444 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 512 bytes stack frame, 348 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 392 bytes stack frame, 532 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 2024 bytes spill stores, 2916 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 772 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 260 bytes spill stores, 456 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 432 bytes stack frame, 856 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 520 bytes stack frame, 368 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 336 bytes stack frame, 532 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 368 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1412 bytes spill stores, 1996 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 572 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 256 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 424 bytes stack frame, 860 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 336 bytes stack frame, 468 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 312 bytes stack frame, 344 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1392 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 304 bytes spill stores, 512 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 352 bytes stack frame, 512 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 600 bytes stack frame, 472 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1496 bytes spill stores, 2408 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 300 bytes spill stores, 520 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 592 bytes stack frame, 1328 bytes spill stores, 1928 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 432 bytes stack frame, 512 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 576 bytes stack frame, 444 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 1960 bytes spill stores, 2956 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 1044 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 248 bytes spill stores, 460 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 592 bytes stack frame, 1340 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 496 bytes stack frame, 316 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 368 bytes stack frame, 428 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 520 bytes stack frame, 364 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 1908 bytes spill stores, 2756 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 416 bytes stack frame, 788 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1356 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 416 bytes stack frame, 788 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1356 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 416 bytes stack frame, 788 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1356 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 400 bytes stack frame, 776 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 352 bytes stack frame, 500 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1324 bytes spill stores, 1820 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 440 bytes stack frame, 956 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 584 bytes stack frame, 432 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 608 bytes stack frame, 480 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1456 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 624 bytes stack frame, 1568 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 752 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 472 bytes stack frame, 824 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 608 bytes stack frame, 484 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1636 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 500 bytes spill stores, 1336 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 616 bytes stack frame, 1524 bytes spill stores, 2120 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 520 bytes stack frame, 356 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 368 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 416 bytes stack frame, 760 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 664 bytes stack frame, 1600 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 772 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 416 bytes stack frame, 796 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 520 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1340 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 416 bytes stack frame, 796 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 520 bytes stack frame, 372 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1340 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 416 bytes stack frame, 796 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 520 bytes stack frame, 372 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 352 bytes stack frame, 564 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1340 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 400 bytes stack frame, 780 bytes spill stores, 1168 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 352 bytes stack frame, 500 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1320 bytes spill stores, 1816 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 440 bytes stack frame, 956 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 584 bytes stack frame, 432 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 608 bytes stack frame, 480 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1452 bytes spill stores, 1872 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 600 bytes stack frame, 1480 bytes spill stores, 1976 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 752 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 472 bytes stack frame, 824 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 608 bytes stack frame, 484 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1680 bytes spill stores, 2572 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 500 bytes spill stores, 1336 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 576 bytes stack frame, 1412 bytes spill stores, 1900 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 496 bytes stack frame, 324 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 368 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 408 bytes stack frame, 752 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1612 bytes spill stores, 2340 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 772 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 408 bytes stack frame, 788 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 408 bytes stack frame, 788 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 408 bytes stack frame, 788 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 344 bytes stack frame, 552 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 280 bytes spill stores, 468 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 400 bytes stack frame, 772 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1260 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 176 bytes stack frame, 324 bytes spill stores, 524 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 440 bytes stack frame, 948 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1468 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 176 bytes stack frame, 324 bytes spill stores, 524 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 600 bytes stack frame, 1456 bytes spill stores, 1948 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 472 bytes stack frame, 812 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 584 bytes stack frame, 456 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1616 bytes spill stores, 2512 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 500 bytes spill stores, 1336 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 272 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 576 bytes stack frame, 1400 bytes spill stores, 1880 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 496 bytes stack frame, 304 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 360 bytes stack frame, 548 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 400 bytes stack frame, 576 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1628 bytes spill stores, 2416 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 360 bytes spill stores, 760 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 408 bytes stack frame, 848 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 408 bytes stack frame, 848 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 408 bytes stack frame, 848 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 352 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1260 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 440 bytes stack frame, 968 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 608 bytes stack frame, 476 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1468 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 692 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 600 bytes stack frame, 1420 bytes spill stores, 1896 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 480 bytes stack frame, 852 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 592 bytes stack frame, 456 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1708 bytes spill stores, 2640 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 560 bytes spill stores, 1412 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 584 bytes stack frame, 1364 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 496 bytes stack frame, 312 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 368 bytes stack frame, 560 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 432 bytes stack frame, 712 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1680 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 476 bytes spill stores, 1340 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 416 bytes stack frame, 784 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 528 bytes stack frame, 376 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 152 bytes stack frame, 280 bytes spill stores, 468 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1148 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 344 bytes stack frame, 496 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1324 bytes spill stores, 1840 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 176 bytes stack frame, 324 bytes spill stores, 524 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 440 bytes stack frame, 948 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 576 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1392 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 176 bytes stack frame, 324 bytes spill stores, 524 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 632 bytes stack frame, 1540 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 472 bytes stack frame, 812 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 592 bytes stack frame, 472 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1648 bytes spill stores, 2532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 468 bytes spill stores, 1204 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 272 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 616 bytes stack frame, 1504 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 496 bytes stack frame, 292 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 360 bytes stack frame, 548 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 392 bytes stack frame, 568 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1608 bytes spill stores, 2364 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 360 bytes spill stores, 760 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 408 bytes stack frame, 860 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 320 bytes stack frame, 372 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 408 bytes stack frame, 860 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 320 bytes stack frame, 372 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 408 bytes stack frame, 860 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 520 bytes stack frame, 364 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 320 bytes stack frame, 372 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1352 bytes spill stores, 1724 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 552 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 504 bytes stack frame, 320 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 352 bytes stack frame, 500 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1260 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 440 bytes stack frame, 968 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 376 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 608 bytes stack frame, 476 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1468 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 692 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 600 bytes stack frame, 1412 bytes spill stores, 1880 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 480 bytes stack frame, 844 bytes spill stores, 1740 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 592 bytes stack frame, 456 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 696 bytes stack frame, 1672 bytes spill stores, 2568 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 384 bytes stack frame, 552 bytes spill stores, 1404 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 584 bytes stack frame, 1364 bytes spill stores, 1824 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 496 bytes stack frame, 312 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 368 bytes stack frame, 560 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 416 bytes stack frame, 692 bytes spill stores, 1432 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1664 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 420 bytes spill stores, 1180 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 488 bytes stack frame, 1092 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 408 bytes stack frame, 812 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 528 bytes stack frame, 388 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 552 bytes stack frame, 420 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1356 bytes spill stores, 1720 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 488 bytes stack frame, 1092 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 528 bytes stack frame, 1148 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 608 bytes stack frame, 476 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1316 bytes spill stores, 1844 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 440 bytes stack frame, 928 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 624 bytes stack frame, 508 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1408 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 608 bytes stack frame, 1568 bytes spill stores, 2232 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 376 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 456 bytes stack frame, 672 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1692 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 984 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 584 bytes stack frame, 1436 bytes spill stores, 2028 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 488 bytes stack frame, 296 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 416 bytes stack frame, 516 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1724 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 408 bytes stack frame, 816 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 544 bytes stack frame, 428 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 384 bytes stack frame, 516 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 452 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1536 bytes spill stores, 2120 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 328 bytes spill stores, 636 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 408 bytes stack frame, 816 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 544 bytes stack frame, 432 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 384 bytes stack frame, 516 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 452 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1536 bytes spill stores, 2120 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 328 bytes spill stores, 636 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 408 bytes stack frame, 816 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 544 bytes stack frame, 432 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 384 bytes stack frame, 516 bytes spill stores, 692 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 368 bytes stack frame, 452 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1536 bytes spill stores, 2120 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 296 bytes stack frame, 328 bytes spill stores, 636 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 400 bytes stack frame, 804 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 520 bytes stack frame, 380 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 376 bytes stack frame, 484 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 360 bytes stack frame, 424 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1384 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 336 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 432 bytes stack frame, 864 bytes spill stores, 1332 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 576 bytes stack frame, 412 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 400 bytes stack frame, 548 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 408 bytes stack frame, 476 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 608 bytes stack frame, 464 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 224 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1624 bytes spill stores, 2528 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 360 bytes spill stores, 700 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 616 bytes stack frame, 1456 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 576 bytes stack frame, 424 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 400 bytes stack frame, 556 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 520 bytes stack frame, 976 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 232 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 2220 bytes spill stores, 3404 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 1420 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 608 bytes stack frame, 1500 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 536 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 408 bytes stack frame, 520 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 464 bytes stack frame, 864 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 536 bytes stack frame, 388 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 232 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 2116 bytes spill stores, 3176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 396 bytes spill stores, 1048 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 416 bytes stack frame, 816 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 544 bytes stack frame, 396 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 416 bytes stack frame, 816 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 544 bytes stack frame, 396 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 416 bytes stack frame, 816 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 544 bytes stack frame, 396 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 352 bytes stack frame, 412 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1516 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 400 bytes stack frame, 748 bytes spill stores, 1096 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 520 bytes stack frame, 336 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 472 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 320 bytes stack frame, 356 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1352 bytes spill stores, 1868 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 316 bytes spill stores, 624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 432 bytes stack frame, 832 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 576 bytes stack frame, 420 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 376 bytes stack frame, 512 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 384 bytes stack frame, 444 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 608 bytes stack frame, 464 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 200 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1616 bytes spill stores, 2492 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 364 bytes spill stores, 708 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 616 bytes stack frame, 1460 bytes spill stores, 2148 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 576 bytes stack frame, 424 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 376 bytes stack frame, 528 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 488 bytes stack frame, 908 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 600 bytes stack frame, 468 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 208 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 800 bytes stack frame, 2140 bytes spill stores, 3280 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 508 bytes spill stores, 1392 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 608 bytes stack frame, 1476 bytes spill stores, 2032 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 536 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 384 bytes stack frame, 540 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 448 bytes stack frame, 736 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 216 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 2036 bytes spill stores, 3052 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 360 bytes stack frame, 692 bytes spill stores, 1908 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 480 bytes stack frame, 1056 bytes spill stores, 1612 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 616 bytes stack frame, 508 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1400 bytes spill stores, 1800 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 456 bytes stack frame, 936 bytes spill stores, 1408 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 528 bytes stack frame, 380 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 320 bytes stack frame, 364 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 552 bytes stack frame, 412 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1276 bytes spill stores, 1688 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 480 bytes stack frame, 1056 bytes spill stores, 1612 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 616 bytes stack frame, 508 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1316 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 520 bytes stack frame, 1116 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 616 bytes stack frame, 512 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 344 bytes stack frame, 480 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 536 bytes stack frame, 368 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1240 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 480 bytes stack frame, 1036 bytes spill stores, 1548 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 592 bytes stack frame, 432 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 368 bytes stack frame, 528 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 616 bytes stack frame, 504 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1396 bytes spill stores, 1856 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 648 bytes stack frame, 1676 bytes spill stores, 2464 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 376 bytes stack frame, 544 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 488 bytes stack frame, 876 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 616 bytes stack frame, 508 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1700 bytes spill stores, 2748 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 436 bytes spill stores, 1156 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 624 bytes stack frame, 1584 bytes spill stores, 2276 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 432 bytes stack frame, 680 bytes spill stores, 1384 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1732 bytes spill stores, 2624 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 800 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 528 bytes stack frame, 1216 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 552 bytes stack frame, 404 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 408 bytes stack frame, 596 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1504 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 956 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 528 bytes stack frame, 1216 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 552 bytes stack frame, 404 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 408 bytes stack frame, 596 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1504 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 956 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 528 bytes stack frame, 1216 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 552 bytes stack frame, 404 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 408 bytes stack frame, 596 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 536 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1504 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 956 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 544 bytes stack frame, 1192 bytes spill stores, 1716 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 528 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 344 bytes stack frame, 488 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 400 bytes stack frame, 492 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 544 bytes stack frame, 404 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1460 bytes spill stores, 2072 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 948 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 560 bytes stack frame, 1372 bytes spill stores, 2008 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 584 bytes stack frame, 428 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 464 bytes stack frame, 780 bytes spill stores, 1592 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 576 bytes stack frame, 440 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1612 bytes spill stores, 2508 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 376 bytes stack frame, 500 bytes spill stores, 1116 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1120 bytes stack frame, 3052 bytes spill stores, 4020 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 592 bytes stack frame, 432 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 376 bytes stack frame, 556 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 776 bytes stack frame, 1268 bytes spill stores, 2560 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 576 bytes stack frame, 432 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 864 bytes stack frame, 3240 bytes spill stores, 5836 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 696 bytes stack frame, 1352 bytes spill stores, 2968 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1160 bytes stack frame, 3536 bytes spill stores, 4352 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 536 bytes stack frame, 364 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 360 bytes stack frame, 536 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 712 bytes stack frame, 1172 bytes spill stores, 2216 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 992 bytes stack frame, 4092 bytes spill stores, 6660 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1240 bytes spill stores, 2456 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 488 bytes stack frame, 1092 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 408 bytes stack frame, 812 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 528 bytes stack frame, 388 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 552 bytes stack frame, 420 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1356 bytes spill stores, 1720 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 488 bytes stack frame, 1092 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 408 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 536 bytes stack frame, 404 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1348 bytes spill stores, 1700 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 272 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 528 bytes stack frame, 1148 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 608 bytes stack frame, 476 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 344 bytes stack frame, 392 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1316 bytes spill stores, 1844 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 308 bytes spill stores, 568 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 440 bytes stack frame, 928 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 624 bytes stack frame, 508 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1408 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 352 bytes spill stores, 688 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 320 bytes spill stores, 560 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 608 bytes stack frame, 1568 bytes spill stores, 2232 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 376 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 456 bytes stack frame, 672 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1692 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 984 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 272 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 584 bytes stack frame, 1436 bytes spill stores, 2028 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 488 bytes stack frame, 296 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 416 bytes stack frame, 516 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1724 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 472 bytes stack frame, 1024 bytes spill stores, 1596 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 616 bytes stack frame, 508 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1392 bytes spill stores, 1676 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 472 bytes stack frame, 1036 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1392 bytes spill stores, 1676 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 472 bytes stack frame, 1036 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 616 bytes stack frame, 508 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 836 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 536 bytes stack frame, 400 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1392 bytes spill stores, 1676 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 448 bytes stack frame, 1016 bytes spill stores, 1488 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 608 bytes stack frame, 472 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 352 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1272 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 488 bytes stack frame, 1124 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 664 bytes stack frame, 556 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 384 bytes stack frame, 440 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1416 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 748 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 584 bytes stack frame, 1540 bytes spill stores, 2440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 520 bytes stack frame, 988 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 600 bytes stack frame, 464 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 2016 bytes spill stores, 3084 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 416 bytes stack frame, 804 bytes spill stores, 2328 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 552 bytes stack frame, 1520 bytes spill stores, 2232 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 488 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 472 bytes stack frame, 928 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 520 bytes stack frame, 368 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 824 bytes stack frame, 2092 bytes spill stores, 3028 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 408 bytes spill stores, 1056 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 296 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 576 bytes stack frame, 1328 bytes spill stores, 2004 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 512 bytes stack frame, 356 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 488 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 520 bytes stack frame, 360 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1568 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 296 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 576 bytes stack frame, 1328 bytes spill stores, 2004 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 512 bytes stack frame, 356 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 488 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 520 bytes stack frame, 360 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1568 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 296 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 576 bytes stack frame, 1328 bytes spill stores, 2004 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 512 bytes stack frame, 356 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 352 bytes stack frame, 488 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 520 bytes stack frame, 360 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1568 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 276 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 560 bytes stack frame, 1304 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 496 bytes stack frame, 308 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 344 bytes stack frame, 480 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 344 bytes stack frame, 424 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 512 bytes stack frame, 356 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1448 bytes spill stores, 2088 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 648 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 608 bytes stack frame, 1480 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 576 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 424 bytes stack frame, 524 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 568 bytes stack frame, 424 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1544 bytes spill stores, 2368 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 396 bytes spill stores, 816 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 324 bytes spill stores, 572 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 1032 bytes stack frame, 2924 bytes spill stores, 3972 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 584 bytes stack frame, 424 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 376 bytes stack frame, 548 bytes spill stores, 740 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 728 bytes stack frame, 1160 bytes spill stores, 2420 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 560 bytes stack frame, 416 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 3304 bytes spill stores, 5876 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1236 bytes spill stores, 2616 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 276 bytes spill stores, 496 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 1016 bytes stack frame, 3048 bytes spill stores, 3904 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 488 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 648 bytes stack frame, 1188 bytes spill stores, 2236 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 504 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 3852 bytes spill stores, 6288 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1240 bytes spill stores, 2456 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 456 bytes stack frame, 1036 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 528 bytes stack frame, 372 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 476 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1636 bytes spill stores, 2292 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 456 bytes stack frame, 1036 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 528 bytes stack frame, 372 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 476 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1636 bytes spill stores, 2292 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 456 bytes stack frame, 1036 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 528 bytes stack frame, 372 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 476 bytes spill stores, 984 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 528 bytes stack frame, 376 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1636 bytes spill stores, 2292 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 328 bytes spill stores, 592 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 456 bytes stack frame, 1024 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 344 bytes stack frame, 480 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 520 bytes stack frame, 364 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1528 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 644 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 488 bytes stack frame, 1140 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 568 bytes stack frame, 412 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 416 bytes stack frame, 532 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 568 bytes stack frame, 424 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1616 bytes spill stores, 2432 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 396 bytes spill stores, 816 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 1032 bytes stack frame, 3012 bytes spill stores, 4040 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 576 bytes stack frame, 416 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 376 bytes stack frame, 548 bytes spill stores, 740 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 720 bytes stack frame, 1196 bytes spill stores, 2436 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 568 bytes stack frame, 424 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 952 bytes stack frame, 3332 bytes spill stores, 5908 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1252 bytes spill stores, 2624 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 1024 bytes stack frame, 2996 bytes spill stores, 3848 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 504 bytes stack frame, 324 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 664 bytes stack frame, 1228 bytes spill stores, 2256 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 512 bytes stack frame, 348 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 944 bytes stack frame, 3856 bytes spill stores, 6324 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1240 bytes spill stores, 2456 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 712 bytes stack frame, 304 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1256 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 488 bytes stack frame, 1080 bytes spill stores, 1656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 736 bytes stack frame, 344 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1396 bytes spill stores, 1900 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 748 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 608 bytes stack frame, 1556 bytes spill stores, 2188 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 376 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 456 bytes stack frame, 672 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1692 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 984 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 584 bytes stack frame, 1436 bytes spill stores, 2028 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 488 bytes stack frame, 296 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 416 bytes stack frame, 516 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1724 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 576 bytes stack frame, 1468 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 848 bytes stack frame, 440 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 464 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1580 bytes spill stores, 2272 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 332 bytes spill stores, 600 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 576 bytes stack frame, 1468 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 848 bytes stack frame, 440 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 464 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1580 bytes spill stores, 2272 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 332 bytes spill stores, 600 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 576 bytes stack frame, 1468 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 848 bytes stack frame, 440 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 464 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 528 bytes stack frame, 380 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1580 bytes spill stores, 2272 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 288 bytes stack frame, 332 bytes spill stores, 600 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 624 bytes stack frame, 1464 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 856 bytes stack frame, 452 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 352 bytes stack frame, 504 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 428 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 520 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1468 bytes spill stores, 2216 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 332 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 632 bytes stack frame, 1624 bytes spill stores, 2372 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 888 bytes stack frame, 492 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 384 bytes stack frame, 544 bytes spill stores, 752 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 416 bytes stack frame, 512 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 584 bytes stack frame, 444 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1636 bytes spill stores, 2776 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 392 bytes spill stores, 812 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1384 bytes stack frame, 3940 bytes spill stores, 4872 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 896 bytes stack frame, 500 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 384 bytes stack frame, 552 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 616 bytes stack frame, 1208 bytes spill stores, 2432 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 592 bytes stack frame, 456 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 3264 bytes spill stores, 5736 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1212 bytes spill stores, 2836 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1360 bytes stack frame, 4048 bytes spill stores, 4804 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 824 bytes stack frame, 436 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 2228 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 808 bytes stack frame, 3660 bytes spill stores, 6096 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1224 bytes spill stores, 2448 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 472 bytes stack frame, 980 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 688 bytes stack frame, 284 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 528 bytes stack frame, 388 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1372 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 144 bytes stack frame, 272 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_35' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 712 bytes stack frame, 304 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 344 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 528 bytes stack frame, 384 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1256 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 488 bytes stack frame, 1080 bytes spill stores, 1656 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 736 bytes stack frame, 344 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 360 bytes stack frame, 524 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 376 bytes stack frame, 432 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 592 bytes stack frame, 460 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1396 bytes spill stores, 1900 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 748 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 160 bytes stack frame, 320 bytes spill stores, 560 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 608 bytes stack frame, 1556 bytes spill stores, 2188 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 600 bytes stack frame, 440 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 376 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 456 bytes stack frame, 672 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 600 bytes stack frame, 480 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1692 bytes spill stores, 2644 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 352 bytes stack frame, 428 bytes spill stores, 984 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LLP11ncclDevCommmP8ncclWork 136 bytes stack frame, 272 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LLP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 584 bytes stack frame, 1436 bytes spill stores, 2028 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 488 bytes stack frame, 296 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 416 bytes stack frame, 516 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 536 bytes stack frame, 396 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1724 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 740 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 472 bytes stack frame, 984 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 680 bytes stack frame, 276 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 472 bytes stack frame, 984 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 680 bytes stack frame, 276 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 472 bytes stack frame, 984 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 680 bytes stack frame, 276 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 328 bytes stack frame, 376 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 384 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1388 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 688 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 344 bytes stack frame, 476 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 352 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 536 bytes stack frame, 392 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1284 bytes spill stores, 1800 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 488 bytes stack frame, 1080 bytes spill stores, 1656 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 736 bytes stack frame, 340 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 376 bytes stack frame, 436 bytes spill stores, 908 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 584 bytes stack frame, 448 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1428 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 748 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 584 bytes stack frame, 1540 bytes spill stores, 2440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 600 bytes stack frame, 444 bytes spill stores, 588 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 384 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 520 bytes stack frame, 988 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 600 bytes stack frame, 464 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 2016 bytes spill stores, 3084 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 416 bytes stack frame, 804 bytes spill stores, 2328 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 552 bytes stack frame, 1520 bytes spill stores, 2232 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 488 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 472 bytes stack frame, 928 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 520 bytes stack frame, 368 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 824 bytes stack frame, 2092 bytes spill stores, 3028 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 408 bytes spill stores, 1056 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 696 bytes stack frame, 1692 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1664 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 696 bytes stack frame, 1692 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 552 bytes stack frame, 396 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1664 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 696 bytes stack frame, 1692 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 552 bytes stack frame, 396 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1664 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 416 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 608 bytes stack frame, 1532 bytes spill stores, 2208 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 864 bytes stack frame, 444 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 352 bytes stack frame, 496 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 480 bytes stack frame, 700 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1508 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 776 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 656 bytes stack frame, 1780 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 888 bytes stack frame, 520 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 384 bytes stack frame, 544 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 448 bytes stack frame, 584 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 600 bytes stack frame, 464 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1628 bytes spill stores, 2456 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 1068 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1168 bytes stack frame, 3844 bytes spill stores, 4856 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 896 bytes stack frame, 524 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 384 bytes stack frame, 544 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 640 bytes stack frame, 1152 bytes spill stores, 2392 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 600 bytes stack frame, 456 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 3448 bytes spill stores, 5972 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 520 bytes stack frame, 1340 bytes spill stores, 2936 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1152 bytes stack frame, 3920 bytes spill stores, 4756 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 808 bytes stack frame, 388 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 576 bytes stack frame, 1180 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 4004 bytes spill stores, 6428 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 512 bytes stack frame, 1340 bytes spill stores, 2764 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 680 bytes stack frame, 1684 bytes spill stores, 2384 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 560 bytes stack frame, 428 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1660 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 420 bytes spill stores, 852 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 680 bytes stack frame, 1684 bytes spill stores, 2384 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 552 bytes stack frame, 396 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1660 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 420 bytes spill stores, 852 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 680 bytes stack frame, 1684 bytes spill stores, 2384 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 864 bytes stack frame, 436 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 408 bytes stack frame, 608 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 552 bytes stack frame, 396 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1660 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 328 bytes stack frame, 420 bytes spill stores, 852 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 608 bytes stack frame, 1528 bytes spill stores, 2204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 864 bytes stack frame, 444 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 352 bytes stack frame, 496 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 480 bytes stack frame, 680 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 544 bytes stack frame, 396 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1488 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 312 bytes stack frame, 360 bytes spill stores, 760 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 664 bytes stack frame, 1780 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 888 bytes stack frame, 520 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 384 bytes stack frame, 544 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 448 bytes stack frame, 584 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 600 bytes stack frame, 464 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1612 bytes spill stores, 2420 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 448 bytes spill stores, 1052 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1168 bytes stack frame, 3832 bytes spill stores, 4896 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 896 bytes stack frame, 524 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 384 bytes stack frame, 544 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 640 bytes stack frame, 1152 bytes spill stores, 2392 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 600 bytes stack frame, 456 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 3452 bytes spill stores, 5980 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 528 bytes stack frame, 1380 bytes spill stores, 2732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1152 bytes stack frame, 3916 bytes spill stores, 4744 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 808 bytes stack frame, 388 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 576 bytes stack frame, 1180 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 4000 bytes spill stores, 6424 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 512 bytes stack frame, 1344 bytes spill stores, 2768 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 472 bytes stack frame, 972 bytes spill stores, 1472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 688 bytes stack frame, 304 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 416 bytes spill stores, 948 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1424 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 472 bytes stack frame, 972 bytes spill stores, 1472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 688 bytes stack frame, 304 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 416 bytes spill stores, 948 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1424 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 472 bytes stack frame, 972 bytes spill stores, 1472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 688 bytes stack frame, 304 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 416 bytes spill stores, 948 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 584 bytes stack frame, 1424 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 320 bytes spill stores, 664 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 448 bytes stack frame, 960 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 712 bytes stack frame, 312 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 352 bytes stack frame, 488 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 328 bytes stack frame, 388 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 392 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1320 bytes spill stores, 1840 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 280 bytes stack frame, 328 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 488 bytes stack frame, 1084 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 744 bytes stack frame, 348 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 368 bytes stack frame, 524 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 384 bytes stack frame, 456 bytes spill stores, 948 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 592 bytes stack frame, 456 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1480 bytes spill stores, 2144 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 392 bytes spill stores, 748 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 648 bytes stack frame, 1676 bytes spill stores, 2452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 592 bytes stack frame, 436 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 376 bytes stack frame, 544 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 488 bytes stack frame, 876 bytes spill stores, 1696 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 616 bytes stack frame, 508 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 208 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 736 bytes stack frame, 1776 bytes spill stores, 2784 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 368 bytes stack frame, 436 bytes spill stores, 1156 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 624 bytes stack frame, 1584 bytes spill stores, 2276 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 512 bytes stack frame, 336 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 432 bytes stack frame, 680 bytes spill stores, 1384 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 544 bytes stack frame, 388 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1732 bytes spill stores, 2624 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 320 bytes stack frame, 368 bytes spill stores, 800 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all --allow-unsupported-compiler -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.19.3-1/build/include -I../include --compiler-options "-fPIC -fvisibility=hidden" -dlink /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/common.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/onerank.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_gather.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/broadcast.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_prod_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/sendrecv.cu.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/host_table.cc.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/genobj/device_table.cu.o -o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/device_glue.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src/device' nvcc warning : The 'compute_35', 'compute_37', 'sm_35', and 'sm_37' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Archiving libnccl_static.a > /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl_static.a mkdir -p /builddir/build/BUILD/nccl-2.19.3-1/build/lib ar cr /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl_static.a /builddir/build/BUILD/nccl-2.19.3-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/group.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/manifest) make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.19.3-1/src' Linking libnccl.so.2.19.3 > /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl.so.2.19.3 mkdir -p /builddir/build/BUILD/nccl-2.19.3-1/build/lib g++ -DCUDA_MAJOR=11 -DCUDA_MINOR=8 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -O3 -g -Wall -Wextra -DPROFAPI -shared -Wl,--no-as-needed -Wl,-soname,libnccl.so.2 -o /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl.so.2.19.3 /builddir/build/BUILD/nccl-2.19.3-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/group.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.19.3-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.19.3-1/build/obj/device/manifest) -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -L/usr/local/cuda/lib64 -lcudart_static -lpthread -lrt -ldl ln -sf libnccl.so.2 /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl.so ln -sf libnccl.so.2.19.3 /builddir/build/BUILD/nccl-2.19.3-1/build/lib/libnccl.so.2 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.19.3-1/src' + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.O0VmiX + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 ++ dirname /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.19.3-1 + mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 + mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/include + mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/lib64 + mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/lib64/pkgconfig + cp -d build/lib/libnccl.so build/lib/libnccl.so.2 build/lib/libnccl.so.2.19.3 build/lib/libnccl_static.a /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/lib64 + cp build/include/nccl.h build/include/nccl_net.h /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/include + cp build/lib/pkgconfig/nccl.pc /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/lib64/pkgconfig/ + /usr/bin/find-debuginfo -j80 --strict-build-id -m -i --build-id-seed 2.19.3-2.an23 --unique-debug-suffix -2.19.3-2.an23.aarch64 --unique-debug-src-base libnccl-cuda-11-2.19.3-2.an23.aarch64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 50000000 -S debugsourcefiles.list /builddir/build/BUILD/nccl-2.19.3-1 extracting debug info from /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/lib64/libnccl.so.2.19.3 original debug info size: 7656kB, size after compression: 6596kB /usr/bin/sepdebugcrcfix: Updated 1 CRC32s, 0 CRC32s did match. 2993 blocks + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/anolis/brp-ldconfig + COMPRESS='zstd -f --rm -19 -T0' + COMPRESS_EXT=.zst + /usr/lib/rpm/brp-compress + /usr/lib/rpm/anolis/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/check-rpaths + /usr/lib/rpm/brp-remove-la-files + /usr/lib/rpm/anolis/clean_perl + /usr/lib/rpm/anolis/check_elf_files + /usr/lib/rpm/anolis/brp-mangle-shebangs + /usr/lib/rpm/anolis/remove-info-dir + /usr/lib/rpm/anolis/check-desktop-files + /usr/lib/rpm/anolis/brp-python-bytecompile '' 1 0 + /usr/lib/rpm/anolis/brp-python-hardlink Processing files: libnccl-cuda-11-2.19.3-2.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.4JJJEn + umask 022 + cd /builddir/build/BUILD + cd nccl-2.19.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-cuda-11 + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-cuda-11 + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-cuda-11 + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-cuda-11 = 2.19.3-2.an23 libnccl-cuda-11(aarch-64) = 2.19.3-2.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Conflicts: libnccl Processing files: libnccl-devel-cuda-11-2.19.3-2.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.zn3JkS + umask 022 + cd /builddir/build/BUILD + cd nccl-2.19.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-devel-cuda-11 + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-devel-cuda-11 + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-devel-cuda-11 + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-devel-cuda-11 = 2.19.3-2.an23 libnccl-devel-cuda-11(aarch-64) = 2.19.3-2.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Conflicts: libnccl-devel Processing files: libnccl-static-cuda-11-2.19.3-2.an23.aarch64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.285JXO + umask 022 + cd /builddir/build/BUILD + cd nccl-2.19.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-static-cuda-11 + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-static-cuda-11 + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64/usr/share/licenses/libnccl-static-cuda-11 + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-static-cuda-11 = 2.19.3-2.an23 libnccl-static-cuda-11(aarch-64) = 2.19.3-2.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Conflicts: libnccl-static Processing files: libnccl-cuda-11-debugsource-2.19.3-2.an23.aarch64 Provides: libnccl-cuda-11-debugsource = 2.19.3-2.an23 libnccl-cuda-11-debugsource(aarch-64) = 2.19.3-2.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-cuda-11-debuginfo-2.19.3-2.an23.aarch64 Provides: debuginfo(build-id) = 9ada65230fa454f343755dad1bf4b1270abe5781 libnccl-cuda-11-debuginfo = 2.19.3-2.an23 libnccl-cuda-11-debuginfo(aarch-64) = 2.19.3-2.an23 libnccl.so.2.19.3-2.19.3-2.an23.aarch64.debug()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Recommends: libnccl-cuda-11-debugsource(aarch-64) = 2.19.3-2.an23 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/libnccl-cuda-11-2.19.3-2.an23.aarch64 Wrote: /builddir/build/RPMS/libnccl-devel-cuda-11-2.19.3-2.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-cuda-11-debugsource-2.19.3-2.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-cuda-11-debuginfo-2.19.3-2.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-cuda-11-2.19.3-2.an23.aarch64.rpm Wrote: /builddir/build/RPMS/libnccl-static-cuda-11-2.19.3-2.an23.aarch64.rpm Child return code was: 0