Mock Version: 5.5 Mock Version: 5.5 Mock Version: 5.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-395048-72172/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=990gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1725235200 Wrote: /builddir/build/SRPMS/libnccl-2.22.3-1.cuda12.6.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-395048-72172/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=990gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1725235200 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.sMP4f0 + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf nccl-2.22.3-1 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/nccl-2.22.3-1.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd nccl-2.22.3-1 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + /usr/lib/rpm/rpmuncompress /builddir/build/SOURCES/1000-fix-lib-path-in-nccl.pc.patch + /usr/bin/patch -p1 -s --fuzz=0 --no-backup-if-mismatch -f + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.WBp1tx + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.22.3-1 + export LD_LIBRARY_PATH=/usr/local/cuda-12-6/lib64 + LD_LIBRARY_PATH=/usr/local/cuda-12-6/lib64 + export 'CFLAGS=usr/local/cuda-12-6/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + CFLAGS='usr/local/cuda-12-6/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export PREFIX=/usr + PREFIX=/usr + /usr/bin/make -O -j80 V=1 VERBOSE=1 /usr/bin/make -C src build BUILDDIR=/builddir/build/BUILD/nccl-2.22.3-1/build make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Grabbing include/nccl_net.h > /builddir/build/BUILD/nccl-2.22.3-1/build/include/nccl_net.h mkdir -p /builddir/build/BUILD/nccl-2.22.3-1/build/include install -m 644 include/nccl_net.h /builddir/build/BUILD/nccl-2.22.3-1/build/include/nccl_net.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.22.3-1/build/include Generating nccl.h.in > /builddir/build/BUILD/nccl-2.22.3-1/build/include/nccl.h sed -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/22/g" \ -e "s/\${nccl:Patch}/3/g" \ -e "s/\${nccl:Suffix}//g" \ -e "s/\${nccl:Version}/22203/g" \ nccl.h.in > /builddir/build/BUILD/nccl-2.22.3-1/build/include/nccl.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.22.3-1/build/lib/pkgconfig Generating nccl.pc.in > /builddir/build/BUILD/nccl-2.22.3-1/build/lib/pkgconfig/nccl.pc sed -e 's|${nccl:Prefix}|\/usr|g' \ -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/22/g" \ -e "s/\${nccl:Patch}/3/g" \ nccl.pc.in > /builddir/build/BUILD/nccl-2.22.3-1/build/lib/pkgconfig/nccl.pc make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' /usr/bin/make -C ./device make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling enhcompat.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enhcompat.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enhcompat.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enhcompat.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enhcompat.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' (which python3 >/dev/null || \ (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \ printf "\n${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/gensrc "" make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/cudawrap.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/cudawrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/cudawrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/cudawrap.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/cudawrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/param.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/param.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/param.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/param.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/param.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/strongstream.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/strongstream.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/strongstream.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/strongstream.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/strongstream.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/trees.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/trees.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/trees.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/trees.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/trees.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/host_table.cc make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' In file included from /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/gensrc/host_table.cc:1: ../include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': ../include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ ../include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': ../include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ ../include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': ../include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/nvmlwrap.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/nvmlwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/nvmlwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/nvmlwrap.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/nvmlwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/rings.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/rings.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/rings.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/rings.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/rings.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' graph/rings.cc: In function 'ncclResult_t ncclBuildRings(int, int*, int, int, int*, int*)': graph/rings.cc:22:80: warning: unused parameter 'prev' [-Wunused-parameter] 22 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { | ~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/profiler.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/profiler.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/profiler.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/profiler.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/profiler.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/proxy.h:10, from include/profiler.h:10, from misc/profiler.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ misc/profiler.cc: In function 'ncclResult_t ncclProfilingRecord(ncclProxyArgs*, int, int, int)': misc/profiler.cc:113:56: warning: unused parameter 'args' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~~~~~~~~~~~~~~~~~~~^~~~ misc/profiler.cc:113:66: warning: unused parameter 'sub' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~ misc/profiler.cc:113:75: warning: unused parameter 'step' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~ misc/profiler.cc:113:85: warning: unused parameter 'state' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/generic.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/generic.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/generic.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/generic.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/generic.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/generic.cc:1: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/ipcsocket.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ipcsocket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ipcsocket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ipcsocket.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ipcsocket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/shmutils.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/shmutils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/shmutils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/shmutils.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/shmutils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from misc/shmutils.cc:8: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling debug.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/debug.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/debug.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c debug.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/debug.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/ibvwrap.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvwrap.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/ibvsymbols.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvsymbols.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvsymbols.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvsymbols.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvsymbols.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/argcheck.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/argcheck.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/argcheck.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/argcheck.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/argcheck.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/info.h:11, from include/argcheck.h:11, from misc/argcheck.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/tuner.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/tuner.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/tuner.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/tuner.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/tuner.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/tuner.h:12, from misc/tuner.cc:14: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling register.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/register.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/register.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c register.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/register.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/info.h:11, from include/argcheck.h:11, from register.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:36, from include/argcheck.h:10: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': register.cc:50:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclReg; size_t = long unsigned int]': register.cc:120:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/socket.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/socket.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/p2p.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/p2p.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/p2p.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/p2p.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/p2p.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/p2p.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/p2p.cc: In function 'ncclResult_t p2pCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/p2p.cc:107:89: warning: unused parameter 'graph' [-Wunused-parameter] 107 | ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/p2p.cc: In function 'ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc*)': transport/p2p.cc:232:54: warning: unused parameter 'ipcDesc' [-Wunused-parameter] 232 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { | ~~~~~~~~~~~~~^~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pMap(ncclComm*, ncclProxyConnector*, ncclPeerInfo*, ncclPeerInfo*, ncclP2pBuff*, void**, void**)': transport/p2p.cc:301:78: warning: unused parameter 'proxyConn' [-Wunused-parameter] 301 | static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/p2p.cc:406:71: warning: unused parameter 'channelId' [-Wunused-parameter] 406 | struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:456:96: warning: unused parameter 'nranks' [-Wunused-parameter] 456 | static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:495:89: warning: unused parameter 'nranks' [-Wunused-parameter] 495 | ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:615:102: warning: unused parameter 'proxyState' [-Wunused-parameter] 615 | static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:635:104: warning: unused parameter 'proxyState' [-Wunused-parameter] 635 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc:635:150: warning: unused parameter 'respBuff' [-Wunused-parameter] 635 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/p2p.cc:635:164: warning: unused parameter 'respSize' [-Wunused-parameter] 635 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/p2p.cc:635:179: warning: unused parameter 'done' [-Wunused-parameter] 635 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:649:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 649 | static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:681:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 681 | static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: At global scope: transport/p2p.cc:760:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] 760 | }; | ^ transport/p2p.cc:760:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pResources; size_t = long unsigned int]': transport/p2p.cc:342:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pShmProxyInfo; size_t = long unsigned int]': transport/p2p.cc:577:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pCuMemProxyInfo; size_t = long unsigned int]': transport/p2p.cc:604:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/utils.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/utils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/utils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/utils.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/utils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling net.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c net.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/net.h:12, from net.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ net.cc: In function 'ncclResult_t ncclNetCheckDeviceVersion(ncclComm*, ncclNet_t*, int)': net.cc:546:57: warning: unused parameter 'comm' [-Wunused-parameter] 546 | ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { | ~~~~~~~~~~~~~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling init_nvtx.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init_nvtx.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init_nvtx.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init_nvtx.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init_nvtx.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling collectives.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/collectives.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/collectives.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/collectives.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/info.h:11, from include/argcheck.h:11, from collectives.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ collectives.cc: In function 'ncclResult_t ncclAllGather(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t)': collectives.cc:82:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 82 | }; | ^ collectives.cc:82:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:82:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:82:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:82:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclAllReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 106 | }; | ^ collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:106:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 128 | }; | ^ collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:128:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, cudaStream_t)': collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 161 | }; | ^ collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:161:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc: In function 'ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 184 | }; | ^ collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:184:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc: At global scope: collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 202 | }; | ^ collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] collectives.cc:202:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling bootstrap.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/bootstrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/bootstrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c bootstrap.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/bootstrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/bootstrap.h:11, from bootstrap.cc:10: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ bootstrap.cc: In function 'ncclResult_t bootstrapCreateRoot(ncclBootstrapHandle*, bool)': bootstrap.cc:183:75: warning: unused parameter 'idFromEnv' [-Wunused-parameter] 183 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) { | ~~~~~^~~~~~~~~ bootstrap.cc: In function 'ncclResult_t bootstrapInit(ncclBootstrapHandle*, ncclComm*)': bootstrap.cc:251:29: warning: missing initializer for member 'extInfo::nranks' [-Wmissing-field-initializers] 251 | struct extInfo info = { 0 }; | ^ bootstrap.cc:251:29: warning: missing initializer for member 'extInfo::extAddressListenRoot' [-Wmissing-field-initializers] bootstrap.cc:251:29: warning: missing initializer for member 'extInfo::extAddressListen' [-Wmissing-field-initializers] In file included from include/core.h:36, from bootstrap.cc:8: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocketAddress; size_t = long unsigned int]': bootstrap.cc:121:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': bootstrap.cc:188:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapRootArgs; size_t = long unsigned int]': bootstrap.cc:193:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapState; size_t = long unsigned int]': bootstrap.cc:253:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': bootstrap.cc:310:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unexConn; size_t = long unsigned int]': bootstrap.cc:443:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/shm.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/shm.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/shm.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/shm.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/shm.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/shm.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/shm.cc: In function 'ncclResult_t shmCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/shm.cc:51:96: warning: unused parameter 'graph' [-Wunused-parameter] 51 | static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc: In function 'ncclResult_t shmSendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:77:79: warning: unused parameter 'graph' [-Wunused-parameter] 77 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:77:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 77 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:100:79: warning: unused parameter 'graph' [-Wunused-parameter] 100 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:100:107: warning: unused parameter 'myInfo' [-Wunused-parameter] 100 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~ transport/shm.cc:100:136: warning: unused parameter 'peerInfo' [-Wunused-parameter] 100 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/shm.cc:100:211: warning: unused parameter 'channelId' [-Wunused-parameter] 100 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc:100:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 100 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:163:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 163 | struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; | ^ transport/shm.cc:163:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:163:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:136:96: warning: unused parameter 'nranks' [-Wunused-parameter] 136 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/shm.cc:136:108: warning: unused parameter 'rank' [-Wunused-parameter] 136 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:198:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 198 | struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; | ^ transport/shm.cc:198:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:198:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:176:96: warning: unused parameter 'nranks' [-Wunused-parameter] 176 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/shm.cc:176:108: warning: unused parameter 'rank' [-Wunused-parameter] 176 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:232:179: warning: unused parameter 'done' [-Wunused-parameter] 232 | static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:250:179: warning: unused parameter 'done' [-Wunused-parameter] 250 | static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:268:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 268 | static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:284:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 284 | static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/shm.cc: At global scope: transport/shm.cc:421:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] 421 | }; | ^ transport/shm.cc:421:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmSendResources; size_t = long unsigned int]': transport/shm.cc:79:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmRecvResources; size_t = long unsigned int]': transport/shm.cc:102:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmProxyInfo; size_t = long unsigned int]': transport/shm.cc:234:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/net_socket.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_socket.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/net_socket.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketInit(ncclDebugLogger_t)': transport/net_socket.cc:38:50: warning: unused parameter 'logFunction' [-Wunused-parameter] 38 | ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketRegMr(void*, void*, size_t, int, void**)': transport/net_socket.cc:538:39: warning: unused parameter 'comm' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:538:51: warning: unused parameter 'data' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:538:64: warning: unused parameter 'size' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~~^~~~ transport/net_socket.cc:538:87: warning: unused parameter 'mhandle' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { | ~~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketDeregMr(void*, void*)': transport/net_socket.cc:541:41: warning: unused parameter 'comm' [-Wunused-parameter] 541 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~ transport/net_socket.cc:541:53: warning: unused parameter 'mhandle' [-Wunused-parameter] 541 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIsend(void*, void*, int, int, void*, void**)': transport/net_socket.cc:543:75: warning: unused parameter 'tag' [-Wunused-parameter] 543 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~^~~ transport/net_socket.cc:543:86: warning: unused parameter 'mhandle' [-Wunused-parameter] 543 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIrecv(void*, int, void**, int*, int*, void**, void**)': transport/net_socket.cc:549:86: warning: unused parameter 'tags' [-Wunused-parameter] 549 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~^~~~ transport/net_socket.cc:549:99: warning: unused parameter 'mhandles' [-Wunused-parameter] 549 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIflush(void*, int, void**, int*, void**, void**)': transport/net_socket.cc:556:40: warning: unused parameter 'recvComm' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~^~~~~~~~ transport/net_socket.cc:556:54: warning: unused parameter 'n' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~^ transport/net_socket.cc:556:64: warning: unused parameter 'data' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~ transport/net_socket.cc:556:75: warning: unused parameter 'sizes' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~^~~~~ transport/net_socket.cc:556:89: warning: unused parameter 'mhandles' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc:556:106: warning: unused parameter 'request' [-Wunused-parameter] 556 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketListenComm; size_t = long unsigned int]': transport/net_socket.cc:294:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketComm; size_t = long unsigned int]': transport/net_socket.cc:323:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': transport/net_socket.cc:373:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketTask; size_t = long unsigned int]': transport/net_socket.cc:435:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/all_reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/reduce_scatter.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/paths.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/paths.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/paths.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/paths.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/paths.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/graph.h:11, from graph/paths.cc:8: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/paths.cc: In function 'ncclResult_t ncclTopoCheckMNNVL(ncclTopoSystem*, ncclPeerInfo*, ncclPeerInfo*, int*)': graph/paths.cc:349:56: warning: unused parameter 'system' [-Wunused-parameter] 349 | ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36, from graph/paths.cc:7: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoLinkList; size_t = long unsigned int]': graph/paths.cc:38:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/paths.cc:647:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/paths.cc:648:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling proxy.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/proxy.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/proxy.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c proxy.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/proxy.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from proxy.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ proxy.cc: In function 'void ncclDumpProxyState(int)': proxy.cc:746:29: warning: unused parameter 'signal' [-Wunused-parameter] 746 | void ncclDumpProxyState(int signal) { | ~~~~^~~~~~ proxy.cc: In function 'ncclResult_t ncclProxyConnect(ncclComm*, int, int, int, ncclProxyConnector*)': proxy.cc:999:35: warning: missing initializer for member 'ncclProxyInitReq::send' [-Wmissing-field-initializers] 999 | struct ncclProxyInitReq req = {0}; | ^ proxy.cc:999:35: warning: missing initializer for member 'ncclProxyInitReq::tpLocalRank' [-Wmissing-field-initializers] proxy.cc:999:35: warning: missing initializer for member 'ncclProxyInitReq::tpRank' [-Wmissing-field-initializers] proxy.cc:999:35: warning: missing initializer for member 'ncclProxyInitReq::sameProcess' [-Wmissing-field-initializers] proxy.cc:1006:37: warning: missing initializer for member 'ncclProxyInitResp::devShmPath' [-Wmissing-field-initializers] 1006 | struct ncclProxyInitResp resp = {0}; | ^ proxy.cc: In function 'ncclResult_t ncclProxyCallBlockingUDS(ncclComm*, int, int, void*, int, void*, int, int*)': proxy.cc:1030:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1030 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1030:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t ncclPollProxyResponse(ncclComm*, ncclProxyConnector*, void*, void*)': proxy.cc:1128:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::res' [-Wmissing-field-initializers] 1128 | ncclProxyRpcResponseHeader resp = {0}; | ^ proxy.cc:1128:41: warning: missing initializer for member 'ncclProxyRpcResponseHeader::respSize' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyGetFd(ncclProxyState*, int, void*, uint64_t)': proxy.cc:1275:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1275 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1275:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyUDSRecvReq(ncclProxyState*, int)': proxy.cc:1554:76: warning: unused parameter 'reqFd' [-Wunused-parameter] 1554 | static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) { | ~~~~^~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclExpectedProxyResponse; size_t = long unsigned int]': proxy.cc:85:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPool; size_t = long unsigned int]': proxy.cc:199:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyConnection; size_t = long unsigned int]': proxy.cc:905:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': proxy.cc:983:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyOps; size_t = long unsigned int]': proxy.cc:984:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': proxy.cc:985:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyAsyncOp; size_t = long unsigned int]': proxy.cc:1357:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': proxy.cc:1365:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyState; size_t = long unsigned int]': proxy.cc:1611:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/tuning.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/tuning.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/tuning.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/tuning.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/tuning.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from graph/tuning.cc:8: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies build/obj/device/gensrc/reduce.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport.cc: In function 'int ncclTransportCollNetSetup(ncclComm*, ncclTopoGraph*, ncclChannel*, int, int, int, int, ncclConnect*)': transport.cc:278:139: warning: unused parameter 'masterPeer' [-Wunused-parameter] 278 | int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) { | ~~~~^~~~~~~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect*; size_t = long unsigned int]': transport.cc:83:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect; size_t = long unsigned int]': transport.cc:107:31: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTransportCollNetSetup(ncclComm*, ncclTopoGraph*, ncclChannel*, int, int, int, int, ncclConnect*)::; size_t = long unsigned int]': transport.cc:314:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/nvls.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/nvls.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/nvls.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/nvls.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/nvls.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/nvls.cc:9: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/nvls.cc: In function 'ncclResult_t nvlsCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/nvls.cc:29:62: warning: unused parameter 'topo' [-Wunused-parameter] 29 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:29:90: warning: unused parameter 'graph' [-Wunused-parameter] 29 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:29:118: warning: unused parameter 'info1' [-Wunused-parameter] 29 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:29:146: warning: unused parameter 'info2' [-Wunused-parameter] 29 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc: In function 'ncclResult_t nvlsSendFree(ncclConnector*)': transport/nvls.cc:35:49: warning: unused parameter 'send' [-Wunused-parameter] 35 | ncclResult_t nvlsSendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsRecvFree(ncclConnector*)': transport/nvls.cc:39:49: warning: unused parameter 'recv' [-Wunused-parameter] 39 | ncclResult_t nvlsRecvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: At global scope: transport/nvls.cc:48:1: warning: missing initializer for member 'ncclTransportComm::proxyRegister' [-Wmissing-field-initializers] 48 | }; | ^ transport/nvls.cc:48:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] transport/nvls.cc:48:1: warning: missing initializer for member 'ncclTransportComm::proxyRegister' [-Wmissing-field-initializers] transport/nvls.cc:48:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] transport/nvls.cc: In function 'ncclResult_t nvlsGroupCreate(ncclComm*, CUmulticastObjectProp*, int, unsigned int, CUmemGenericAllocationHandle*, char*)': transport/nvls.cc:50:47: warning: unused parameter 'comm' [-Wunused-parameter] 50 | ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupUnmapMem(ncclComm*, size_t, void*, CUmemGenericAllocationHandle*, void*, CUmemGenericAllocationHandle*)': transport/nvls.cc:116:49: warning: unused parameter 'comm' [-Wunused-parameter] 116 | ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { | ~~~~~~~~~~~~~~~~~^~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNvlsSharedRes; size_t = long unsigned int]': transport/nvls.cc:357:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = localRegData; size_t = long unsigned int]': transport/nvls.cc:498:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bool; size_t = long unsigned int]': transport/nvls.cc:742:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = graphRegData; size_t = long unsigned int]': transport/nvls.cc:743:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling group.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/group.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/group.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c group.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/group.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/group.h:11, from group.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPreconnectJob; size_t = long unsigned int]': group.cc:404:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bool; size_t = long unsigned int]': group.cc:442:9: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/net_ib.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_ib.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_ib.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_ib.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_ib.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/net.h:12, from transport/net_ib.cc:10: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net_ib.cc: In function 'ncclResult_t ncclIbInit(ncclDebugLogger_t)': transport/net_ib.cc:422:43: warning: unused parameter 'logFunction' [-Wunused-parameter] 422 | ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase*, void*, size_t, int, uint64_t, int, ibv_mr**)': transport/net_ib.cc:1469:97: warning: unused parameter 'type' [-Wunused-parameter] 1469 | ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, size_t size, int type, uint64_t offset, int fd, ibv_mr** mhandle) { | ~~~~^~~~ In file included from include/core.h:36, from transport/net_ib.cc:8: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclIbListenComm; size_t = long unsigned int]': transport/net_ib.cc:1030:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/connect.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/connect.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/connect.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/connect.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/connect.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from graph/connect.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/connect.cc: In function 'ncclResult_t connectTrees(ncclComm*, int*, int*, int*, int*)': graph/connect.cc:137:119: warning: unused parameter 'treePatterns' [-Wunused-parameter] 137 | static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { | ~~~~~^~~~~~~~~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/connect.cc:181:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Dependencies src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/search.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/search.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/search.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/search.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/search.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from graph/search.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/search.cc: In function 'float getTotalBw(ncclTopoSystem*, ncclTopoNode*)': graph/search.cc:29:48: warning: unused parameter 'system' [-Wunused-parameter] 29 | static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': graph/xml.h:73:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/search.cc:450:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/xml.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/xml.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/xml.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/xml.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/xml.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' graph/xml.cc: In function 'ncclResult_t ncclTopoGetXmlFromCpu(ncclXmlNode*, ncclXml*)': graph/xml.cc:460:81: warning: unused parameter 'xml' [-Wunused-parameter] 460 | ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { | ~~~~~~~~~~~~~~~~^~~ In file included from include/core.h:36, from graph/xml.cc:13: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': graph/xml.h:73:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling misc/gdrwrap.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/gdrwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/gdrwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/gdrwrap.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/gdrwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/coll_net.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/coll_net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/coll_net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/coll_net.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/coll_net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/coll_net.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/coll_net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/coll_net.cc:138:65: warning: unused parameter 'topo' [-Wunused-parameter] 138 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc:138:93: warning: unused parameter 'graph' [-Wunused-parameter] 138 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:138:121: warning: unused parameter 'info1' [-Wunused-parameter] 138 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:138:149: warning: unused parameter 'info2' [-Wunused-parameter] 138 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:155:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 155 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:155:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:155:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:154:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 154 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc:154:163: warning: unused parameter 'connectInfo' [-Wunused-parameter] 154 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:176:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 176 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:176:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:176:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:175:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 175 | static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t sendFree(ncclConnector*)': transport/coll_net.cc:299:52: warning: unused parameter 'send' [-Wunused-parameter] 299 | static ncclResult_t sendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvFree(ncclConnector*)': transport/coll_net.cc:303:52: warning: unused parameter 'recv' [-Wunused-parameter] 303 | static ncclResult_t recvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:307:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 307 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/coll_net.cc:307:159: warning: unused parameter 'respSize' [-Wunused-parameter] 307 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/coll_net.cc:307:174: warning: unused parameter 'done' [-Wunused-parameter] 307 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:417:174: warning: unused parameter 'done' [-Wunused-parameter] 417 | static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:442:176: warning: unused parameter 'done' [-Wunused-parameter] 442 | static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:516:176: warning: unused parameter 'done' [-Wunused-parameter] 516 | static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'constexpr int calcStepsPerGroup(int)': transport/coll_net.cc:672:44: warning: unused parameter 'nGroups' [-Wunused-parameter] 672 | static constexpr int calcStepsPerGroup(int nGroups) { | ~~~~^~~~~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendResources; size_t = long unsigned int]': transport/coll_net.cc:312:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sharedResources; size_t = long unsigned int]': transport/coll_net.cc:335:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char (*)[128]; size_t = long unsigned int]': transport/coll_net.cc:348:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvResources; size_t = long unsigned int]': transport/coll_net.cc:422:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': transport/coll_net.cc:1282:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCollNetSetup(ncclComm_t, ncclComm_t, ncclTopoGraph**)::collnetShareInfo; size_t = long unsigned int]': transport/coll_net.cc:1301:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCollNetSharedRes; size_t = long unsigned int]': transport/coll_net.cc:1358:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned char [4][10]; size_t = long unsigned int]': transport/coll_net.cc:1391:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:219:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/coll_net.cc:475:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling channel.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/channel.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/channel.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c channel.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/channel.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/channel.h:9, from channel.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclChannelPeer; size_t = long unsigned int]': channel.cc:31:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclDevChannelPeer*; size_t = long unsigned int]': channel.cc:47:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling transport/net.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from transport/net.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/net.cc:145:93: warning: unused parameter 'graph' [-Wunused-parameter] 145 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:175:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 175 | struct setupReq req = { 0 }; | ^ transport/net.cc:175:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:175:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:213:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 213 | struct setupReq req = { 0 }; | ^ transport/net.cc:213:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:213:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t sendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:289:93: warning: unused parameter 'nranks' [-Wunused-parameter] 289 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/net.cc:289:105: warning: unused parameter 'rank' [-Wunused-parameter] 289 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t recvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:396:93: warning: unused parameter 'nranks' [-Wunused-parameter] 396 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/net.cc:396:105: warning: unused parameter 'rank' [-Wunused-parameter] 396 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/net.cc:577:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 577 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/net.cc: At global scope: transport/net.cc:1460:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] 1460 | }; | ^ transport/net.cc:1460:1: warning: missing initializer for member 'ncclTransportComm::proxyDeregister' [-Wmissing-field-initializers] In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = connectMap; size_t = long unsigned int]': transport/net.cc:297:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer*; size_t = long unsigned int]': transport/net.cc:504:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer; size_t = long unsigned int]': transport/net.cc:508:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendNetResources; size_t = long unsigned int]': transport/net.cc:582:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvNetResources; size_t = long unsigned int]': transport/net.cc:615:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetDeviceHandle_v7_t; size_t = long unsigned int]': transport/net.cc:654:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedNetComms; size_t = long unsigned int]': transport/net.cc:685:9: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:219:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/net.cc:765:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling graph/topo.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/topo.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/topo.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/topo.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/topo.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/graph.h:11, from graph/topo.cc:8: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/topo.cc: In function 'ncclResult_t pciPathToInt64(char*, int, int, int64_t*)': graph/topo.cc:33:57: warning: unused parameter 'minOffset' [-Wunused-parameter] 33 | ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { | ~~~~^~~~~~~~~ graph/topo.cc: In function 'ncclResult_t ncclTopoAddGpu(ncclXmlNode*, ncclTopoSystem*, ncclTopoNode*)': graph/topo.cc:386:80: warning: unused parameter 'system' [-Wunused-parameter] 386 | ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:36, from graph/topo.cc:7: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': graph/xml.h:73:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/topo.cc:202:7: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoSystem; size_t = long unsigned int]': graph/topo.cc:654:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/topo.cc:808:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling enqueue.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enqueue.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enqueue.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enqueue.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enqueue.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/enqueue.h:10, from enqueue.cc:7: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ enqueue.cc: In function 'ncclResult_t ncclInitKernelsForDevice(int, size_t*)': enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::constSizeBytes' [-Wmissing-field-initializers] 33 | cudaFuncAttributes attr = {0}; | ^ enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::localSizeBytes' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::maxThreadsPerBlock' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::numRegs' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::ptxVersion' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::binaryVersion' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::cacheModeCA' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::maxDynamicSharedSizeBytes' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::preferredShmemCarveout' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::clusterDimMustBeSet' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterWidth' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterHeight' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterDepth' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::clusterSchedulingPolicyPreference' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::nonPortableClusterSizeAllowed' [-Wmissing-field-initializers] enqueue.cc:33:35: warning: missing initializer for member 'cudaFuncAttributes::reserved' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t addProxyOpIfNeeded(ncclComm*, ncclKernelPlan*, ncclProxyOp*)': enqueue.cc:79:86: warning: unused parameter 'plan' [-Wunused-parameter] 79 | static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ enqueue.cc: In function 'ncclResult_t cleanupIpc(ncclComm*, ncclCommCallback*)': enqueue.cc:235:49: warning: unused parameter 'comm' [-Wunused-parameter] 235 | static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { | ~~~~~~~~~~~~~~~~~^~~~ enqueue.cc: In function 'ncclResult_t ncclLaunchKernel(ncclComm*, ncclKernelPlan*)': enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::gridDimY' [-Wmissing-field-initializers] 1411 | CUlaunchConfig launchConfig = {0}; | ^ enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::gridDimZ' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::blockDimX' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::blockDimY' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::blockDimZ' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::sharedMemBytes' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::hStream' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::attrs' [-Wmissing-field-initializers] enqueue.cc:1411:37: warning: missing initializer for member 'CUlaunchConfig_st::numAttrs' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Compiling init.cc > /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init.o` g++ -I. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init.cc -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' In file included from include/collectives.h:12, from include/comm.h:12, from include/channel.h:9, from init.cc:8: include/device.h: In function 'constexpr int ncclMaxKernelArgsSize(int)': include/device.h:421:81: warning: unused parameter 'cudaArch' [-Wunused-parameter] 421 | __host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/device.h:473:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 473 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/device.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/device.h:474:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 474 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ init.cc: In function 'ncclResult_t commGetSplitInfo(ncclComm*, ncclComm*, int, int, int*, int*, int*)': init.cc:1309:55: warning: unused parameter 'comm' [-Wunused-parameter] 1309 | static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc: At global scope: init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 1692 | }; | ^ init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] init.cc:1692:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] init.cc: In function 'ncclResult_t ncclCommInitAll(ncclComm**, int, const int*)': init.cc:1719:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::description' [-Wmissing-field-initializers] 1719 | }; | ^ init.cc:1719:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1719:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::offset' [-Wmissing-field-initializers] init.cc:1719:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::semantics' [-Wmissing-field-initializers] init.cc:1719:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_v1::reserved' [-Wmissing-field-initializers] init.cc: In function 'const char* ncclGetLastError(ncclComm_t)': init.cc:2114:41: warning: unused parameter 'comm' [-Wunused-parameter] 2114 | const char* ncclGetLastError(ncclComm_t comm) { | ~~~~~~~~~~~^~~~ In file included from include/core.h:36, from include/p2p.h:15, from include/comm.h:11: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': init.cc:367:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedResources; size_t = long unsigned int]': init.cc:375:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': init.cc:379:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPeerInfo; size_t = long unsigned int]': init.cc:729:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = initTransportsRank(ncclComm*, ncclComm*, uint64_t*)::allGatherInfo; size_t = long unsigned int]': init.cc:907:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNodeRanks; size_t = long unsigned int]': init.cc:966:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoRanks*; size_t = long unsigned int]': init.cc:1003:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclComm; size_t = long unsigned int]': init.cc:1650:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned int; size_t = long unsigned int]': init.cc:1651:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommInitRankAsyncJob; size_t = long unsigned int]': init.cc:1661:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommFinalizeAsyncJob; size_t = long unsigned int]': init.cc:1887:3: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:219:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = void; size_t = long unsigned int]' init.cc:446:5: required from here include/alloc.h:51:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:51:79: warning: unused parameter 'line' [-Wunused-parameter] 51 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/device_table.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4448 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling src/device/common.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, used 1 barriers, 4416 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, used 1 barriers, 4416 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, used 1 barriers, 4416 bytes cmem[0] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, used 1 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, used 1 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z21ncclDevKernel_Generic24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, used 1 barriers ptxas info : Function properties for _Z15ncclDevFunc_Nopv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1041 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_i64_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1037 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_i32_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/sendrecv.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 252 bytes spill stores, 392 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 264 bytes stack frame, 304 bytes spill stores, 400 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 248 bytes spill stores, 376 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 264 bytes stack frame, 304 bytes spill stores, 400 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 248 bytes spill stores, 376 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 264 bytes stack frame, 304 bytes spill stores, 400 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 220 bytes spill stores, 416 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 288 bytes stack frame, 372 bytes spill stores, 456 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 256 bytes stack frame, 672 bytes spill stores, 1184 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 408 bytes stack frame, 820 bytes spill stores, 1196 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1039 bytes gmem ptxas info : Compiling entry function '_Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z22ncclDevKernel_SendRecv24ncclDevKernelArgsStorageILm4096EE 240 bytes stack frame, 632 bytes spill stores, 984 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Function properties for _Z20ncclDevFunc_SendRecvv 424 bytes stack frame, 844 bytes spill stores, 1208 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 432 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 71 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 472 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 288 bytes stack frame, 344 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f64_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f64_RING_LL128v 416 bytes stack frame, 284 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 480 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 296 bytes stack frame, 360 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 472 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 527 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 71 registers, used 16 barriers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f16_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f16_RING_LL128v 440 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 344 bytes stack frame, 464 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 525 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_f32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_f32_RING_LL128v 432 bytes stack frame, 304 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 344 bytes stack frame, 464 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 539 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u32_RING_LL128v 456 bytes stack frame, 336 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 256 bytes stack frame, 288 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 296 bytes stack frame, 360 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 472 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z37ncclDevKernel_Reduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 71 registers, used 16 barriers ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Sum_bf16_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_bf16_RING_LL128v 440 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Sum_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 472 bytes stack frame, 400 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 344 bytes stack frame, 476 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z36ncclDevKernel_Reduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Sum_u64_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u64_RING_LL128v 424 bytes stack frame, 292 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Sum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 908 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 360 bytes stack frame, 444 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 812 bytes spill stores, 1196 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f64_RING_SIMPLEv 328 bytes stack frame, 384 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f64_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 856 bytes spill stores, 1176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 560 bytes spill stores, 800 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 416 bytes stack frame, 652 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 684 bytes spill stores, 1036 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1981 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f32_RING_SIMPLEv 368 bytes stack frame, 560 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 728 bytes spill stores, 968 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 660 bytes spill stores, 964 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 660 bytes spill stores, 964 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 660 bytes spill stores, 964 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 328 bytes stack frame, 384 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 528 bytes stack frame, 424 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 716 bytes spill stores, 1084 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 440 bytes stack frame, 692 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 520 bytes stack frame, 404 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 856 bytes spill stores, 1272 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_SIMPLEv 392 bytes stack frame, 656 bytes spill stores, 1144 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 908 bytes spill stores, 1272 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 528 bytes spill stores, 748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 724 bytes spill stores, 1112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 416 bytes stack frame, 676 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 512 bytes stack frame, 400 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 828 bytes spill stores, 1268 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u64_RING_SIMPLEv 384 bytes stack frame, 564 bytes spill stores, 1020 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u64_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 872 bytes spill stores, 1196 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 496 bytes spill stores, 696 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 496 bytes spill stores, 696 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 496 bytes spill stores, 696 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 560 bytes spill stores, 800 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 424 bytes stack frame, 648 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 680 bytes spill stores, 1028 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2037 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_u32_RING_SIMPLEv 368 bytes stack frame, 560 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 724 bytes spill stores, 956 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 496 bytes stack frame, 380 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 496 bytes stack frame, 372 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 552 bytes spill stores, 784 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 528 bytes stack frame, 416 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 596 bytes spill stores, 920 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 376 bytes stack frame, 572 bytes spill stores, 1060 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 520 bytes stack frame, 412 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 800 bytes spill stores, 1164 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_SIMPLEv 344 bytes stack frame, 416 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 856 bytes spill stores, 1136 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 496 bytes stack frame, 376 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 496 bytes stack frame, 376 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 536 bytes spill stores, 740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 528 bytes stack frame, 428 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 572 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 408 bytes stack frame, 668 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 728 bytes spill stores, 1064 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2057 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_SIMPLEv 384 bytes stack frame, 564 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 748 bytes spill stores, 948 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 480 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 480 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 480 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 528 bytes stack frame, 428 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 536 bytes spill stores, 780 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 416 bytes stack frame, 668 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 700 bytes spill stores, 1048 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2001 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_SIMPLEv 384 bytes stack frame, 564 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 752 bytes spill stores, 956 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 240 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 240 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 240 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 264 bytes stack frame, 316 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 464 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 360 bytes stack frame, 580 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i32_RING_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i32_RING_LL128v 456 bytes stack frame, 336 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 588 bytes spill stores, 828 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 788 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 512 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 788 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 536 bytes stack frame, 424 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 560 bytes spill stores, 852 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 448 bytes stack frame, 724 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 536 bytes stack frame, 432 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 480 bytes stack frame, 788 bytes spill stores, 1140 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f64_RING_SIMPLEv 408 bytes stack frame, 724 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f64_RING_LL128v 472 bytes stack frame, 324 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f64_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 844 bytes spill stores, 1072 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 504 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 484 bytes spill stores, 700 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 504 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 544 bytes stack frame, 476 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 852 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 384 bytes stack frame, 560 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 792 bytes spill stores, 1172 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_f16_RING_SIMPLEv 336 bytes stack frame, 408 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_f16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Prod_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Prod_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 840 bytes spill stores, 1140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 320 bytes stack frame, 384 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 320 bytes stack frame, 384 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 320 bytes stack frame, 384 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 360 bytes stack frame, 420 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 720 bytes spill stores, 1092 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 424 bytes stack frame, 708 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 536 bytes stack frame, 456 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 876 bytes spill stores, 1296 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_SIMPLEv 408 bytes stack frame, 628 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LL128v 504 bytes stack frame, 404 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 932 bytes spill stores, 1332 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 336 bytes stack frame, 392 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 872 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 448 bytes stack frame, 680 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 676 bytes spill stores, 1024 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2061 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_SIMPLEv 368 bytes stack frame, 548 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 740 bytes spill stores, 972 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 304 bytes stack frame, 340 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 344 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 720 bytes spill stores, 1092 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 432 bytes stack frame, 700 bytes spill stores, 1232 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 860 bytes spill stores, 1280 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_SIMPLEv 384 bytes stack frame, 588 bytes spill stores, 1092 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LL128v 504 bytes stack frame, 404 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 908 bytes spill stores, 1304 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 424 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 424 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 424 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 464 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 73 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 432 bytes stack frame, 812 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 448 bytes stack frame, 304 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 541 bytes gmem ptxas info : Compiling entry function '_Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z35ncclDevKernel_Reduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 71 registers, used 16 barriers ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Sum_u8_RING_SIMPLEv 336 bytes stack frame, 652 bytes spill stores, 1020 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Sum_u8_RING_LL128v 416 bytes stack frame, 284 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_Reduce_Sum_u8_RING_LLv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 280 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 280 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 280 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 336 bytes stack frame, 392 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 580 bytes spill stores, 868 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 448 bytes stack frame, 688 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 700 bytes spill stores, 1048 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_SIMPLEv 376 bytes stack frame, 568 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 752 bytes spill stores, 984 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 512 bytes spill stores, 736 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 512 bytes spill stores, 736 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 512 bytes spill stores, 736 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 336 bytes stack frame, 396 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 528 bytes stack frame, 416 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 400 bytes stack frame, 560 bytes spill stores, 876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 416 bytes stack frame, 696 bytes spill stores, 1192 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 520 bytes stack frame, 404 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 800 bytes spill stores, 1180 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1989 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f32_RING_SIMPLEv 376 bytes stack frame, 576 bytes spill stores, 1052 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f32_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f32_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 868 bytes spill stores, 1180 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 312 bytes stack frame, 364 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 344 bytes stack frame, 420 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 644 bytes spill stores, 980 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 504 bytes stack frame, 796 bytes spill stores, 1484 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 496 bytes stack frame, 388 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1128 bytes spill stores, 1596 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z42ncclDevKernel_ReduceScatter_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 83 registers, used 16 barriers ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u8_RING_SIMPLEv 472 bytes stack frame, 844 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Sum_u8_RING_LL128v 456 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_ReduceScatter_Sum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_ReduceScatter_Sum_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1176 bytes spill stores, 1532 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 312 bytes stack frame, 368 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 312 bytes stack frame, 368 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 312 bytes stack frame, 368 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 624 bytes spill stores, 896 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 344 bytes stack frame, 420 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 432 bytes stack frame, 652 bytes spill stores, 988 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 504 bytes stack frame, 812 bytes spill stores, 1492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 512 bytes stack frame, 412 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1152 bytes spill stores, 1612 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Prod_u8_RING_SIMPLEv 472 bytes stack frame, 856 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Prod_u8_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Prod_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1196 bytes spill stores, 1544 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 512 bytes stack frame, 416 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 668 bytes spill stores, 1000 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 712 bytes spill stores, 1084 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 408 bytes stack frame, 660 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 836 bytes spill stores, 1256 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 3039 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, used 16 barriers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_RING_SIMPLEv 368 bytes stack frame, 560 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u64_RING_LL128v 488 bytes stack frame, 404 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u64_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 884 bytes spill stores, 1252 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 572 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 640 bytes spill stores, 884 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 572 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 640 bytes spill stores, 884 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 572 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 640 bytes spill stores, 884 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 408 bytes stack frame, 672 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 520 bytes stack frame, 412 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 456 bytes stack frame, 700 bytes spill stores, 1024 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 696 bytes stack frame, 1124 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1164 bytes spill stores, 1628 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u8_RING_SIMPLEv 488 bytes stack frame, 844 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_MinMax_u8_RING_LL128v 464 bytes stack frame, 352 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_MinMax_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_MinMax_u8_COLLNET_DIRECT_SIMPLEv 736 bytes stack frame, 1406 bytes spill stores, 1804 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 520 bytes spill stores, 728 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 484 bytes spill stores, 688 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 460 bytes spill stores, 668 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 536 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 536 bytes spill stores, 836 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 400 bytes stack frame, 652 bytes spill stores, 1152 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 720 bytes spill stores, 1072 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2009 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_PreMulSum_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 760 bytes spill stores, 964 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 812 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 812 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 812 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 536 bytes stack frame, 416 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 652 bytes spill stores, 964 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 424 bytes stack frame, 688 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 536 bytes stack frame, 428 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 704 bytes spill stores, 1032 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 3057 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_RING_SIMPLEv 384 bytes stack frame, 632 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u64_RING_LL128v 496 bytes stack frame, 388 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u64_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u64_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 748 bytes spill stores, 984 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 488 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 676 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 488 bytes stack frame, 360 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 676 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 488 bytes stack frame, 360 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 676 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 336 bytes stack frame, 400 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 512 bytes stack frame, 376 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 520 bytes spill stores, 832 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 416 bytes stack frame, 672 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 512 bytes stack frame, 392 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 680 bytes spill stores, 1032 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 3045 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_RING_SIMPLEv 360 bytes stack frame, 496 bytes spill stores, 940 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_u32_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_u32_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 724 bytes spill stores, 960 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2033 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2033 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2033 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2033 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 580 bytes spill stores, 868 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2033 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 424 bytes stack frame, 648 bytes spill stores, 1156 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 440 bytes stack frame, 676 bytes spill stores, 1024 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 3027 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, used 16 barriers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_RING_SIMPLEv 368 bytes stack frame, 560 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_u32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_u32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_u32_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_u32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 724 bytes spill stores, 956 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 264 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 544 bytes spill stores, 776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 528 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 584 bytes spill stores, 908 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 360 bytes stack frame, 444 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 812 bytes spill stores, 1196 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2955 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, used 16 barriers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_RING_SIMPLEv 328 bytes stack frame, 384 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f64_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f64_NVLS_SIMPLEv 400 bytes stack frame, 764 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 856 bytes spill stores, 1176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1977 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1977 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1977 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 532 bytes spill stores, 732 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1977 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 520 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 560 bytes spill stores, 800 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1977 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 416 bytes stack frame, 652 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 684 bytes spill stores, 1036 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2943 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, used 16 barriers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_RING_SIMPLEv 368 bytes stack frame, 560 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f32_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f32_NVLS_SIMPLEv 400 bytes stack frame, 772 bytes spill stores, 1520 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f32_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 728 bytes spill stores, 968 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 644 bytes spill stores, 900 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 644 bytes spill stores, 900 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 644 bytes spill stores, 900 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 352 bytes stack frame, 416 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 520 bytes stack frame, 396 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 668 bytes spill stores, 1016 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 496 bytes stack frame, 852 bytes spill stores, 1528 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1132 bytes spill stores, 1592 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_SIMPLEv 480 bytes stack frame, 856 bytes spill stores, 1364 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LL128v 472 bytes stack frame, 348 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_PreMulSum_u8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z60ncclDevFunc_ReduceScatter_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1192 bytes spill stores, 1536 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 528 bytes stack frame, 428 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 528 bytes spill stores, 828 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 384 bytes stack frame, 560 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 792 bytes spill stores, 1172 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2045 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_Prod_bf16_RING_SIMPLEv 336 bytes stack frame, 408 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Prod_bf16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_ReduceScatter_Prod_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_ReduceScatter_Prod_bf16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 840 bytes spill stores, 1140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 432 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 432 bytes stack frame, 320 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 232 bytes stack frame, 268 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 432 bytes stack frame, 320 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 288 bytes stack frame, 332 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 376 bytes stack frame, 608 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 456 bytes stack frame, 388 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_i64_RING_SIMPLEv 304 bytes stack frame, 368 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i64_RING_LL128v 440 bytes stack frame, 320 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_i64_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 224 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 440 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 224 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 440 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 224 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 440 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 280 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 456 bytes stack frame, 332 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 352 bytes stack frame, 572 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 456 bytes stack frame, 336 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u64_RING_SIMPLEv 256 bytes stack frame, 272 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u64_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4023 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_i64_NVLS_TREE_SIMPLEv 504 bytes stack frame, 1260 bytes spill stores, 2504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_i64_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 344 bytes stack frame, 460 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 546 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_SumPostDiv_u32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u32_RING_LL128v 456 bytes stack frame, 336 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_SumPostDiv_u32_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 504 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 272 bytes stack frame, 292 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 484 bytes spill stores, 700 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 496 bytes stack frame, 384 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 368 bytes stack frame, 504 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 336 bytes stack frame, 404 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 544 bytes stack frame, 476 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 392 bytes stack frame, 540 bytes spill stores, 852 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 1985 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 384 bytes stack frame, 560 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 792 bytes spill stores, 1172 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2955 bytes gmem ptxas info : Compiling entry function '_Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z43ncclDevKernel_ReduceScatter_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, used 16 barriers ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_RING_SIMPLEv 336 bytes stack frame, 408 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_Sum_f16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_ReduceScatter_Sum_f16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_f16_NVLS_SIMPLEv 424 bytes stack frame, 924 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_ReduceScatter_Sum_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 840 bytes spill stores, 1140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 0 bytes gmem ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4007 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_i32_NVLS_TREE_SIMPLEv 512 bytes stack frame, 1308 bytes spill stores, 3408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_i32_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_gather.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2029 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 592 bytes spill stores, 872 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2029 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 592 bytes spill stores, 872 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2029 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 320 bytes stack frame, 360 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 360 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 592 bytes spill stores, 872 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2029 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 368 bytes stack frame, 428 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 392 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 648 bytes spill stores, 1004 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2029 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 464 bytes stack frame, 976 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 384 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1056 bytes spill stores, 1496 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4011 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_AllGather_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers ptxas info : Function properties for _Z33ncclDevFunc_AllGather_RING_SIMPLEv 440 bytes stack frame, 900 bytes spill stores, 1840 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_AllGather_RING_LL128v 344 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_AllGather_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z33ncclDevFunc_AllGather_NVLS_SIMPLEv 496 bytes stack frame, 2368 bytes spill stores, 4748 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllGather_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1120 bytes spill stores, 1460 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling src/device/onerank.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, used 0 barriers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_50' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, used 0 barriers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_60' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, used 0 barriers, 353 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_61' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, used 0 barriers, 353 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 42 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 48 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_70' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 58 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 58 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 66 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_80' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, used 0 barriers, 385 bytes cmem[0] ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : Overriding global maxrregcount 96 with entry-specific value 128 computed using thread count ptxas info : 0 bytes gmem ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIdEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIfEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 65 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI13__nv_bfloat16EEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumI6__halfEEEvPvS4_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumImEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIlEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIjEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIiEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIhEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, used 0 barriers ptxas info : Compiling entry function '__nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb' for 'sm_90' ptxas info : Function properties for __nv_static_32__176c608a_10_onerank_cu_fdf47990__ZN43_GLOBAL__N__176c608a_10_onerank_cu_fdf4799013oneRankReduceI13FuncPreMulSumIaEEEvPvS3_mmb 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, used 0 barriers ../include/utils.h:43:13: warning: 'long int log2i(long int)' defined but not used [-Wunused-function] 43 | static long log2i(long n) { | ^~~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 680 bytes spill stores, 960 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 680 bytes spill stores, 960 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 680 bytes spill stores, 960 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 476 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 528 bytes stack frame, 440 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 692 bytes spill stores, 1028 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 536 bytes stack frame, 816 bytes spill stores, 1492 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 512 bytes stack frame, 420 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1180 bytes spill stores, 1648 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2069 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_SIMPLEv 496 bytes stack frame, 848 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1228 bytes spill stores, 1584 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 696 bytes spill stores, 976 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 696 bytes spill stores, 976 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 352 bytes stack frame, 472 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 488 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 408 bytes stack frame, 696 bytes spill stores, 976 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 384 bytes stack frame, 476 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 528 bytes stack frame, 440 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 424 bytes stack frame, 688 bytes spill stores, 1024 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 544 bytes stack frame, 812 bytes spill stores, 1488 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 512 bytes stack frame, 420 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1176 bytes spill stores, 1644 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_SIMPLEv 496 bytes stack frame, 848 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z61ncclDevFunc_ReduceScatter_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1224 bytes spill stores, 1580 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 480 bytes stack frame, 372 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 488 bytes stack frame, 824 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 456 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 596 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_i8_RING_LL128v 424 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_i8_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 512 bytes spill stores, 716 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 512 bytes spill stores, 716 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 512 bytes spill stores, 716 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 536 bytes stack frame, 452 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 532 bytes spill stores, 832 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 1997 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 448 bytes stack frame, 760 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 840 bytes spill stores, 1224 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2973 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_RING_SIMPLEv 416 bytes stack frame, 748 bytes spill stores, 1288 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_ReduceScatter_MinMax_f16_RING_LL128v 464 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_ReduceScatter_MinMax_f16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_f16_NVLS_SIMPLEv 448 bytes stack frame, 996 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_ReduceScatter_MinMax_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 904 bytes spill stores, 1184 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/broadcast.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 328 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 328 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 280 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 328 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 312 bytes stack frame, 356 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 384 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 424 bytes stack frame, 720 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 376 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 538 bytes gmem ptxas info : Compiling entry function '_Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z31ncclDevKernel_Broadcast_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, used 16 barriers ptxas info : Function properties for _Z33ncclDevFunc_Broadcast_RING_SIMPLEv 368 bytes stack frame, 656 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z32ncclDevFunc_Broadcast_RING_LL128v 336 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z29ncclDevFunc_Broadcast_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 272 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 504 bytes stack frame, 400 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 468 bytes spill stores, 684 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 528 bytes stack frame, 428 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 528 bytes spill stores, 828 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 2041 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 384 bytes stack frame, 560 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 520 bytes stack frame, 424 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 792 bytes spill stores, 1172 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 3039 bytes gmem ptxas info : Compiling entry function '_Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z44ncclDevKernel_ReduceScatter_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, used 16 barriers ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_RING_SIMPLEv 336 bytes stack frame, 408 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_Sum_bf16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_ReduceScatter_Sum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_ReduceScatter_Sum_bf16_NVLS_SIMPLEv 424 bytes stack frame, 924 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_ReduceScatter_Sum_bf16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 840 bytes spill stores, 1140 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 464 bytes stack frame, 356 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 336 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 360 bytes stack frame, 628 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 332 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f64_RING_SIMPLEv 296 bytes stack frame, 396 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f64_RING_LL128v 440 bytes stack frame, 292 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f64_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 456 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 336 bytes stack frame, 440 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 464 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u32_RING_LL128v 440 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 440 bytes stack frame, 304 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 440 bytes stack frame, 296 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 440 bytes stack frame, 296 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 256 bytes stack frame, 288 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 480 bytes stack frame, 384 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 328 bytes stack frame, 440 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f32_RING_SIMPLEv 256 bytes stack frame, 276 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f32_RING_LL128v 432 bytes stack frame, 312 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 448 bytes stack frame, 344 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 312 bytes stack frame, 380 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 480 bytes stack frame, 372 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 496 bytes stack frame, 824 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 456 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 548 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_SumPostDiv_u8_RING_SIMPLEv 384 bytes stack frame, 596 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_SumPostDiv_u8_RING_LL128v 416 bytes stack frame, 312 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_SumPostDiv_u8_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 488 bytes stack frame, 352 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 336 bytes stack frame, 452 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_u64_RING_SIMPLEv 272 bytes stack frame, 300 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u64_RING_LL128v 448 bytes stack frame, 316 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 344 bytes stack frame, 464 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 526 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f32_RING_LL128v 432 bytes stack frame, 304 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 360 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 344 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 344 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 472 bytes stack frame, 376 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 320 bytes stack frame, 416 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f64_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f64_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 440 bytes stack frame, 332 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 440 bytes stack frame, 332 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 344 bytes stack frame, 468 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 531 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f32_RING_SIMPLEv 272 bytes stack frame, 312 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f32_RING_LL128v 432 bytes stack frame, 312 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 456 bytes stack frame, 344 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 264 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 472 bytes stack frame, 376 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 352 bytes stack frame, 580 bytes spill stores, 1008 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 456 bytes stack frame, 356 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u64_RING_SIMPLEv 280 bytes stack frame, 324 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u64_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 256 bytes stack frame, 288 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 480 bytes stack frame, 388 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 368 bytes stack frame, 668 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 530 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_f16_RING_SIMPLEv 304 bytes stack frame, 428 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_f16_RING_LL128v 432 bytes stack frame, 344 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_MinMax_f16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 432 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 472 bytes stack frame, 392 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 288 bytes stack frame, 344 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f64_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f64_RING_LL128v 416 bytes stack frame, 284 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 456 bytes stack frame, 332 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 440 bytes stack frame, 324 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 440 bytes stack frame, 324 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 344 bytes stack frame, 468 bytes spill stores, 860 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 545 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_u32_RING_SIMPLEv 272 bytes stack frame, 316 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u32_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_u32_RING_LLv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 344 bytes stack frame, 464 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 456 bytes stack frame, 352 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 540 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u32_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u32_RING_LL128v 432 bytes stack frame, 304 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u32_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 288 bytes stack frame, 376 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 288 bytes stack frame, 376 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 288 bytes stack frame, 376 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 328 bytes stack frame, 440 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 472 bytes stack frame, 356 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 616 bytes stack frame, 1100 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 464 bytes stack frame, 332 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_MinMax_u8_RING_SIMPLEv 344 bytes stack frame, 672 bytes spill stores, 1028 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_MinMax_u8_RING_LL128v 432 bytes stack frame, 320 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_MinMax_u8_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 448 bytes spill stores, 656 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 448 bytes spill stores, 656 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 280 bytes stack frame, 304 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 352 bytes stack frame, 448 bytes spill stores, 656 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 344 bytes stack frame, 412 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 520 bytes stack frame, 416 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 376 bytes stack frame, 492 bytes spill stores, 744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 400 bytes stack frame, 652 bytes spill stores, 1152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 448 bytes stack frame, 716 bytes spill stores, 1060 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2065 bytes gmem ptxas info : Function properties for _Z52ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_SIMPLEv 352 bytes stack frame, 432 bytes spill stores, 876 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z62ncclDevFunc_ReduceScatter_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 416 bytes stack frame, 760 bytes spill stores, 964 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 324 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 324 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 324 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 472 bytes stack frame, 364 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 368 bytes stack frame, 668 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 544 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_Reduce_MinMax_bf16_RING_SIMPLEv 304 bytes stack frame, 428 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_MinMax_bf16_RING_LL128v 432 bytes stack frame, 344 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_MinMax_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 240 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 440 bytes stack frame, 332 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 224 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 480 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 312 bytes stack frame, 404 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 480 bytes stack frame, 376 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 533 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_f16_RING_SIMPLEv 232 bytes stack frame, 232 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_f16_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_Reduce_PreMulSum_f16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 232 bytes stack frame, 252 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 480 bytes stack frame, 376 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 296 bytes stack frame, 360 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 472 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 528 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_f16_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_f16_RING_LL128v 440 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_f16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 256 bytes stack frame, 288 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 472 bytes stack frame, 372 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 296 bytes stack frame, 360 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 472 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z40ncclDevFunc_Reduce_Prod_bf16_RING_SIMPLEv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_bf16_RING_LL128v 440 bytes stack frame, 328 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_Reduce_Prod_bf16_RING_LLv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 432 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 232 bytes stack frame, 244 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 432 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 312 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 472 bytes stack frame, 380 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 352 bytes stack frame, 480 bytes spill stores, 940 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 448 bytes stack frame, 336 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z39ncclDevFunc_Reduce_Prod_u64_RING_SIMPLEv 264 bytes stack frame, 296 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u64_RING_LL128v 400 bytes stack frame, 260 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z35ncclDevFunc_Reduce_Prod_u64_RING_LLv 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_scatter_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 496 bytes stack frame, 376 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 508 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 496 bytes stack frame, 376 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 508 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 280 bytes stack frame, 300 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 496 bytes stack frame, 376 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 360 bytes stack frame, 508 bytes spill stores, 712 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 528 bytes stack frame, 428 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 384 bytes stack frame, 556 bytes spill stores, 808 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 2053 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 448 bytes stack frame, 760 bytes spill stores, 1356 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 528 bytes stack frame, 432 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 840 bytes spill stores, 1224 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 3057 bytes gmem ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_RING_SIMPLEv 416 bytes stack frame, 748 bytes spill stores, 1288 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LL128v 464 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_ReduceScatter_MinMax_bf16_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_ReduceScatter_MinMax_bf16_NVLS_SIMPLEv 448 bytes stack frame, 996 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z59ncclDevFunc_ReduceScatter_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 904 bytes spill stores, 1184 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 264 bytes stack frame, 316 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 464 bytes stack frame, 356 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 432 bytes stack frame, 776 bytes spill stores, 1300 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 464 bytes stack frame, 348 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_Reduce_PreMulSum_u8_RING_SIMPLEv 336 bytes stack frame, 648 bytes spill stores, 1016 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_Reduce_PreMulSum_u8_RING_LL128v 416 bytes stack frame, 292 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_Reduce_PreMulSum_u8_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 240 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 264 bytes stack frame, 304 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 440 bytes stack frame, 828 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 456 bytes stack frame, 328 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 542 bytes gmem ptxas info : Function properties for _Z38ncclDevFunc_Reduce_Prod_u8_RING_SIMPLEv 336 bytes stack frame, 652 bytes spill stores, 1020 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_Reduce_Prod_u8_RING_LL128v 432 bytes stack frame, 320 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z34ncclDevFunc_Reduce_Prod_u8_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 208 bytes stack frame, 212 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 256 bytes stack frame, 288 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 480 bytes stack frame, 380 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 404 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 480 bytes stack frame, 376 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 547 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_Reduce_PreMulSum_bf16_RING_SIMPLEv 232 bytes stack frame, 232 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_Reduce_PreMulSum_bf16_RING_LL128v 432 bytes stack frame, 300 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_Reduce_PreMulSum_bf16_RING_LLv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1152 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1152 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1152 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 400 bytes stack frame, 912 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 536 bytes stack frame, 1336 bytes spill stores, 2032 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 360 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 424 bytes stack frame, 796 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1396 bytes spill stores, 2588 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1168 bytes spill stores, 2220 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5027 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_TREE_SIMPLEv 520 bytes stack frame, 1324 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_TREE_LL128v 456 bytes stack frame, 284 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u32_RING_SIMPLEv 368 bytes stack frame, 512 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u32_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1560 bytes spill stores, 2468 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u32_COLLNET_CHAIN_SIMPLEv 544 bytes stack frame, 1136 bytes spill stores, 1984 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 344 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1352 bytes spill stores, 2304 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 568 bytes stack frame, 1272 bytes spill stores, 2036 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 376 bytes stack frame, 544 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1732 bytes spill stores, 2900 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1020 bytes spill stores, 1880 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_TREE_SIMPLEv 536 bytes stack frame, 1324 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_TREE_LL128v 464 bytes stack frame, 284 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_TREE_LLv 328 bytes stack frame, 500 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f64_RING_SIMPLEv 336 bytes stack frame, 420 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f64_RING_LL128v 472 bytes stack frame, 348 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1900 bytes spill stores, 2896 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f64_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1128 bytes spill stores, 1992 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 400 bytes stack frame, 912 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 352 bytes stack frame, 560 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1252 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 528 bytes stack frame, 1300 bytes spill stores, 1980 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 360 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 424 bytes stack frame, 796 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1452 bytes spill stores, 2532 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1168 bytes spill stores, 2220 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4887 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_TREE_SIMPLEv 520 bytes stack frame, 1300 bytes spill stores, 2032 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_TREE_LL128v 480 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f32_RING_SIMPLEv 368 bytes stack frame, 596 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1604 bytes spill stores, 2440 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f32_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1140 bytes spill stores, 1988 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 480 bytes stack frame, 344 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 448 bytes stack frame, 312 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1152 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 480 bytes stack frame, 344 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1152 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 480 bytes stack frame, 344 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 440 bytes stack frame, 288 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1152 bytes spill stores, 1912 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 552 bytes stack frame, 404 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1368 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1020 bytes spill stores, 1876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 560 bytes stack frame, 1296 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 552 bytes stack frame, 404 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 368 bytes stack frame, 540 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 408 bytes stack frame, 816 bytes spill stores, 1736 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 504 bytes stack frame, 376 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 752 bytes stack frame, 1808 bytes spill stores, 2964 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 2204 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_TREE_SIMPLEv 536 bytes stack frame, 1336 bytes spill stores, 2064 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LL128v 472 bytes stack frame, 288 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_TREE_LLv 336 bytes stack frame, 508 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f64_RING_SIMPLEv 360 bytes stack frame, 544 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f64_RING_LL128v 456 bytes stack frame, 304 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_DIRECT_SIMPLEv 776 bytes stack frame, 2176 bytes spill stores, 3244 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f64_COLLNET_CHAIN_SIMPLEv 520 bytes stack frame, 1136 bytes spill stores, 2224 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 488 bytes stack frame, 356 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 488 bytes stack frame, 356 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 368 bytes stack frame, 792 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 488 bytes stack frame, 356 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 392 bytes stack frame, 800 bytes spill stores, 1384 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 536 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1392 bytes spill stores, 2352 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1796 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 552 bytes stack frame, 1264 bytes spill stores, 2116 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 536 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 416 bytes stack frame, 832 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1636 bytes spill stores, 3044 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1172 bytes spill stores, 2224 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_TREE_SIMPLEv 544 bytes stack frame, 1396 bytes spill stores, 2124 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_TREE_LL128v 472 bytes stack frame, 296 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_TREE_LLv 328 bytes stack frame, 500 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_u64_RING_SIMPLEv 376 bytes stack frame, 672 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u64_RING_LL128v 464 bytes stack frame, 332 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_u64_COLLNET_DIRECT_SIMPLEv 704 bytes stack frame, 1736 bytes spill stores, 2828 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u64_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1136 bytes spill stores, 2248 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 384 bytes stack frame, 856 bytes spill stores, 1400 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 488 bytes stack frame, 348 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1072 bytes spill stores, 1740 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 968 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 384 bytes stack frame, 856 bytes spill stores, 1400 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 488 bytes stack frame, 348 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1072 bytes spill stores, 1740 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 968 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 384 bytes stack frame, 856 bytes spill stores, 1400 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 488 bytes stack frame, 348 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1072 bytes spill stores, 1740 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 968 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 400 bytes stack frame, 916 bytes spill stores, 1528 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 360 bytes stack frame, 560 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 2148 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1020 bytes spill stores, 1876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 512 bytes stack frame, 1336 bytes spill stores, 2008 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 368 bytes stack frame, 564 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1488 bytes spill stores, 2520 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1216 bytes spill stores, 2236 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4937 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_TREE_SIMPLEv 528 bytes stack frame, 1324 bytes spill stores, 2072 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LL128v 480 bytes stack frame, 324 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f32_RING_SIMPLEv 368 bytes stack frame, 612 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f32_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1632 bytes spill stores, 2516 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1152 bytes spill stores, 2232 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 368 bytes stack frame, 756 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 496 bytes stack frame, 356 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 524 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1208 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1220 bytes spill stores, 2144 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 368 bytes stack frame, 756 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 496 bytes stack frame, 356 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 524 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1208 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1220 bytes spill stores, 2144 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 368 bytes stack frame, 756 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 496 bytes stack frame, 356 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 312 bytes stack frame, 524 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 304 bytes stack frame, 368 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 472 bytes stack frame, 328 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1208 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1220 bytes spill stores, 2144 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 392 bytes stack frame, 800 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 568 bytes stack frame, 396 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 336 bytes stack frame, 408 bytes spill stores, 896 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 520 bytes stack frame, 388 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1328 bytes spill stores, 2184 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1252 bytes spill stores, 2200 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 480 bytes stack frame, 1316 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 552 bytes stack frame, 400 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 472 bytes stack frame, 936 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 680 bytes stack frame, 1748 bytes spill stores, 3024 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 568 bytes stack frame, 1320 bytes spill stores, 2544 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_TREE_SIMPLEv 472 bytes stack frame, 1268 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_TREE_LL128v 472 bytes stack frame, 288 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_TREE_LLv 328 bytes stack frame, 504 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f64_RING_SIMPLEv 424 bytes stack frame, 852 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f64_RING_LL128v 456 bytes stack frame, 316 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f64_COLLNET_DIRECT_SIMPLEv 688 bytes stack frame, 1832 bytes spill stores, 2860 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f64_COLLNET_CHAIN_SIMPLEv 568 bytes stack frame, 1344 bytes spill stores, 2448 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 368 bytes stack frame, 748 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1248 bytes spill stores, 1944 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 984 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 368 bytes stack frame, 748 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1248 bytes spill stores, 1944 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 984 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 368 bytes stack frame, 748 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1248 bytes spill stores, 1944 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 984 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 392 bytes stack frame, 816 bytes spill stores, 1448 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 544 bytes stack frame, 388 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 512 bytes stack frame, 388 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1436 bytes spill stores, 2612 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1020 bytes spill stores, 1876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 568 bytes stack frame, 1328 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 544 bytes stack frame, 388 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 440 bytes stack frame, 872 bytes spill stores, 1868 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 504 bytes stack frame, 376 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1664 bytes spill stores, 3032 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1284 bytes spill stores, 2276 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_TREE_SIMPLEv 560 bytes stack frame, 1440 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LL128v 488 bytes stack frame, 328 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_TREE_LLv 336 bytes stack frame, 508 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u64_RING_SIMPLEv 400 bytes stack frame, 808 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u64_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1780 bytes spill stores, 2836 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u64_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1152 bytes spill stores, 2232 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 392 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 336 bytes stack frame, 420 bytes spill stores, 932 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1280 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1032 bytes spill stores, 1844 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 392 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 336 bytes stack frame, 420 bytes spill stores, 932 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1280 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1032 bytes spill stores, 1844 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 392 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 368 bytes stack frame, 512 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 336 bytes stack frame, 420 bytes spill stores, 932 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1280 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1032 bytes spill stores, 1844 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 392 bytes stack frame, 868 bytes spill stores, 1444 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 528 bytes stack frame, 392 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 392 bytes stack frame, 548 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 360 bytes stack frame, 452 bytes spill stores, 968 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 404 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1380 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 464 bytes stack frame, 1068 bytes spill stores, 1960 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 576 bytes stack frame, 1376 bytes spill stores, 2240 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 528 bytes stack frame, 392 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 392 bytes stack frame, 552 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 456 bytes stack frame, 928 bytes spill stores, 1884 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 2048 bytes spill stores, 3384 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1348 bytes spill stores, 2372 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_SIMPLEv 568 bytes stack frame, 1504 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LL128v 504 bytes stack frame, 348 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_TREE_LLv 400 bytes stack frame, 560 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i64_RING_SIMPLEv 424 bytes stack frame, 764 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LL128v 488 bytes stack frame, 384 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i64_RING_LLv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 2020 bytes spill stores, 3100 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i64_COLLNET_CHAIN_SIMPLEv 560 bytes stack frame, 1356 bytes spill stores, 2540 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 472 bytes stack frame, 332 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1096 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 472 bytes stack frame, 332 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1096 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 472 bytes stack frame, 332 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 352 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1096 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 400 bytes stack frame, 916 bytes spill stores, 1528 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 552 bytes stack frame, 400 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 360 bytes stack frame, 568 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 512 bytes stack frame, 384 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 1864 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1020 bytes spill stores, 1876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 544 bytes stack frame, 1360 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 368 bytes stack frame, 564 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 408 bytes stack frame, 792 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 504 bytes stack frame, 380 bytes spill stores, 592 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1440 bytes spill stores, 2620 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1216 bytes spill stores, 2236 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5077 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_TREE_SIMPLEv 544 bytes stack frame, 1336 bytes spill stores, 2036 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LL128v 464 bytes stack frame, 292 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_TREE_LLv 344 bytes stack frame, 544 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_u32_RING_SIMPLEv 368 bytes stack frame, 608 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u32_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1564 bytes spill stores, 2432 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1152 bytes spill stores, 2232 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 368 bytes stack frame, 840 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1088 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1024 bytes spill stores, 1760 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 368 bytes stack frame, 840 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1088 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1024 bytes spill stores, 1760 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 368 bytes stack frame, 840 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1088 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1024 bytes spill stores, 1760 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 392 bytes stack frame, 896 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 360 bytes stack frame, 552 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1092 bytes spill stores, 1784 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 544 bytes stack frame, 1336 bytes spill stores, 2016 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 368 bytes stack frame, 568 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 432 bytes stack frame, 852 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1464 bytes spill stores, 2612 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1260 bytes spill stores, 2136 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_SIMPLEv 544 bytes stack frame, 1328 bytes spill stores, 2028 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LL128v 456 bytes stack frame, 284 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_i32_RING_SIMPLEv 384 bytes stack frame, 692 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_i32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1568 bytes spill stores, 2476 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i32_COLLNET_CHAIN_SIMPLEv 544 bytes stack frame, 1288 bytes spill stores, 2092 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 376 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 320 bytes stack frame, 376 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 448 bytes stack frame, 1020 bytes spill stores, 1920 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 376 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 320 bytes stack frame, 376 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 448 bytes stack frame, 1020 bytes spill stores, 1920 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 376 bytes stack frame, 840 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 376 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 320 bytes stack frame, 376 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 448 bytes stack frame, 1020 bytes spill stores, 1920 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 392 bytes stack frame, 812 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 528 bytes stack frame, 416 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 344 bytes stack frame, 420 bytes spill stores, 916 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 404 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1352 bytes spill stores, 2256 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 464 bytes stack frame, 1076 bytes spill stores, 1952 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 576 bytes stack frame, 1356 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 528 bytes stack frame, 400 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 532 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 432 bytes stack frame, 848 bytes spill stores, 1848 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1924 bytes spill stores, 3196 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1376 bytes spill stores, 2416 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_SIMPLEv 568 bytes stack frame, 1452 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LL128v 504 bytes stack frame, 348 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_TREE_LLv 368 bytes stack frame, 556 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u64_RING_SIMPLEv 408 bytes stack frame, 708 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LL128v 488 bytes stack frame, 384 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u64_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 1948 bytes spill stores, 3000 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u64_COLLNET_CHAIN_SIMPLEv 560 bytes stack frame, 1400 bytes spill stores, 2528 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 368 bytes stack frame, 800 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1016 bytes spill stores, 1756 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 368 bytes stack frame, 800 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1016 bytes spill stores, 1756 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 368 bytes stack frame, 800 bytes spill stores, 1336 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1016 bytes spill stores, 1756 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 392 bytes stack frame, 896 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 360 bytes stack frame, 552 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1072 bytes spill stores, 1772 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 544 bytes stack frame, 1344 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 368 bytes stack frame, 568 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 440 bytes stack frame, 868 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1484 bytes spill stores, 2656 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 528 bytes stack frame, 1260 bytes spill stores, 2316 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5087 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_SIMPLEv 536 bytes stack frame, 1320 bytes spill stores, 2020 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LL128v 456 bytes stack frame, 284 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_SumPostDiv_u32_RING_SIMPLEv 384 bytes stack frame, 684 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_SumPostDiv_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1616 bytes spill stores, 2500 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u32_COLLNET_CHAIN_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 2116 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 424 bytes stack frame, 956 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 528 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 560 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 448 bytes stack frame, 320 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1220 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1028 bytes spill stores, 1804 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 424 bytes stack frame, 956 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 528 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 560 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 448 bytes stack frame, 312 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1220 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1028 bytes spill stores, 1804 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 424 bytes stack frame, 956 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 528 bytes stack frame, 392 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 328 bytes stack frame, 560 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 448 bytes stack frame, 312 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1220 bytes spill stores, 1776 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1028 bytes spill stores, 1804 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 448 bytes stack frame, 1028 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 568 bytes stack frame, 404 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 360 bytes stack frame, 564 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 336 bytes stack frame, 412 bytes spill stores, 904 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 504 bytes stack frame, 388 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1236 bytes spill stores, 1852 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1084 bytes spill stores, 1928 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 536 bytes stack frame, 1372 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 568 bytes stack frame, 404 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 368 bytes stack frame, 576 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 416 bytes stack frame, 852 bytes spill stores, 1784 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1704 bytes spill stores, 2796 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 560 bytes stack frame, 1308 bytes spill stores, 2532 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_TREE_SIMPLEv 520 bytes stack frame, 1364 bytes spill stores, 2116 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_TREE_LL128v 520 bytes stack frame, 332 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_TREE_LLv 344 bytes stack frame, 536 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f32_RING_SIMPLEv 384 bytes stack frame, 692 bytes spill stores, 1516 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f32_RING_LL128v 448 bytes stack frame, 304 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f32_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1792 bytes spill stores, 2868 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f32_COLLNET_CHAIN_SIMPLEv 568 bytes stack frame, 1344 bytes spill stores, 2444 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 440 bytes stack frame, 1068 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 560 bytes stack frame, 460 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 320 bytes stack frame, 396 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 448 bytes stack frame, 328 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 488 bytes stack frame, 352 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 312 bytes stack frame, 520 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 480 bytes stack frame, 360 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1096 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 440 bytes stack frame, 1068 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 560 bytes stack frame, 460 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 320 bytes stack frame, 396 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 448 bytes stack frame, 328 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 400 bytes stack frame, 924 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 528 bytes stack frame, 452 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1276 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 560 bytes stack frame, 1412 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 400 bytes stack frame, 700 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1516 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1224 bytes spill stores, 2112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4907 bytes gmem ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_TREE_SIMPLEv 552 bytes stack frame, 1356 bytes spill stores, 2168 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_TREE_LL128v 464 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_TREE_LLv 336 bytes stack frame, 512 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_f16_RING_SIMPLEv 368 bytes stack frame, 496 bytes spill stores, 1044 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_f16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Prod_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_f16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1660 bytes spill stores, 2740 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_f16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1280 bytes spill stores, 2176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 300 bytes spill stores, 504 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 536 bytes stack frame, 1344 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 464 bytes stack frame, 296 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 344 bytes stack frame, 472 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1424 bytes spill stores, 2360 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1128 bytes spill stores, 1980 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 300 bytes spill stores, 504 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 536 bytes stack frame, 1344 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 464 bytes stack frame, 296 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 344 bytes stack frame, 472 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1424 bytes spill stores, 2360 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1128 bytes spill stores, 1980 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 300 bytes spill stores, 504 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 536 bytes stack frame, 1344 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 464 bytes stack frame, 296 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 344 bytes stack frame, 472 bytes spill stores, 1128 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1424 bytes spill stores, 2360 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1128 bytes spill stores, 1980 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 152 bytes stack frame, 304 bytes spill stores, 512 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 576 bytes stack frame, 1448 bytes spill stores, 2284 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 536 bytes stack frame, 380 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 352 bytes stack frame, 548 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 360 bytes stack frame, 464 bytes spill stores, 1068 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 472 bytes stack frame, 340 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 576 bytes stack frame, 1528 bytes spill stores, 2316 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1220 bytes spill stores, 2116 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 152 bytes stack frame, 308 bytes spill stores, 512 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 984 bytes stack frame, 2904 bytes spill stores, 4144 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 536 bytes stack frame, 380 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 360 bytes stack frame, 556 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 560 bytes stack frame, 1128 bytes spill stores, 2492 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 464 bytes stack frame, 328 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 864 bytes stack frame, 3384 bytes spill stores, 5684 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 640 bytes stack frame, 1948 bytes spill stores, 3360 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 276 bytes spill stores, 468 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z38ncclDevKernel_AllReduce_Sum_u8_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, used 16 barriers ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_TREE_SIMPLEv 992 bytes stack frame, 3108 bytes spill stores, 4248 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_TREE_LL128v 448 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_TREE_LLv 344 bytes stack frame, 520 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u8_RING_SIMPLEv 544 bytes stack frame, 1212 bytes spill stores, 2108 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Sum_u8_RING_LL128v 432 bytes stack frame, 292 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z36ncclDevFunc_AllReduce_Sum_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u8_COLLNET_DIRECT_SIMPLEv 880 bytes stack frame, 3952 bytes spill stores, 6464 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_Sum_u8_COLLNET_CHAIN_SIMPLEv 664 bytes stack frame, 2048 bytes spill stores, 3396 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 440 bytes stack frame, 1056 bytes spill stores, 1804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 568 bytes stack frame, 460 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 496 bytes stack frame, 1056 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 408 bytes stack frame, 896 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 488 bytes stack frame, 344 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 320 bytes stack frame, 528 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 488 bytes stack frame, 1104 bytes spill stores, 1688 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 960 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 440 bytes stack frame, 1056 bytes spill stores, 1804 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 568 bytes stack frame, 460 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 440 bytes stack frame, 308 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 480 bytes stack frame, 1112 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 432 bytes stack frame, 952 bytes spill stores, 1612 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 352 bytes stack frame, 528 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 520 bytes stack frame, 408 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1172 bytes spill stores, 1856 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1020 bytes spill stores, 1876 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 592 bytes stack frame, 1560 bytes spill stores, 2436 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 360 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 440 bytes stack frame, 864 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 512 bytes stack frame, 404 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 648 bytes stack frame, 1568 bytes spill stores, 2720 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 512 bytes stack frame, 1304 bytes spill stores, 2324 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4957 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_TREE_SIMPLEv 568 bytes stack frame, 1528 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LL128v 480 bytes stack frame, 312 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_TREE_LLv 336 bytes stack frame, 504 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_f16_RING_SIMPLEv 368 bytes stack frame, 540 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_f16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_PreMulSum_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1692 bytes spill stores, 2732 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_f16_COLLNET_CHAIN_SIMPLEv 520 bytes stack frame, 1264 bytes spill stores, 2364 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1196 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 524 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1408 bytes spill stores, 2344 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1164 bytes spill stores, 2084 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1196 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 524 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1408 bytes spill stores, 2344 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1164 bytes spill stores, 2084 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 464 bytes stack frame, 1196 bytes spill stores, 1916 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 504 bytes stack frame, 360 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 376 bytes stack frame, 524 bytes spill stores, 1644 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 424 bytes stack frame, 284 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1408 bytes spill stores, 2344 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1164 bytes spill stores, 2084 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 496 bytes stack frame, 1312 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 360 bytes stack frame, 556 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 416 bytes stack frame, 776 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1476 bytes spill stores, 2068 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1212 bytes spill stores, 2244 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1048 bytes stack frame, 3100 bytes spill stores, 4276 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 536 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 368 bytes stack frame, 556 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 680 bytes stack frame, 1244 bytes spill stores, 2608 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 480 bytes stack frame, 348 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 792 bytes stack frame, 3324 bytes spill stores, 5640 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 832 bytes stack frame, 2108 bytes spill stores, 3516 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_TREE_SIMPLEv 1104 bytes stack frame, 3408 bytes spill stores, 4608 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_TREE_LL128v 464 bytes stack frame, 280 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_TREE_LLv 352 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u8_RING_SIMPLEv 600 bytes stack frame, 1200 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_MinMax_u8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_MinMax_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u8_COLLNET_DIRECT_SIMPLEv 936 bytes stack frame, 4204 bytes spill stores, 6844 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_MinMax_u8_COLLNET_CHAIN_SIMPLEv 656 bytes stack frame, 2012 bytes spill stores, 3348 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 408 bytes stack frame, 1020 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 480 bytes stack frame, 336 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 480 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1540 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1160 bytes spill stores, 1944 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 408 bytes stack frame, 1020 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 480 bytes stack frame, 336 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 480 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1540 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1160 bytes spill stores, 1944 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 408 bytes stack frame, 1020 bytes spill stores, 1652 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 480 bytes stack frame, 336 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 352 bytes stack frame, 480 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 424 bytes stack frame, 280 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1540 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1160 bytes spill stores, 1944 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 440 bytes stack frame, 1080 bytes spill stores, 1864 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 352 bytes stack frame, 544 bytes spill stores, 748 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 368 bytes stack frame, 468 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1644 bytes spill stores, 2532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1224 bytes spill stores, 2120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 976 bytes stack frame, 2976 bytes spill stores, 4192 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 360 bytes stack frame, 556 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 560 bytes stack frame, 1168 bytes spill stores, 2516 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 480 bytes stack frame, 336 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 3408 bytes spill stores, 5704 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 640 bytes stack frame, 1988 bytes spill stores, 3452 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_TREE_SIMPLEv 976 bytes stack frame, 3148 bytes spill stores, 4268 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_TREE_LL128v 464 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_TREE_LLv 344 bytes stack frame, 520 bytes spill stores, 864 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Prod_u8_RING_SIMPLEv 544 bytes stack frame, 1188 bytes spill stores, 2108 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Prod_u8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Prod_u8_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Prod_u8_COLLNET_DIRECT_SIMPLEv 880 bytes stack frame, 3968 bytes spill stores, 6492 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Prod_u8_COLLNET_CHAIN_SIMPLEv 664 bytes stack frame, 2068 bytes spill stores, 3408 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 448 bytes stack frame, 980 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 696 bytes stack frame, 300 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 352 bytes stack frame, 544 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 480 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1276 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1180 bytes spill stores, 2120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 560 bytes stack frame, 1412 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 400 bytes stack frame, 700 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1520 bytes spill stores, 2628 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1224 bytes spill stores, 2112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_TREE_SIMPLEv 552 bytes stack frame, 1356 bytes spill stores, 2168 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_TREE_LL128v 464 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_TREE_LLv 336 bytes stack frame, 512 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_Prod_bf16_RING_SIMPLEv 368 bytes stack frame, 496 bytes spill stores, 1044 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Prod_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z39ncclDevFunc_AllReduce_Prod_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_Prod_bf16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1660 bytes spill stores, 2740 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Prod_bf16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1280 bytes spill stores, 2176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 776 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 776 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 376 bytes stack frame, 776 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 512 bytes stack frame, 360 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 340 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 400 bytes stack frame, 884 bytes spill stores, 1492 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 552 bytes stack frame, 388 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 360 bytes stack frame, 540 bytes spill stores, 736 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 520 bytes stack frame, 384 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1156 bytes spill stores, 1836 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1072 bytes spill stores, 1792 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 552 bytes stack frame, 1312 bytes spill stores, 2108 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 560 bytes stack frame, 388 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 368 bytes stack frame, 548 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 424 bytes stack frame, 832 bytes spill stores, 1972 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 520 bytes stack frame, 376 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1428 bytes spill stores, 2524 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1188 bytes spill stores, 2072 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 9077 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_TREE_SIMPLEv 560 bytes stack frame, 1364 bytes spill stores, 2084 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_TREE_LL128v 504 bytes stack frame, 340 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_TREE_LLv 336 bytes stack frame, 508 bytes spill stores, 804 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_RING_SIMPLEv 384 bytes stack frame, 780 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u64_RING_LL128v 480 bytes stack frame, 344 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u64_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_u64_NVLS_TREE_SIMPLEv 504 bytes stack frame, 1260 bytes spill stores, 2504 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u64_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u64_COLLNET_DIRECT_SIMPLEv 632 bytes stack frame, 1564 bytes spill stores, 2424 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u64_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1140 bytes spill stores, 2236 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 368 bytes stack frame, 788 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 472 bytes stack frame, 316 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 432 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 368 bytes stack frame, 788 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 472 bytes stack frame, 328 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 432 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 368 bytes stack frame, 788 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 472 bytes stack frame, 328 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 336 bytes stack frame, 568 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 432 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 992 bytes spill stores, 1744 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 392 bytes stack frame, 900 bytes spill stores, 1512 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 560 bytes stack frame, 396 bytes spill stores, 604 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 360 bytes stack frame, 560 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 496 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 528 bytes stack frame, 1152 bytes spill stores, 1784 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1072 bytes spill stores, 1792 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5047 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 544 bytes stack frame, 1352 bytes spill stores, 2100 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 568 bytes stack frame, 404 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 368 bytes stack frame, 564 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 416 bytes stack frame, 824 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 496 bytes stack frame, 356 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1368 bytes spill stores, 2408 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1216 bytes spill stores, 2236 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 9041 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_TREE_SIMPLEv 536 bytes stack frame, 1356 bytes spill stores, 2060 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_TREE_LL128v 456 bytes stack frame, 272 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_TREE_LLv 344 bytes stack frame, 548 bytes spill stores, 884 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_RING_SIMPLEv 376 bytes stack frame, 660 bytes spill stores, 1264 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_u32_RING_LL128v 448 bytes stack frame, 304 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_u32_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_u32_NVLS_TREE_SIMPLEv 512 bytes stack frame, 1308 bytes spill stores, 3408 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_u32_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_u32_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1512 bytes spill stores, 2256 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1128 bytes spill stores, 2240 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 392 bytes stack frame, 828 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 480 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 320 bytes stack frame, 532 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 464 bytes stack frame, 336 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1168 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 972 bytes spill stores, 1740 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 284 bytes spill stores, 468 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 416 bytes stack frame, 932 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 344 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1352 bytes spill stores, 2304 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 288 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 568 bytes stack frame, 1272 bytes spill stores, 2036 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 352 bytes stack frame, 536 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 376 bytes stack frame, 544 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1732 bytes spill stores, 2900 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1020 bytes spill stores, 1880 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 8771 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 252 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_TREE_SIMPLEv 536 bytes stack frame, 1324 bytes spill stores, 2040 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_TREE_LL128v 464 bytes stack frame, 284 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_TREE_LLv 328 bytes stack frame, 500 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_RING_SIMPLEv 336 bytes stack frame, 420 bytes spill stores, 900 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f64_RING_LL128v 472 bytes stack frame, 348 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f64_NVLS_TREE_SIMPLEv 512 bytes stack frame, 1188 bytes spill stores, 3184 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f64_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1900 bytes spill stores, 2896 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f64_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1128 bytes spill stores, 1992 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 536 bytes stack frame, 1468 bytes spill stores, 2284 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 808 bytes stack frame, 412 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 492 bytes spill stores, 1148 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1432 bytes spill stores, 2368 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1164 bytes spill stores, 1952 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 536 bytes stack frame, 1468 bytes spill stores, 2284 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 808 bytes stack frame, 412 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 492 bytes spill stores, 1148 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1432 bytes spill stores, 2368 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1164 bytes spill stores, 1952 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 536 bytes stack frame, 1468 bytes spill stores, 2284 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 808 bytes stack frame, 412 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 344 bytes stack frame, 560 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 344 bytes stack frame, 492 bytes spill stores, 1148 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 424 bytes stack frame, 276 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1432 bytes spill stores, 2368 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 440 bytes stack frame, 1164 bytes spill stores, 1952 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 584 bytes stack frame, 1584 bytes spill stores, 2448 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 840 bytes stack frame, 444 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 368 bytes stack frame, 552 bytes spill stores, 868 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 368 bytes stack frame, 508 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 472 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1572 bytes spill stores, 2572 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 480 bytes stack frame, 1224 bytes spill stores, 2120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1288 bytes stack frame, 3892 bytes spill stores, 5016 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 840 bytes stack frame, 448 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 368 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 576 bytes stack frame, 1224 bytes spill stores, 2528 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 720 bytes stack frame, 3232 bytes spill stores, 5492 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 648 bytes stack frame, 1972 bytes spill stores, 3392 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_TREE_SIMPLEv 1296 bytes stack frame, 4164 bytes spill stores, 5204 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LL128v 768 bytes stack frame, 376 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_TREE_LLv 352 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_PreMulSum_u8_RING_SIMPLEv 528 bytes stack frame, 1172 bytes spill stores, 2152 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_PreMulSum_u8_RING_LL128v 448 bytes stack frame, 316 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_PreMulSum_u8_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_DIRECT_SIMPLEv 768 bytes stack frame, 3732 bytes spill stores, 6244 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_PreMulSum_u8_COLLNET_CHAIN_SIMPLEv 664 bytes stack frame, 2068 bytes spill stores, 3444 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 368 bytes stack frame, 744 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 368 bytes stack frame, 744 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 288 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 368 bytes stack frame, 744 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 312 bytes stack frame, 536 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 480 bytes stack frame, 368 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1240 bytes spill stores, 1984 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 284 bytes spill stores, 468 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 392 bytes stack frame, 824 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 344 bytes stack frame, 532 bytes spill stores, 708 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 512 bytes stack frame, 408 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 616 bytes stack frame, 1352 bytes spill stores, 2256 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 288 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 568 bytes stack frame, 1336 bytes spill stores, 2184 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 352 bytes stack frame, 544 bytes spill stores, 728 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 416 bytes stack frame, 804 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 512 bytes stack frame, 412 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1788 bytes spill stores, 3164 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1168 bytes spill stores, 2220 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 9023 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 252 bytes spill stores, 436 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u64_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, used 16 barriers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_TREE_SIMPLEv 560 bytes stack frame, 1436 bytes spill stores, 2172 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_TREE_LL128v 472 bytes stack frame, 304 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_TREE_LLv 328 bytes stack frame, 500 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_RING_SIMPLEv 368 bytes stack frame, 596 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u64_RING_LL128v 488 bytes stack frame, 380 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u64_RING_LLv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_u64_NVLS_TREE_SIMPLEv 512 bytes stack frame, 1228 bytes spill stores, 3188 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u64_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u64_COLLNET_DIRECT_SIMPLEv 728 bytes stack frame, 1880 bytes spill stores, 2980 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u64_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1136 bytes spill stores, 2232 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4877 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4877 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4877 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 496 bytes stack frame, 360 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1076 bytes spill stores, 1748 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4877 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 160 bytes stack frame, 288 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 400 bytes stack frame, 912 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 352 bytes stack frame, 560 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1252 bytes spill stores, 2112 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4877 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 160 bytes stack frame, 288 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 528 bytes stack frame, 1300 bytes spill stores, 1980 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 360 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 424 bytes stack frame, 796 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 600 bytes stack frame, 1452 bytes spill stores, 2532 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1168 bytes spill stores, 2220 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 8735 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 256 bytes spill stores, 428 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, used 16 barriers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_TREE_SIMPLEv 520 bytes stack frame, 1300 bytes spill stores, 2032 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_TREE_LL128v 480 bytes stack frame, 316 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_RING_SIMPLEv 368 bytes stack frame, 596 bytes spill stores, 1176 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f32_NVLS_TREE_SIMPLEv 608 bytes stack frame, 1564 bytes spill stores, 2780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f32_NVLS_SIMPLEv 640 bytes stack frame, 1852 bytes spill stores, 3632 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f32_COLLNET_DIRECT_SIMPLEv 592 bytes stack frame, 1604 bytes spill stores, 2440 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f32_COLLNET_CHAIN_SIMPLEv 552 bytes stack frame, 1140 bytes spill stores, 1988 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5017 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5017 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5017 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 276 bytes spill stores, 452 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 384 bytes stack frame, 852 bytes spill stores, 1420 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 464 bytes stack frame, 324 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 328 bytes stack frame, 564 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 304 bytes stack frame, 356 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 448 bytes stack frame, 324 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1084 bytes spill stores, 1756 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5017 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 160 bytes stack frame, 288 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 400 bytes stack frame, 912 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 528 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 352 bytes stack frame, 552 bytes spill stores, 764 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 552 bytes stack frame, 1180 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5017 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 160 bytes stack frame, 288 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 544 bytes stack frame, 1332 bytes spill stores, 2016 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 360 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 424 bytes stack frame, 796 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 504 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 624 bytes stack frame, 1392 bytes spill stores, 2568 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 536 bytes stack frame, 1168 bytes spill stores, 2220 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 8987 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 256 bytes spill stores, 428 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_u32_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, used 16 barriers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_TREE_SIMPLEv 536 bytes stack frame, 1324 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_TREE_LL128v 456 bytes stack frame, 284 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_TREE_LLv 344 bytes stack frame, 540 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_RING_SIMPLEv 368 bytes stack frame, 512 bytes spill stores, 1204 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_u32_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_u32_RING_LLv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_u32_NVLS_TREE_SIMPLEv 504 bytes stack frame, 1336 bytes spill stores, 3448 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_u32_NVLS_SIMPLEv 640 bytes stack frame, 1676 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_u32_COLLNET_DIRECT_SIMPLEv 640 bytes stack frame, 1548 bytes spill stores, 2444 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_u32_COLLNET_CHAIN_SIMPLEv 544 bytes stack frame, 1136 bytes spill stores, 1984 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 296 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 440 bytes stack frame, 1068 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 560 bytes stack frame, 460 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 320 bytes stack frame, 396 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 448 bytes stack frame, 328 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 120 bytes stack frame, 276 bytes spill stores, 416 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 384 bytes stack frame, 860 bytes spill stores, 1428 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 488 bytes stack frame, 352 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 312 bytes stack frame, 520 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 480 bytes stack frame, 360 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1096 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 976 bytes spill stores, 1752 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 296 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4416 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 440 bytes stack frame, 1068 bytes spill stores, 1796 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 560 bytes stack frame, 460 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 320 bytes stack frame, 396 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 448 bytes stack frame, 328 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 504 bytes stack frame, 1068 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 432 bytes stack frame, 1000 bytes spill stores, 1748 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 280 bytes spill stores, 472 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 400 bytes stack frame, 924 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 344 bytes stack frame, 508 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 328 bytes stack frame, 400 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 528 bytes stack frame, 452 bytes spill stores, 656 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 560 bytes stack frame, 1276 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 456 bytes stack frame, 1080 bytes spill stores, 1776 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 4897 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 292 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 560 bytes stack frame, 1412 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 400 bytes stack frame, 700 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1516 bytes spill stores, 2616 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1224 bytes spill stores, 2112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 8771 bytes gmem ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 260 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z39ncclDevKernel_AllReduce_Sum_f16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, used 16 barriers ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_TREE_SIMPLEv 552 bytes stack frame, 1356 bytes spill stores, 2168 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_TREE_LL128v 464 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_TREE_LLv 336 bytes stack frame, 512 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_RING_SIMPLEv 368 bytes stack frame, 496 bytes spill stores, 1044 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_Sum_f16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z37ncclDevFunc_AllReduce_Sum_f16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_Sum_f16_NVLS_TREE_SIMPLEv 624 bytes stack frame, 1680 bytes spill stores, 2884 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_f16_NVLS_SIMPLEv 664 bytes stack frame, 1968 bytes spill stores, 3132 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_f16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1660 bytes spill stores, 2740 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_Sum_f16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1280 bytes spill stores, 2176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 640 bytes stack frame, 1720 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 588 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1528 bytes spill stores, 2472 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1156 bytes spill stores, 2092 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 640 bytes stack frame, 1720 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 588 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1528 bytes spill stores, 2472 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1156 bytes spill stores, 2092 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 640 bytes stack frame, 1720 bytes spill stores, 2552 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 392 bytes stack frame, 588 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1528 bytes spill stores, 2472 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1156 bytes spill stores, 2092 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 632 bytes stack frame, 1772 bytes spill stores, 2628 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 832 bytes stack frame, 464 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 368 bytes stack frame, 544 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 416 bytes stack frame, 564 bytes spill stores, 1168 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1656 bytes spill stores, 2480 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1216 bytes spill stores, 2140 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1128 bytes stack frame, 3836 bytes spill stores, 5064 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 840 bytes stack frame, 476 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 368 bytes stack frame, 568 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 600 bytes stack frame, 1140 bytes spill stores, 2492 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 856 bytes stack frame, 3532 bytes spill stores, 5768 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 648 bytes stack frame, 2080 bytes spill stores, 3480 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5107 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_SIMPLEv 1120 bytes stack frame, 4068 bytes spill stores, 5224 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LL128v 760 bytes stack frame, 396 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_TREE_LLv 352 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_u8_RING_SIMPLEv 544 bytes stack frame, 1248 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LL128v 456 bytes stack frame, 312 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_u8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 4092 bytes spill stores, 6572 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_u8_COLLNET_CHAIN_SIMPLEv 656 bytes stack frame, 2140 bytes spill stores, 3452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 648 bytes stack frame, 1592 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 592 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1492 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 2088 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 648 bytes stack frame, 1592 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 592 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1492 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 2088 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 648 bytes stack frame, 1592 bytes spill stores, 2404 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 800 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 336 bytes stack frame, 556 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 392 bytes stack frame, 592 bytes spill stores, 1216 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1492 bytes spill stores, 2412 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1152 bytes spill stores, 2088 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 632 bytes stack frame, 1772 bytes spill stores, 2624 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 832 bytes stack frame, 464 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 368 bytes stack frame, 544 bytes spill stores, 856 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 408 bytes stack frame, 560 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 568 bytes stack frame, 1656 bytes spill stores, 2500 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1196 bytes spill stores, 2100 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1128 bytes stack frame, 3816 bytes spill stores, 5092 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 840 bytes stack frame, 476 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 368 bytes stack frame, 568 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 600 bytes stack frame, 1140 bytes spill stores, 2492 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 488 bytes stack frame, 356 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 856 bytes stack frame, 3496 bytes spill stores, 5744 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 648 bytes stack frame, 2076 bytes spill stores, 3480 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_SIMPLEv 1128 bytes stack frame, 4084 bytes spill stores, 5256 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LL128v 760 bytes stack frame, 396 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_TREE_LLv 352 bytes stack frame, 532 bytes spill stores, 880 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_SumPostDiv_i8_RING_SIMPLEv 552 bytes stack frame, 1256 bytes spill stores, 2192 bytes spill loads ptxas info : Function properties for _Z46ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LL128v 456 bytes stack frame, 312 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_SumPostDiv_i8_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_DIRECT_SIMPLEv 872 bytes stack frame, 4092 bytes spill stores, 6572 bytes spill loads ptxas info : Function properties for _Z56ncclDevFunc_AllReduce_SumPostDiv_i8_COLLNET_CHAIN_SIMPLEv 656 bytes stack frame, 2140 bytes spill stores, 3452 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 1008 bytes spill stores, 1684 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 560 bytes stack frame, 448 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1236 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1184 bytes spill stores, 2124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 1008 bytes spill stores, 1684 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 560 bytes stack frame, 448 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 456 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1236 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1168 bytes spill stores, 2104 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 424 bytes stack frame, 1008 bytes spill stores, 1684 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 560 bytes stack frame, 448 bytes spill stores, 680 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 464 bytes stack frame, 344 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1236 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1184 bytes spill stores, 2124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 448 bytes stack frame, 1060 bytes spill stores, 1788 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 616 bytes stack frame, 496 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 360 bytes stack frame, 556 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 488 bytes stack frame, 368 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1340 bytes spill stores, 1888 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1236 bytes spill stores, 2196 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 4927 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 528 bytes stack frame, 1488 bytes spill stores, 2384 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 552 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 472 bytes stack frame, 1000 bytes spill stores, 2252 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1916 bytes spill stores, 3084 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 576 bytes stack frame, 1380 bytes spill stores, 2620 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 8825 bytes gmem ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_TREE_SIMPLEv 560 bytes stack frame, 1540 bytes spill stores, 2448 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_TREE_LL128v 456 bytes stack frame, 280 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_TREE_LLv 336 bytes stack frame, 516 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_RING_SIMPLEv 424 bytes stack frame, 916 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z43ncclDevFunc_AllReduce_MinMax_f16_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z40ncclDevFunc_AllReduce_MinMax_f16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z49ncclDevFunc_AllReduce_MinMax_f16_NVLS_TREE_SIMPLEv 600 bytes stack frame, 1728 bytes spill stores, 3072 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_f16_NVLS_SIMPLEv 680 bytes stack frame, 2040 bytes spill stores, 2928 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_f16_COLLNET_DIRECT_SIMPLEv 736 bytes stack frame, 2088 bytes spill stores, 3356 bytes spill loads ptxas info : Function properties for _Z53ncclDevFunc_AllReduce_MinMax_f16_COLLNET_CHAIN_SIMPLEv 576 bytes stack frame, 1428 bytes spill stores, 2596 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 920 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1236 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1168 bytes spill stores, 2108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 920 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1236 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1168 bytes spill stores, 2108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 424 bytes stack frame, 920 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 552 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 312 bytes stack frame, 372 bytes spill stores, 852 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 440 bytes stack frame, 316 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1236 bytes spill stores, 1876 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1168 bytes spill stores, 2108 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 448 bytes stack frame, 976 bytes spill stores, 1684 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 696 bytes stack frame, 304 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 360 bytes stack frame, 532 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 336 bytes stack frame, 424 bytes spill stores, 928 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 488 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1412 bytes spill stores, 2032 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1240 bytes spill stores, 2184 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 592 bytes stack frame, 1560 bytes spill stores, 2424 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 536 bytes stack frame, 392 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 360 bytes stack frame, 536 bytes spill stores, 716 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 440 bytes stack frame, 864 bytes spill stores, 1768 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 512 bytes stack frame, 404 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 672 bytes stack frame, 1728 bytes spill stores, 2816 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 512 bytes stack frame, 1304 bytes spill stores, 2324 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5097 bytes gmem ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_SIMPLEv 568 bytes stack frame, 1528 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LL128v 480 bytes stack frame, 312 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_TREE_LLv 336 bytes stack frame, 504 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclDevFunc_AllReduce_PreMulSum_bf16_RING_SIMPLEv 368 bytes stack frame, 540 bytes spill stores, 1108 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_PreMulSum_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1692 bytes spill stores, 2732 bytes spill loads ptxas info : Function properties for _Z57ncclDevFunc_AllReduce_PreMulSum_bf16_COLLNET_CHAIN_SIMPLEv 520 bytes stack frame, 1264 bytes spill stores, 2364 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 296 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_50' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 296 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_60' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 136 bytes stack frame, 296 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_61' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4416 bytes cmem[0], 16 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 424 bytes stack frame, 940 bytes spill stores, 1600 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 664 bytes stack frame, 264 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 328 bytes stack frame, 548 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 304 bytes stack frame, 352 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 432 bytes stack frame, 308 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 520 bytes stack frame, 1152 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1116 bytes spill stores, 2012 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 152 bytes stack frame, 300 bytes spill stores, 496 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_70' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 448 bytes stack frame, 980 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 696 bytes stack frame, 300 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 352 bytes stack frame, 544 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 336 bytes stack frame, 416 bytes spill stores, 912 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 480 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 536 bytes stack frame, 1276 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 496 bytes stack frame, 1180 bytes spill stores, 2120 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 5037 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 144 bytes stack frame, 292 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_80' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, used 16 barriers, 4448 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 560 bytes stack frame, 1412 bytes spill stores, 2176 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 544 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 360 bytes stack frame, 528 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 400 bytes stack frame, 700 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 504 bytes stack frame, 372 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 608 bytes stack frame, 1520 bytes spill stores, 2628 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1224 bytes spill stores, 2112 bytes spill loads ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : Overriding maximum register limit 256 for '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' with 96 of maxrregcount option ptxas info : 9023 bytes gmem ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_TREE_LL24ncclDevKernelArgsStorageILm4096EE 128 bytes stack frame, 260 bytes spill stores, 444 bytes spill loads ptxas info : Used 96 registers, used 16 barriers ptxas info : Compiling entry function '_Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE' for 'sm_90' ptxas info : Function properties for _Z40ncclDevKernel_AllReduce_Sum_bf16_RING_LL24ncclDevKernelArgsStorageILm4096EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, used 16 barriers ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_TREE_SIMPLEv 552 bytes stack frame, 1356 bytes spill stores, 2168 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_TREE_LL128v 464 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_TREE_LLv 336 bytes stack frame, 512 bytes spill stores, 720 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_RING_SIMPLEv 368 bytes stack frame, 496 bytes spill stores, 1044 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_Sum_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z38ncclDevFunc_AllReduce_Sum_bf16_RING_LLv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z47ncclDevFunc_AllReduce_Sum_bf16_NVLS_TREE_SIMPLEv 624 bytes stack frame, 1680 bytes spill stores, 2884 bytes spill loads ptxas info : Function properties for _Z42ncclDevFunc_AllReduce_Sum_bf16_NVLS_SIMPLEv 664 bytes stack frame, 1968 bytes spill stores, 3132 bytes spill loads ptxas info : Function properties for _Z52ncclDevFunc_AllReduce_Sum_bf16_COLLNET_DIRECT_SIMPLEv 656 bytes stack frame, 1660 bytes spill stores, 2740 bytes spill loads ptxas info : Function properties for _Z51ncclDevFunc_AllReduce_Sum_bf16_COLLNET_CHAIN_SIMPLEv 504 bytes stack frame, 1280 bytes spill stores, 2176 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' Compiling build/obj/device/gensrc/all_reduce_minmax_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 924 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1196 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1184 bytes spill stores, 2124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 924 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1196 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1184 bytes spill stores, 2124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 424 bytes stack frame, 924 bytes spill stores, 1584 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 664 bytes stack frame, 268 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 328 bytes stack frame, 552 bytes spill stores, 700 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 312 bytes stack frame, 376 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 512 bytes stack frame, 1196 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 472 bytes stack frame, 1184 bytes spill stores, 2124 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 448 bytes stack frame, 980 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 688 bytes stack frame, 296 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 360 bytes stack frame, 556 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 336 bytes stack frame, 428 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 480 bytes stack frame, 352 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 544 bytes stack frame, 1296 bytes spill stores, 1944 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 488 bytes stack frame, 1256 bytes spill stores, 2204 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 5067 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 528 bytes stack frame, 1488 bytes spill stores, 2384 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 552 bytes stack frame, 400 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 368 bytes stack frame, 536 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 472 bytes stack frame, 1000 bytes spill stores, 2252 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 496 bytes stack frame, 364 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 712 bytes stack frame, 1916 bytes spill stores, 3084 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 576 bytes stack frame, 1380 bytes spill stores, 2620 bytes spill loads ptxas info : Overriding maximum register limit 256 for '__cuda_dummy_entry__' with 96 of maxrregcount option ptxas info : 9077 bytes gmem ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_TREE_SIMPLEv 560 bytes stack frame, 1540 bytes spill stores, 2448 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_TREE_LL128v 456 bytes stack frame, 280 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_TREE_LLv 336 bytes stack frame, 516 bytes spill stores, 724 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_RING_SIMPLEv 424 bytes stack frame, 916 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z44ncclDevFunc_AllReduce_MinMax_bf16_RING_LL128v 440 bytes stack frame, 300 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclDevFunc_AllReduce_MinMax_bf16_RING_LLv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z50ncclDevFunc_AllReduce_MinMax_bf16_NVLS_TREE_SIMPLEv 600 bytes stack frame, 1728 bytes spill stores, 3072 bytes spill loads ptxas info : Function properties for _Z45ncclDevFunc_AllReduce_MinMax_bf16_NVLS_SIMPLEv 680 bytes stack frame, 2040 bytes spill stores, 2928 bytes spill loads ptxas info : Function properties for _Z55ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_DIRECT_SIMPLEv 736 bytes stack frame, 2088 bytes spill stores, 3356 bytes spill loads ptxas info : Function properties for _Z54ncclDevFunc_AllReduce_MinMax_bf16_COLLNET_CHAIN_SIMPLEv 576 bytes stack frame, 1428 bytes spill stores, 2596 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.22.3-1/build/include -I../include --compiler-options "-fPIC -fvisibility=hidden" -dlink /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/common.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/onerank.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_gather.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/all_reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/broadcast.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_prod_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_i32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_i64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_minmax_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_premulsum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_prod_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_scatter_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_bf16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_f16.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_f32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_f64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sum_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_i32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_i64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_i8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_u32.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_u64.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/reduce_sumpostdiv_u8.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/sendrecv.cu.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/host_table.cc.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/genobj/device_table.cu.o -o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/device_glue.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Archiving libnccl_static.a > /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl_static.a mkdir -p /builddir/build/BUILD/nccl-2.22.3-1/build/lib ar cr /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl_static.a /builddir/build/BUILD/nccl-2.22.3-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/group.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/register.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/generic.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/manifest) make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.22.3-1/src' Linking libnccl.so.2.22.3 > /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl.so.2.22.3 mkdir -p /builddir/build/BUILD/nccl-2.22.3-1/build/lib g++ -DCUDA_MAJOR=12 -DCUDA_MINOR=6 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -shared -Wl,--no-as-needed -Wl,-soname,libnccl.so.2 -o /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl.so.2.22.3 /builddir/build/BUILD/nccl-2.22.3-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/collectives.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/group.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/register.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/tuner.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/generic.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.22.3-1/build/obj/enhcompat.o $(cat /builddir/build/BUILD/nccl-2.22.3-1/build/obj/device/manifest) -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -L/usr/local/cuda/lib64 -lcudart_static -lpthread -lrt -ldl ln -sf libnccl.so.2 /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl.so ln -sf libnccl.so.2.22.3 /builddir/build/BUILD/nccl-2.22.3-1/build/lib/libnccl.so.2 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.22.3-1/src' + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.TbIhSg + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 ++ dirname /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.22.3-1 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/include + mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/lib64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/lib64/pkgconfig + cp -d build/lib/libnccl.so build/lib/libnccl.so.2 build/lib/libnccl.so.2.22.3 build/lib/libnccl_static.a /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/lib64 + cp build/include/nccl.h build/include/nccl_net.h /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/include + cp build/lib/pkgconfig/nccl.pc /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/lib64/pkgconfig/ + /usr/bin/find-debuginfo -j80 --strict-build-id -m -i --build-id-seed 2.22.3-1.cuda12.6.an23 --unique-debug-suffix -2.22.3-1.cuda12.6.an23.x86_64 --unique-debug-src-base libnccl-2.22.3-1.cuda12.6.an23.x86_64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 110000000 -S debugsourcefiles.list /builddir/build/BUILD/nccl-2.22.3-1 extracting debug info from /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/lib64/libnccl.so.2.22.3 original debug info size: 8524kB, size after compression: 7256kB /usr/bin/sepdebugcrcfix: Updated 1 CRC32s, 0 CRC32s did match. 3393 blocks + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/anolis/brp-ldconfig + COMPRESS='zstd -f --rm -19 -T0' + COMPRESS_EXT=.zst + /usr/lib/rpm/brp-compress + /usr/lib/rpm/anolis/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/check-rpaths + /usr/lib/rpm/brp-remove-la-files + /usr/lib/rpm/anolis/clean_perl + /usr/lib/rpm/anolis/check_elf_files + /usr/lib/rpm/anolis/brp-mangle-shebangs + /usr/lib/rpm/anolis/remove-info-dir + /usr/lib/rpm/anolis/check-desktop-files + /usr/lib/rpm/anolis/brp-python-bytecompile '' 1 0 + /usr/lib/rpm/anolis/brp-python-hardlink Processing files: libnccl-2.22.3-1.cuda12.6.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.djhuiQ + umask 022 + cd /builddir/build/BUILD + cd nccl-2.22.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl = 2.22.3-1.cuda12.6.an23 libnccl(x86-64) = 2.22.3-1.cuda12.6.an23 libnccl.so.2()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: ld-linux-x86-64.so.2()(64bit) ld-linux-x86-64.so.2(GLIBC_2.3)(64bit) libc.so.6()(64bit) libc.so.6(GLIBC_2.10)(64bit) libc.so.6(GLIBC_2.14)(64bit) libc.so.6(GLIBC_2.17)(64bit) libc.so.6(GLIBC_2.2.5)(64bit) libc.so.6(GLIBC_2.3)(64bit) libc.so.6(GLIBC_2.3.2)(64bit) libc.so.6(GLIBC_2.3.3)(64bit) libc.so.6(GLIBC_2.3.4)(64bit) libc.so.6(GLIBC_2.33)(64bit) libc.so.6(GLIBC_2.34)(64bit) libc.so.6(GLIBC_2.4)(64bit) libc.so.6(GLIBC_2.6)(64bit) libc.so.6(GLIBC_2.7)(64bit) libgcc_s.so.1()(64bit) libgcc_s.so.1(GCC_3.0)(64bit) libm.so.6()(64bit) libm.so.6(GLIBC_2.2.5)(64bit) libstdc++.so.6()(64bit) libstdc++.so.6(CXXABI_1.3)(64bit) libstdc++.so.6(GLIBCXX_3.4)(64bit) libstdc++.so.6(GLIBCXX_3.4.11)(64bit) libstdc++.so.6(GLIBCXX_3.4.19)(64bit) rtld(GNU_HASH) Processing files: libnccl-devel-2.22.3-1.cuda12.6.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.3xY4hr + umask 022 + cd /builddir/build/BUILD + cd nccl-2.22.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-devel + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-devel + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-devel + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-devel = 2.22.3-1.cuda12.6.an23 libnccl-devel(x86-64) = 2.22.3-1.cuda12.6.an23 pkgconfig(nccl) = 2.22.3 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: /usr/bin/pkg-config libnccl.so.2()(64bit) Processing files: libnccl-static-2.22.3-1.cuda12.6.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.N6O3cY + umask 022 + cd /builddir/build/BUILD + cd nccl-2.22.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-static + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-static + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64/usr/share/licenses/libnccl-static + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-static = 2.22.3-1.cuda12.6.an23 libnccl-static(x86-64) = 2.22.3-1.cuda12.6.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debugsource-2.22.3-1.cuda12.6.an23.x86_64 Provides: libnccl-debugsource = 2.22.3-1.cuda12.6.an23 libnccl-debugsource(x86-64) = 2.22.3-1.cuda12.6.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debuginfo-2.22.3-1.cuda12.6.an23.x86_64 Provides: debuginfo(build-id) = 646a85cb569e8f059682ee1752dcbac1d7832548 libnccl-debuginfo = 2.22.3-1.cuda12.6.an23 libnccl-debuginfo(x86-64) = 2.22.3-1.cuda12.6.an23 libnccl.so.2.22.3-2.22.3-1.cuda12.6.an23.x86_64.debug()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Recommends: libnccl-debugsource(x86-64) = 2.22.3-1.cuda12.6.an23 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/libnccl-2.22.3-1.cuda12.6.an23.x86_64 Wrote: /builddir/build/RPMS/libnccl-devel-2.22.3-1.cuda12.6.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-debugsource-2.22.3-1.cuda12.6.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-debuginfo-2.22.3-1.cuda12.6.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-2.22.3-1.cuda12.6.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-static-2.22.3-1.cuda12.6.an23.x86_64.rpm Child return code was: 0