1 year ago

#374752

test-img

Steve Cox

Can nvcc generate an older PTX ISA version?

When I compile a short example kernel (nvcc -ptx axpy.cu) with nvcc from CUDA Toolkit 11.4:

// Kernel: each thread writes y[i] = a * x[i], where i = threadIdx.x.
//   a : scalar multiplier
//   x : input array (read)
//   y : output array (overwritten)
// NOTE(review): despite the name "axpy", the previous contents of y are not
// added in; this is a pure scale. Left as-is because the PTX listings that
// follow were generated from exactly this source.
// Only threadIdx.x is used as the index (no blockIdx term) and there is no
// bounds check; presumably launched as a single block no larger than the
// arrays -- confirm against the launch site.
__global__ void axpy(float a, float* x, float* y) {
  y[threadIdx.x] = a * x[threadIdx.x];
}

I get this ptx:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-30033411
// Cuda compilation tools, release 11.4, V11.4.48
// Based on NVVM 7.0.1
//

// Module-level directives.
// .version declares the PTX ISA version of this module (7.4 as emitted here).
.version 7.4
// Target architecture the module is generated for.
.target sm_52
// Addresses in this module are 64 bits wide.
.address_size 64

        // .globl       _Z4axpyfPfS_

// Kernel entry point; the symbol is the mangled name of
// axpy(float, float*, float*).
//   param_0 : .f32  scalar multiplier (a)
//   param_1 : .u64  address used for the load  (x)
//   param_2 : .u64  address used for the store (y)
.visible .entry _Z4axpyfPfS_(
        .param .f32 _Z4axpyfPfS__param_0,
        .param .u64 _Z4axpyfPfS__param_1,
        .param .u64 _Z4axpyfPfS__param_2
)
{
        // Virtual register declarations: %f0-%f3 (f32), %r0-%r1 (b32),
        // %rd0-%rd7 (b64).
        .reg .f32       %f<4>;
        .reg .b32       %r<2>;
        .reg .b64       %rd<8>;


        // Fetch the three kernel arguments from parameter space.
        ld.param.f32    %f1, [_Z4axpyfPfS__param_0];     // %f1  = a
        ld.param.u64    %rd1, [_Z4axpyfPfS__param_1];    // %rd1 = x
        ld.param.u64    %rd2, [_Z4axpyfPfS__param_2];    // %rd2 = y
        // Convert both pointers to global-space addresses so the
        // ld.global / st.global below can use them.
        cvta.to.global.u64      %rd3, %rd2;              // %rd3 = global(y)
        cvta.to.global.u64      %rd4, %rd1;              // %rd4 = global(x)
        // Byte offset of this thread's element: tid.x * 4 (one f32 each),
        // widened from 32 to 64 bits by mul.wide.
        mov.u32         %r1, %tid.x;
        mul.wide.u32    %rd5, %r1, 4;
        // %f2 = x[tid.x]
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f32   %f2, [%rd6];
        // %f3 = x[tid.x] * a
        mul.f32         %f3, %f2, %f1;
        // y[tid.x] = %f3
        add.s64         %rd7, %rd3, %rd5;
        st.global.f32   [%rd7], %f3;
        ret;

}

This of course fails when I send it to an old CUDA driver, with the error "CUDA_ERROR_UNSUPPORTED_PTX_VERSION: the provided PTX was compiled with an unsupported toolchain". The cause is the .version 7.4 directive right at the beginning, even though the kernel does not use any recent PTX features.

If I instead compile with clang (clang++ --cuda-device-only --cuda-gpu-arch=sm_52 -nocudalib -S -O3 axpy.cu), it produces a more conservative .version that works on just about any driver I could hope for:

//
// Generated by LLVM NVPTX Back-End
//

// Module-level directives.
// .version declares the PTX ISA version of this module (4.1 as emitted here).
.version 4.1
// Target architecture the module is generated for.
.target sm_52
// Addresses in this module are 64 bits wide.
.address_size 64

        // .globl       _Z4axpyfPfS_

// Kernel entry point; the symbol is the mangled name of
// axpy(float, float*, float*).
//   param_0 : .f32  scalar multiplier (a)
//   param_1 : .u64  address used for the load  (x)
//   param_2 : .u64  address used for the store (y)
.visible .entry _Z4axpyfPfS_(
        .param .f32 _Z4axpyfPfS__param_0,
        .param .u64 _Z4axpyfPfS__param_1,
        .param .u64 _Z4axpyfPfS__param_2
)
{
        // Virtual register declarations: %f0-%f3 (f32), %r0-%r1 (b32),
        // %rd0-%rd7 (b64).
        .reg .f32       %f<4>;
        .reg .b32       %r<2>;
        .reg .b64       %rd<8>;

        // Fetch the kernel arguments from parameter space and convert the two
        // pointers to global-space addresses. Note the register numbering:
        // here %rd1/%rd2 hold y (param_2) and %rd3/%rd4 hold x (param_1).
        ld.param.f32    %f1, [_Z4axpyfPfS__param_0];     // %f1  = a
        ld.param.u64    %rd1, [_Z4axpyfPfS__param_2];    // %rd1 = y
        cvta.to.global.u64      %rd2, %rd1;              // %rd2 = global(y)
        ld.param.u64    %rd3, [_Z4axpyfPfS__param_1];    // %rd3 = x
        cvta.to.global.u64      %rd4, %rd3;              // %rd4 = global(x)
        // Byte offset of this thread's element: tid.x * 4 (one f32 each),
        // widened from 32 to 64 bits by mul.wide.
        mov.u32         %r1, %tid.x;
        mul.wide.u32    %rd5, %r1, 4;
        // %f2 = x[tid.x]
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f32   %f2, [%rd6];
        // %f3 = x[tid.x] * a
        mul.f32         %f3, %f2, %f1;
        // y[tid.x] = %f3
        add.s64         %rd7, %rd2, %rd5;
        st.global.f32   [%rd7], %f3;
        ret;

}

Is there a way to tell nvcc that I'd like a less modern PTX version in the generated PTX to improve the driver compatibility of the generated kernels?

c++

cuda

ptx

0 Answers

Your Answer

Accepted video resources