Cycles: Compress GPU kernels to reduce file size

Precompiled Cycles kernels make up a considerable fraction of the total size of
Blender builds nowadays. As we add more features and support for more
architectures, this will only continue to increase.

However, since these kernels tend to be quite compressible, we can save a lot
of storage by storing them in compressed form and decompressing the required
kernel(s) during loading.

By using Zstandard compression with a high level, we can get decent compression
ratios (~5x for the current kernels) while keeping decompression time low
(about 30ms in the worst case in my tests). And since we already require zstd
for Blender, this doesn't introduce a new dependency.

While the main improvement is to the size of the extracted Blender installation
(which is reduced by ~400-500MB currently), this also shrinks the download on
Windows, since .zip's deflate compression is less effective. It doesn't help on
Linux since we're already using .tar.xz there, but the smaller installed size
is still a good thing.

See #123522 for initial discussion.

Pull Request: https://projects.blender.org/blender/blender/pulls/123557
This commit is contained in:
Lukas Stockner 2024-06-23 00:52:30 +02:00 committed by Lukas Stockner
parent cf73897690
commit 4bde68cdd6
9 changed files with 152 additions and 20 deletions

View file

@ -0,0 +1,54 @@
/* SPDX-FileCopyrightText: 2024 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#include <cstdint>
#include <fstream>
#include <vector>
#include <zstd.h>
/* Build-time helper that compresses a single file with Zstandard.
 *
 * Usage: zstd_compress <input-file> <output-file>
 *
 * The whole input is read into memory, compressed in one call to
 * ZSTD_compress() at level 19, and written to the output path.
 *
 * Returns 0 on success and -1 on any failure (missing arguments, unreadable
 * input, compression error, or a failed write). */
int main(int argc, const char **argv)
{
  if (argc < 3) {
    return -1;
  }

  /* TODO: This might fail for non-ASCII paths on Windows... */
  std::ifstream in(argv[1], std::ios_base::binary);
  if (!in) {
    return -1;
  }

  /* Determine the input size by seeking to the end. tellg() reports failure
   * as -1, which must not be converted into a huge unsigned size. */
  in.seekg(0, std::ios_base::end);
  const std::streamoff end_pos = in.tellg();
  in.seekg(0, std::ios_base::beg);
  if (!in || end_pos < 0) {
    return -1;
  }
  const size_t in_size = static_cast<size_t>(end_pos);

  std::vector<char> in_data(in_size);
  in.read(in_data.data(), static_cast<std::streamsize>(in_size));
  if (!in) {
    return -1;
  }

  /* Worst-case compressed size, used to allocate the destination buffer. */
  size_t out_size = ZSTD_compressBound(in_size);
  if (ZSTD_isError(out_size)) {
    return -1;
  }

  std::vector<char> out_data(out_size);
  out_size = ZSTD_compress(out_data.data(), out_data.size(), in_data.data(), in_data.size(), 19);
  if (ZSTD_isError(out_size)) {
    return -1;
  }

  /* Only open (and thereby truncate) the output once there is data to write, so
   * that a failure above does not leave an empty output file behind and so that
   * the input is fully read before the output path is touched. */
  std::ofstream out(argv[2], std::ios_base::binary);
  if (!out) {
    return -1;
  }

  out.write(out_data.data(), static_cast<std::streamsize>(out_size));
  /* Close explicitly before checking the state: buffered data is only handed to
   * the OS on flush/close, and errors raised in the destructor would be lost. */
  out.close();
  if (!out) {
    return -1;
  }

  return 0;
}

View file

@ -256,7 +256,7 @@ string CUDADevice::compile_kernel(const string &common_cflags,
/* Attempt to use kernel provided with Blender. */
if (!use_adaptive_compilation()) {
if (!force_ptx) {
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
if (path_exists(cubin)) {
VLOG_INFO << "Using precompiled kernel.";
@ -268,7 +268,7 @@ string CUDADevice::compile_kernel(const string &common_cflags,
int ptx_major = major, ptx_minor = minor;
while (ptx_major >= 3) {
const string ptx = path_get(
string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
if (path_exists(ptx)) {
VLOG_INFO << "Using precompiled kernel.";
@ -440,7 +440,7 @@ bool CUDADevice::load_kernels(const uint kernel_features)
string cubin_data;
CUresult result;
if (path_read_text(cubin, cubin_data)) {
if (path_read_compressed_text(cubin, cubin_data)) {
result = cuModuleLoadData(&cuModule, cubin_data.c_str());
}
else {

View file

@ -231,7 +231,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
/* Attempt to use kernel provided with Blender. */
if (!use_adaptive_compilation()) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch.c_str()));
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin.zst", name, arch.c_str()));
VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG_INFO << "Using precompiled kernel.";
@ -387,7 +387,7 @@ bool HIPDevice::load_kernels(const uint kernel_features)
string fatbin_data;
hipError_t result;
if (path_read_text(fatbin, fatbin_data))
if (path_read_compressed_text(fatbin, fatbin_data))
result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
else
result = hipErrorFileNotFound;

View file

@ -141,7 +141,7 @@ string HIPRTDevice::compile_kernel(const uint kernel_features, const char *name,
const std::string arch = hipDeviceArch(hipDevId);
if (!use_adaptive_compilation()) {
const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb", name));
const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb.zst", name));
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG(1) << "Using precompiled kernel.";
@ -309,8 +309,7 @@ bool HIPRTDevice::load_kernels(const uint kernel_features)
string fatbin_data;
hipError_t result;
if (path_read_text(fatbin, fatbin_data)) {
if (path_read_compressed_text(fatbin, fatbin_data)) {
result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
}
else

View file

@ -216,7 +216,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
"";
string ptx_filename;
if (need_optix_kernels) {
ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst");
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
std::string optix_include_dir = get_optix_include_dir();
if (optix_include_dir.empty()) {
@ -348,7 +348,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
string cflags = compile_kernel_get_common_cflags(kernel_features);
ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
}
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) {
set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
return false;
}
@ -798,8 +798,8 @@ bool OptiXDevice::load_osl_kernels()
osl_modules.resize(osl_kernels.size() + 1);
{ /* Load and compile PTX module with OSL services. */
string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
if (!path_read_text(ptx_filename, ptx_data)) {
string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst");
if (!path_read_compressed_text(ptx_filename, ptx_data)) {
set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
ptx_filename.c_str()));
return false;

View file

@ -416,6 +416,11 @@ set(LIB
)
# Zstd compressor for kernels
# Helper executable built from ../cmake/zstd_compress.cpp. The kernel custom
# commands in this file run it via $<TARGET_FILE:zstd_compress> to turn each
# compiled kernel into its `.zst` counterpart.
add_executable(zstd_compress ../cmake/zstd_compress.cpp)
target_include_directories(zstd_compress SYSTEM PRIVATE ${ZSTD_INCLUDE_DIRS})
# NOTE(review): pthreads is presumably linked because the zstd library may be
# built with multi-threading support - confirm.
target_link_libraries(zstd_compress ${ZSTD_LIBRARIES} ${PTHREADS_LIBRARIES})
# CUDA module
if(WITH_CYCLES_CUDA_BINARIES)
@ -455,6 +460,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(format "cubin")
endif()
set(cuda_file ${name}_${arch}.${format})
set(cuda_file_compressed ${cuda_file}.zst)
set(kernel_sources ${sources})
if(NOT ${prev_arch} STREQUAL "none")
@ -517,9 +523,14 @@ if(WITH_CYCLES_CUDA_BINARIES)
DEPENDS ${kernel_sources})
endif()
add_custom_command(
OUTPUT ${cuda_file_compressed}
COMMAND "$<TARGET_FILE:zstd_compress>" ${cuda_file} ${cuda_file_compressed}
DEPENDS ${cuda_file})
unset(_cuda_nvcc_args)
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_file})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_file_compressed})
unset(cuda_debug_flags)
endmacro()
@ -603,6 +614,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
macro(CYCLES_HIP_KERNEL_ADD arch name flags sources experimental)
set(format "fatbin")
set(hip_file ${name}_${arch}.${format})
set(hip_file_compressed ${hip_file}.zst)
set(kernel_sources ${sources})
set(hip_kernel_src "/device/hip/${name}.cpp")
@ -657,8 +669,12 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
OUTPUT ${hip_file}
COMMAND ${hip_command} ${hip_flags}
DEPENDS ${kernel_sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND hip_fatbins ${hip_file})
add_custom_command(
OUTPUT ${hip_file_compressed}
COMMAND "$<TARGET_FILE:zstd_compress>" ${hip_file} ${hip_file_compressed}
DEPENDS ${hip_file})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND hip_fatbins ${hip_file_compressed})
endmacro()
foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
@ -680,6 +696,7 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES)
${SRC_UTIL_HEADERS})
set(bitcode_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.bc)
set(hiprt_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.hipfb)
set(hiprt_file_compressed ${hiprt_file}.zst)
set(kernel_sources ${hiprt_sources})
set(hiprt_kernel_src "/device/hiprt/kernel.cpp")
if(WIN32)
@ -744,8 +761,12 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES)
OUTPUT ${hiprt_file}
COMMAND ${hiprt_link_command} ${hiprt_link_flags}
DEPENDS ${bitcode_file})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file}" ${CYCLES_INSTALL_PATH}/lib)
add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file})
add_custom_command(
OUTPUT ${hiprt_file_compressed}
COMMAND "$<TARGET_FILE:zstd_compress>" ${hiprt_file} ${hiprt_file_compressed}
DEPENDS ${hiprt_file})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file_compressed})
cycles_set_solution_folder(cycles_kernel_hiprt)
endif()
@ -754,6 +775,7 @@ endif()
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
macro(cycles_optix_kernel_add name input flags)
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(output_compressed "${output}.zst")
set(cuda_flags ${flags}
-I "${OPTIX_INCLUDE_DIR}"
@ -795,9 +817,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
WORKING_DIRECTORY
"${CMAKE_CURRENT_SOURCE_DIR}")
list(APPEND optix_ptx ${output})
add_custom_command(
OUTPUT ${output_compressed}
COMMAND "$<TARGET_FILE:zstd_compress>" ${output} ${output_compressed}
DEPENDS ${output})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND optix_ptx ${output_compressed})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output_compressed}" ${CYCLES_INSTALL_PATH}/lib)
endmacro()
cycles_optix_kernel_add(

View file

@ -7,6 +7,7 @@ set(INC
)
set(INC_SYS
${ZSTD_INCLUDE_DIRS}
)
set(SRC
@ -32,6 +33,7 @@ set(SRC
set(LIB
${TBB_LIBRARIES}
${ZSTD_LIBRARIES}
)
set(SRC_HEADERS

View file

@ -19,6 +19,8 @@ OIIO_NAMESPACE_USING
#include <sys/stat.h>
#include <zstd.h>
#if defined(_WIN32)
# define DIR_SEP '\\'
# define DIR_SEP_ALT '/'
@ -704,6 +706,36 @@ bool path_read_binary(const string &path, vector<uint8_t> &binary)
return true;
}
/* Read the file at `path` into `binary`, decompressing it when it is a
 * Zstandard file.
 *
 * Paths that do not end in ".zst" are passed straight to path_read_binary().
 * For ".zst" paths the file is read as a whole and decompressed in one call,
 * which requires the zstd frame header to store the decompressed size.
 *
 * Returns true on success. Returns false if the file cannot be read, the frame
 * header is invalid or lacks the content size, the size does not fit in
 * size_t, or decompression fails. */
bool path_read_compressed_binary(const string &path, vector<uint8_t> &binary)
{
  if (!string_endswith(path, ".zst")) {
    return path_read_binary(path, binary);
  }

  vector<uint8_t> compressed;
  if (!path_read_binary(path, compressed)) {
    return false;
  }

  /* Keep the `unsigned long long` type returned by zstd: the sentinel values
   * below are defined in that type and would not compare equal after being
   * narrowed to a smaller size_t. */
  const unsigned long long full_size = ZSTD_getFrameContentSize(compressed.data(),
                                                                compressed.size());

  if (full_size == ZSTD_CONTENTSIZE_ERROR) {
    /* Potentially corrupted file? */
    return false;
  }
  if (full_size == ZSTD_CONTENTSIZE_UNKNOWN) {
    /* Technically this is an optional field, but we can expect it to be set for now.
     * Otherwise we'd need streaming decompression and repeated resizing of the vector. */
    return false;
  }

  /* Reject sizes that cannot be represented as a buffer size on this platform. */
  const size_t binary_size = static_cast<size_t>(full_size);
  if (binary_size != full_size) {
    return false;
  }

  binary.resize(binary_size);

  const size_t result = ZSTD_decompress(
      binary.data(), binary.size(), compressed.data(), compressed.size());
  if (ZSTD_isError(result)) {
    return false;
  }

  /* On success the result is the number of bytes written; make sure the whole
   * buffer was filled. */
  return result == binary.size();
}
bool path_read_text(const string &path, string &text)
{
vector<uint8_t> binary;
@ -719,6 +751,21 @@ bool path_read_text(const string &path, string &text)
return true;
}
bool path_read_compressed_text(const string &path, string &text)
{
vector<uint8_t> binary;
if (!path_exists(path) || !path_read_compressed_binary(path, binary)) {
return false;
}
const char *str = (const char *)&binary[0];
size_t size = binary.size();
text = string(str, size);
return true;
}
uint64_t path_modified_time(const string &path)
{
path_stat_t st;

View file

@ -50,6 +50,9 @@ bool path_write_text(const string &path, string &text);
bool path_read_binary(const string &path, vector<uint8_t> &binary);
bool path_read_text(const string &path, string &text);
bool path_read_compressed_binary(const string &path, vector<uint8_t> &binary);
bool path_read_compressed_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);