mirror of
https://projects.blender.org/blender/blender.git
synced 2025-01-22 15:32:15 -05:00
Cycles: Compress GPU kernels to reduce file size
Precompiled Cycles kernels make up a considerable fraction of the total size of Blender builds nowadays. As we add more features and support for more architectures, this will only continue to increase. However, since these kernels tend to be quite compressible, we can save a lot of storage by storing them in compressed form and decompressing the required kernel(s) during loading. By using Zstandard compression with a high level, we can get decent compression ratios (~5x for the current kernels) while keeping decompression time low (about 30ms in the worst case in my tests). And since we already require zstd for Blender, this doesn't introduce a new dependency. While the main improvement is to the size of the extracted Blender installation (which is reduced by ~400-500MB currently), this also shrinks the download on Windows, since .zip's deflate compression is less effective. It doesn't help on Linux since we're already using .tar.xz there, but the smaller installed size is still a good thing. See #123522 for initial discussion. Pull Request: https://projects.blender.org/blender/blender/pulls/123557
This commit is contained in:
parent
cf73897690
commit
4bde68cdd6
9 changed files with 152 additions and 20 deletions
54
intern/cycles/cmake/zstd_compress.cpp
Normal file
54
intern/cycles/cmake/zstd_compress.cpp
Normal file
|
@ -0,0 +1,54 @@
|
|||
/* SPDX-FileCopyrightText: 2024 Blender Foundation
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 */
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#include <zstd.h>
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
if (argc < 3) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* TODO: This might fail for non-ASCII paths on Windows... */
|
||||
std::ifstream in(argv[1], std::ios_base::binary);
|
||||
std::ofstream out(argv[2], std::ios_base::binary);
|
||||
if (!in || !out) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
in.seekg(0, std::ios_base::end);
|
||||
size_t in_size = in.tellg();
|
||||
in.seekg(0, std::ios_base::beg);
|
||||
if (!in) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<char> in_data(in_size);
|
||||
in.read(in_data.data(), in_size);
|
||||
if (!in) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
size_t out_size = ZSTD_compressBound(in_size);
|
||||
if (ZSTD_isError(out_size)) {
|
||||
return -1;
|
||||
}
|
||||
std::vector<char> out_data(out_size);
|
||||
|
||||
out_size = ZSTD_compress(out_data.data(), out_data.size(), in_data.data(), in_data.size(), 19);
|
||||
if (ZSTD_isError(out_size)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
out.write(out_data.data(), out_size);
|
||||
if (!out) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -256,7 +256,7 @@ string CUDADevice::compile_kernel(const string &common_cflags,
|
|||
/* Attempt to use kernel provided with Blender. */
|
||||
if (!use_adaptive_compilation()) {
|
||||
if (!force_ptx) {
|
||||
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
|
||||
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
|
||||
VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
|
||||
if (path_exists(cubin)) {
|
||||
VLOG_INFO << "Using precompiled kernel.";
|
||||
|
@ -268,7 +268,7 @@ string CUDADevice::compile_kernel(const string &common_cflags,
|
|||
int ptx_major = major, ptx_minor = minor;
|
||||
while (ptx_major >= 3) {
|
||||
const string ptx = path_get(
|
||||
string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
|
||||
string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
|
||||
VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
|
||||
if (path_exists(ptx)) {
|
||||
VLOG_INFO << "Using precompiled kernel.";
|
||||
|
@ -440,7 +440,7 @@ bool CUDADevice::load_kernels(const uint kernel_features)
|
|||
string cubin_data;
|
||||
CUresult result;
|
||||
|
||||
if (path_read_text(cubin, cubin_data)) {
|
||||
if (path_read_compressed_text(cubin, cubin_data)) {
|
||||
result = cuModuleLoadData(&cuModule, cubin_data.c_str());
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -231,7 +231,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
|
|||
|
||||
/* Attempt to use kernel provided with Blender. */
|
||||
if (!use_adaptive_compilation()) {
|
||||
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch.c_str()));
|
||||
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin.zst", name, arch.c_str()));
|
||||
VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << ".";
|
||||
if (path_exists(fatbin)) {
|
||||
VLOG_INFO << "Using precompiled kernel.";
|
||||
|
@ -387,7 +387,7 @@ bool HIPDevice::load_kernels(const uint kernel_features)
|
|||
string fatbin_data;
|
||||
hipError_t result;
|
||||
|
||||
if (path_read_text(fatbin, fatbin_data))
|
||||
if (path_read_compressed_text(fatbin, fatbin_data))
|
||||
result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
|
||||
else
|
||||
result = hipErrorFileNotFound;
|
||||
|
|
|
@ -141,7 +141,7 @@ string HIPRTDevice::compile_kernel(const uint kernel_features, const char *name,
|
|||
const std::string arch = hipDeviceArch(hipDevId);
|
||||
|
||||
if (!use_adaptive_compilation()) {
|
||||
const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb", name));
|
||||
const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb.zst", name));
|
||||
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
|
||||
if (path_exists(fatbin)) {
|
||||
VLOG(1) << "Using precompiled kernel.";
|
||||
|
@ -309,8 +309,7 @@ bool HIPRTDevice::load_kernels(const uint kernel_features)
|
|||
string fatbin_data;
|
||||
hipError_t result;
|
||||
|
||||
if (path_read_text(fatbin, fatbin_data)) {
|
||||
|
||||
if (path_read_compressed_text(fatbin, fatbin_data)) {
|
||||
result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
|
||||
}
|
||||
else
|
||||
|
|
|
@ -216,7 +216,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
|||
"";
|
||||
string ptx_filename;
|
||||
if (need_optix_kernels) {
|
||||
ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
|
||||
ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst");
|
||||
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
|
||||
std::string optix_include_dir = get_optix_include_dir();
|
||||
if (optix_include_dir.empty()) {
|
||||
|
@ -348,7 +348,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
|||
string cflags = compile_kernel_get_common_cflags(kernel_features);
|
||||
ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
|
||||
}
|
||||
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
|
||||
if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) {
|
||||
set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
@ -798,8 +798,8 @@ bool OptiXDevice::load_osl_kernels()
|
|||
osl_modules.resize(osl_kernels.size() + 1);
|
||||
|
||||
{ /* Load and compile PTX module with OSL services. */
|
||||
string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
|
||||
if (!path_read_text(ptx_filename, ptx_data)) {
|
||||
string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst");
|
||||
if (!path_read_compressed_text(ptx_filename, ptx_data)) {
|
||||
set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
|
||||
ptx_filename.c_str()));
|
||||
return false;
|
||||
|
|
|
@ -416,6 +416,11 @@ set(LIB
|
|||
|
||||
)
|
||||
|
||||
# Zstd compressor for kernels
|
||||
add_executable(zstd_compress ../cmake/zstd_compress.cpp)
|
||||
target_include_directories(zstd_compress SYSTEM PRIVATE ${ZSTD_INCLUDE_DIRS})
|
||||
target_link_libraries(zstd_compress ${ZSTD_LIBRARIES} ${PTHREADS_LIBRARIES})
|
||||
|
||||
# CUDA module
|
||||
|
||||
if(WITH_CYCLES_CUDA_BINARIES)
|
||||
|
@ -455,6 +460,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||
set(format "cubin")
|
||||
endif()
|
||||
set(cuda_file ${name}_${arch}.${format})
|
||||
set(cuda_file_compressed ${cuda_file}.zst)
|
||||
|
||||
set(kernel_sources ${sources})
|
||||
if(NOT ${prev_arch} STREQUAL "none")
|
||||
|
@ -517,9 +523,14 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||
DEPENDS ${kernel_sources})
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${cuda_file_compressed}
|
||||
COMMAND "$<TARGET_FILE:zstd_compress>" ${cuda_file} ${cuda_file_compressed}
|
||||
DEPENDS ${cuda_file})
|
||||
|
||||
unset(_cuda_nvcc_args)
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND cuda_cubins ${cuda_file})
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND cuda_cubins ${cuda_file_compressed})
|
||||
|
||||
unset(cuda_debug_flags)
|
||||
endmacro()
|
||||
|
@ -603,6 +614,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
|
|||
macro(CYCLES_HIP_KERNEL_ADD arch name flags sources experimental)
|
||||
set(format "fatbin")
|
||||
set(hip_file ${name}_${arch}.${format})
|
||||
set(hip_file_compressed ${hip_file}.zst)
|
||||
set(kernel_sources ${sources})
|
||||
|
||||
set(hip_kernel_src "/device/hip/${name}.cpp")
|
||||
|
@ -657,8 +669,12 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
|
|||
OUTPUT ${hip_file}
|
||||
COMMAND ${hip_command} ${hip_flags}
|
||||
DEPENDS ${kernel_sources})
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND hip_fatbins ${hip_file})
|
||||
add_custom_command(
|
||||
OUTPUT ${hip_file_compressed}
|
||||
COMMAND "$<TARGET_FILE:zstd_compress>" ${hip_file} ${hip_file_compressed}
|
||||
DEPENDS ${hip_file})
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hip_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND hip_fatbins ${hip_file_compressed})
|
||||
endmacro()
|
||||
|
||||
foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
|
||||
|
@ -680,6 +696,7 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES)
|
|||
${SRC_UTIL_HEADERS})
|
||||
set(bitcode_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.bc)
|
||||
set(hiprt_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.hipfb)
|
||||
set(hiprt_file_compressed ${hiprt_file}.zst)
|
||||
set(kernel_sources ${hiprt_sources})
|
||||
set(hiprt_kernel_src "/device/hiprt/kernel.cpp")
|
||||
if(WIN32)
|
||||
|
@ -744,8 +761,12 @@ if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES)
|
|||
OUTPUT ${hiprt_file}
|
||||
COMMAND ${hiprt_link_command} ${hiprt_link_flags}
|
||||
DEPENDS ${bitcode_file})
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file})
|
||||
add_custom_command(
|
||||
OUTPUT ${hiprt_file_compressed}
|
||||
COMMAND "$<TARGET_FILE:zstd_compress>" ${hiprt_file} ${hiprt_file_compressed}
|
||||
DEPENDS ${hiprt_file})
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file_compressed}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file_compressed})
|
||||
cycles_set_solution_folder(cycles_kernel_hiprt)
|
||||
endif()
|
||||
|
||||
|
@ -754,6 +775,7 @@ endif()
|
|||
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
|
||||
macro(cycles_optix_kernel_add name input flags)
|
||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
|
||||
set(output_compressed "${output}.zst")
|
||||
|
||||
set(cuda_flags ${flags}
|
||||
-I "${OPTIX_INCLUDE_DIR}"
|
||||
|
@ -795,9 +817,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
|
|||
WORKING_DIRECTORY
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
|
||||
list(APPEND optix_ptx ${output})
|
||||
add_custom_command(
|
||||
OUTPUT ${output_compressed}
|
||||
COMMAND "$<TARGET_FILE:zstd_compress>" ${output} ${output_compressed}
|
||||
DEPENDS ${output})
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
list(APPEND optix_ptx ${output_compressed})
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output_compressed}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
endmacro()
|
||||
|
||||
cycles_optix_kernel_add(
|
||||
|
|
|
@ -7,6 +7,7 @@ set(INC
|
|||
)
|
||||
|
||||
set(INC_SYS
|
||||
${ZSTD_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
set(SRC
|
||||
|
@ -32,6 +33,7 @@ set(SRC
|
|||
|
||||
set(LIB
|
||||
${TBB_LIBRARIES}
|
||||
${ZSTD_LIBRARIES}
|
||||
)
|
||||
|
||||
set(SRC_HEADERS
|
||||
|
|
|
@ -19,6 +19,8 @@ OIIO_NAMESPACE_USING
|
|||
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <zstd.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define DIR_SEP '\\'
|
||||
# define DIR_SEP_ALT '/'
|
||||
|
@ -704,6 +706,36 @@ bool path_read_binary(const string &path, vector<uint8_t> &binary)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool path_read_compressed_binary(const string &path, vector<uint8_t> &binary)
|
||||
{
|
||||
if (!string_endswith(path, ".zst")) {
|
||||
return path_read_binary(path, binary);
|
||||
}
|
||||
|
||||
vector<uint8_t> compressed;
|
||||
if (!path_read_binary(path, compressed)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t full_size = ZSTD_getFrameContentSize(compressed.data(), compressed.size());
|
||||
|
||||
if (full_size == ZSTD_CONTENTSIZE_ERROR) {
|
||||
/* Potentially corrupted file? */
|
||||
return false;
|
||||
}
|
||||
if (full_size == ZSTD_CONTENTSIZE_UNKNOWN) {
|
||||
/* Technically this is an optional field, but we can expect it to be set for now.
|
||||
* Otherwise we'd need streaming decompression and repeated resizing of the vector. */
|
||||
return false;
|
||||
}
|
||||
|
||||
binary.resize(full_size);
|
||||
|
||||
size_t err = ZSTD_decompress(binary.data(), binary.size(), compressed.data(), compressed.size());
|
||||
|
||||
return ZSTD_isError(err) == 0;
|
||||
}
|
||||
|
||||
bool path_read_text(const string &path, string &text)
|
||||
{
|
||||
vector<uint8_t> binary;
|
||||
|
@ -719,6 +751,21 @@ bool path_read_text(const string &path, string &text)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool path_read_compressed_text(const string &path, string &text)
|
||||
{
|
||||
vector<uint8_t> binary;
|
||||
|
||||
if (!path_exists(path) || !path_read_compressed_binary(path, binary)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const char *str = (const char *)&binary[0];
|
||||
size_t size = binary.size();
|
||||
text = string(str, size);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uint64_t path_modified_time(const string &path)
|
||||
{
|
||||
path_stat_t st;
|
||||
|
|
|
@ -50,6 +50,9 @@ bool path_write_text(const string &path, string &text);
|
|||
bool path_read_binary(const string &path, vector<uint8_t> &binary);
|
||||
bool path_read_text(const string &path, string &text);
|
||||
|
||||
bool path_read_compressed_binary(const string &path, vector<uint8_t> &binary);
|
||||
bool path_read_compressed_text(const string &path, string &text);
|
||||
|
||||
/* File manipulation. */
|
||||
bool path_remove(const string &path);
|
||||
|
||||
|
|
Loading…
Reference in a new issue