/* * Copyright (c) 2023, Tim Schumacher * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once #include #include #include #include #include namespace Compress { // This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain. // LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z struct LzmaModelProperties { u8 literal_context_bits; u8 literal_position_bits; u8 position_bits; }; struct LzmaDecompressorOptions { u8 literal_context_bits { 0 }; u8 literal_position_bits { 0 }; u8 position_bits { 0 }; u32 dictionary_size { 0 }; Optional uncompressed_size; bool reject_end_of_stream_marker { false }; }; struct LzmaCompressorOptions { // Note: The default settings have been chosen based on the default settings of other LZMA compressors. u8 literal_context_bits { 3 }; u8 literal_position_bits { 0 }; u8 position_bits { 2 }; u32 dictionary_size { 8 * MiB }; Optional uncompressed_size {}; }; // Described in section "lzma file format". struct [[gnu::packed]] LzmaHeader { u32 dictionary_size() const; Optional uncompressed_size() const; ErrorOr as_decompressor_options() const; static ErrorOr from_compressor_options(LzmaCompressorOptions const&); static ErrorOr decode_model_properties(u8 input_bits); static ErrorOr encode_model_properties(LzmaModelProperties const&); u8 encoded_model_properties; u32 unchecked_dictionary_size; u64 encoded_uncompressed_size; static constexpr u64 placeholder_for_unknown_uncompressed_size = UINT64_MAX; }; static_assert(sizeof(LzmaHeader) == 13); class LzmaState { protected: // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables. // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048). // The default probability for most counters is 0.5. using Probability = u16; static constexpr size_t probability_bit_count = 11; static constexpr Probability default_probability = (1 << probability_bit_count) / 2; static void initialize_to_default_probability(Span); // The significance of the shift width is not explained and appears to be a magic constant. static constexpr size_t probability_shift_width = 5; // "The value of the "Range" variable before each bit decoding can not be smaller than ((UInt32)1 << 24)." static constexpr u32 minimum_range_value = 1 << 24; LzmaState(FixedArray literal_probabilities); u64 m_total_processed_bytes { 0 }; static constexpr size_t literal_probability_table_size = 0x300; FixedArray m_literal_probabilities; struct LzmaLengthCoderState { public: LzmaLengthCoderState(); Probability m_first_choice_probability { default_probability }; Probability m_second_choice_probability { default_probability }; static constexpr size_t maximum_number_of_position_bits = 4; Array, (1 << maximum_number_of_position_bits)> m_low_length_probabilities; Array, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities; Array m_high_length_probabilities; }; LzmaLengthCoderState m_length_coder; LzmaLengthCoderState m_rep_length_coder; static constexpr u16 normalized_to_real_match_length_offset = 2; static constexpr u32 normalized_to_real_match_distance_offset = 1; // According to the specification, the largest possible normalized match length is provided by the high coder, // which processes 8 bits (0 to 255) and adds a displacement of 16 on top. // This is the minimum size that our input buffer has to have to not miss any possible repetitions while encoding. static constexpr u16 largest_real_match_length = 255 + 16 + normalized_to_real_match_length_offset; static constexpr u32 end_of_stream_marker = 0xFFFFFFFF; static constexpr size_t number_of_length_to_position_states = 4; Array, number_of_length_to_position_states> m_length_to_position_states; static constexpr size_t first_position_slot_with_binary_tree_bits = 4; static constexpr size_t first_position_slot_with_direct_encoded_bits = 14; // This is a bit wasteful on memory and not in the specification, but it makes the math easier. static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits; static constexpr size_t largest_number_of_binary_tree_distance_bits = 5; Array, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities; static constexpr size_t number_of_alignment_bits = 4; Array m_alignment_bit_probabilities; // LZ state tracking. u16 m_state { 0 }; u32 m_rep0 { 0 }; u32 m_rep1 { 0 }; u32 m_rep2 { 0 }; u32 m_rep3 { 0 }; u32 current_repetition_offset() const; void update_state_after_literal(); void update_state_after_match(); void update_state_after_rep(); void update_state_after_short_rep(); static constexpr size_t maximum_number_of_position_bits = 4; static constexpr size_t number_of_states = 12; Array m_is_match_probabilities; Array m_is_rep_probabilities; Array m_is_rep_g0_probabilities; Array m_is_rep_g1_probabilities; Array m_is_rep_g2_probabilities; Array m_is_rep0_long_probabilities; enum MatchType { Literal, SimpleMatch, RepMatch0, ShortRepMatch, RepMatch1, RepMatch2, RepMatch3, }; }; class LzmaDecompressor : public Stream , LzmaState { public: /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). static ErrorOr> create_from_container(MaybeOwned, Optional> dictionary = {}); /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats). static ErrorOr> create_from_raw_stream(MaybeOwned, LzmaDecompressorOptions const&, Optional> dictionary = {}); ErrorOr append_input_stream(MaybeOwned, Optional uncompressed_size); virtual ErrorOr read_some(Bytes) override; virtual ErrorOr write_some(ReadonlyBytes) override; virtual bool is_eof() const override; virtual bool is_open() const override; virtual void close() override; private: LzmaDecompressor(MaybeOwned, LzmaDecompressorOptions, MaybeOwned, FixedArray literal_probabilities); MaybeOwned m_stream; LzmaDecompressorOptions m_options; // This doubles as an output buffer, since we have to write all of our results into this anyways. MaybeOwned m_dictionary; bool m_found_end_of_stream_marker { false }; bool is_range_decoder_in_clean_state() const; bool has_reached_expected_data_size() const; Optional m_leftover_match_length; // Range decoder state (initialized with stream data in LzmaDecompressor::create). u32 m_range_decoder_range { 0xFFFFFFFF }; u32 m_range_decoder_code { 0 }; ErrorOr initialize_range_decoder(); ErrorOr normalize_range_decoder(); ErrorOr decode_direct_bit(); ErrorOr decode_bit_with_probability(Probability& probability); ErrorOr decode_match_type(); // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order). // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size. ErrorOr decode_symbol_using_bit_tree(size_t bit_count, Span probability_tree); ErrorOr decode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree); ErrorOr decode_literal_to_output_buffer(); ErrorOr decode_normalized_match_length(LzmaLengthCoderState&); // This deviates from the specification, which states that "unsigned" is at least 16-bit. // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value. ErrorOr decode_normalized_match_distance(u16 normalized_match_length); }; class LzmaCompressor : public Stream , LzmaState { public: /// Creates a compressor for a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). static ErrorOr> create_container(MaybeOwned, LzmaCompressorOptions const&); /// Finishes the archive by writing out the remaining data from the range coder. ErrorOr flush(); virtual ErrorOr read_some(Bytes) override; virtual ErrorOr write_some(ReadonlyBytes) override; virtual bool is_eof() const override; virtual bool is_open() const override; virtual void close() override; virtual ~LzmaCompressor(); private: LzmaCompressor(MaybeOwned, LzmaCompressorOptions, MaybeOwned, FixedArray literal_probabilities); ErrorOr shift_range_encoder(); ErrorOr normalize_range_encoder(); ErrorOr encode_direct_bit(u8 value); ErrorOr encode_bit_with_probability(Probability&, u8 value); ErrorOr encode_symbol_using_bit_tree(size_t bit_count, Span probability_tree, u16 value); ErrorOr encode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree, u16 value); ErrorOr encode_normalized_match_length(LzmaLengthCoderState&, u16 normalized_length); ErrorOr encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance); ErrorOr encode_match_type(MatchType); ErrorOr encode_literal(u8 literal); ErrorOr encode_existing_match(size_t real_distance, size_t real_length); ErrorOr encode_new_match(size_t real_distance, size_t real_length); ErrorOr encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length); ErrorOr encode_once(); bool m_has_flushed_data { false }; MaybeOwned m_stream; LzmaCompressorOptions m_options; // This doubles as an input buffer, which is appended at the very front of the buffer. // Therefore, the size of this should at least be the dictionary size + the largest possible repetition length. MaybeOwned m_dictionary; // Range encoder state. u32 m_range_encoder_range { 0xFFFFFFFF }; u64 m_range_encoder_code { 0 }; // Since the range is only 32-bits, we can overflow at most +1 into the next byte beyond the usual 32-bit code. // Therefore, it is sufficient to store the highest byte (which may still change due to that +1 overflow) and // the length of the chain of 0xFF bytes that may end up propagating that change. u8 m_range_encoder_cached_byte { 0x00 }; size_t m_range_encoder_ff_chain_length { 0 }; }; } template<> struct AK::Traits : public AK::DefaultTraits { static constexpr bool is_trivially_serializable() { return true; } };