From 29879a69a4b2eda4f0315027cb1e86964d333221 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 19 Jul 2024 15:38:41 -0400 Subject: [PATCH] AK: Construct Strings from StringBuilder without re-allocating the data Currently, invoking StringBuilder::to_string will re-allocate the string data to construct the String. This is wasteful both in terms of memory and speed. The goal here is to simply hand the string buffer over to String, and let String take ownership of that buffer. To do this, StringBuilder must have the same memory layout as Detail::StringData. This layout is just the members of the StringData class followed by the string itself. So when a StringBuilder is created, we reserve sizeof(StringData) bytes at the front of the buffer. StringData can then construct itself into the buffer with placement new. Things to note: * StringData must now be aware of the actual capacity of its buffer, as that can be larger than the string size. * We must take care not to pass ownership of inlined string buffers, as these live on the stack. --- AK/ByteBuffer.h | 18 +++++++++++++ AK/Forward.h | 2 ++ AK/String.cpp | 17 +++++++++++++ AK/String.h | 3 +++ AK/StringBase.cpp | 13 ++++++++++ AK/StringBase.h | 2 ++ AK/StringBuilder.cpp | 60 +++++++++++++++++++++++++++++++++----------- AK/StringBuilder.h | 11 +++++--- AK/StringData.h | 41 ++++++++++++++++++++++-------- 9 files changed, 139 insertions(+), 28 deletions(-) diff --git a/AK/ByteBuffer.h b/AK/ByteBuffer.h index 15f2aa7638b..ca078e2b286 100644 --- a/AK/ByteBuffer.h +++ b/AK/ByteBuffer.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -301,6 +302,23 @@ public: operator ReadonlyBytes() const { return bytes(); } ALWAYS_INLINE size_t capacity() const { return m_inline ? inline_capacity : m_outline_capacity; } + ALWAYS_INLINE bool is_inline() const { return m_inline; } + + struct OutlineBuffer { + Bytes buffer; + size_t capacity { 0 }; + }; + Optional leak_outline_buffer(Badge) + { + if (m_inline) + return {}; + + auto buffer = bytes(); + m_inline = true; + m_size = 0; + + return OutlineBuffer { buffer, capacity() }; + } private: void move_from(ByteBuffer&& other) diff --git a/AK/Forward.h b/AK/Forward.h index e074ffe455d..b5ba818ff7c 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -16,6 +16,8 @@ namespace AK { namespace Detail { template class ByteBuffer; + +class StringData; } enum class TrailingCodePointTransformation : u8; diff --git a/AK/String.cpp b/AK/String.cpp index ef679ea6931..3d0626427aa 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -96,6 +96,23 @@ ErrorOr String::from_stream(Stream& stream, size_t byte_count) return result; } +ErrorOr String::from_string_builder(Badge, StringBuilder& builder) +{ + if (!Utf8View { builder.string_view() }.validate()) + return Error::from_string_literal("String::from_string_builder: Input was not valid UTF-8"); + + String result; + result.replace_with_string_builder(builder); + return result; +} + +String String::from_string_builder_without_validation(Badge, StringBuilder& builder) +{ + String result; + result.replace_with_string_builder(builder); + return result; +} + ErrorOr String::repeated(u32 code_point, size_t count) { VERIFY(is_unicode(code_point)); diff --git a/AK/String.h b/AK/String.h index 1294496e204..130087eaae8 100644 --- a/AK/String.h +++ b/AK/String.h @@ -57,6 +57,9 @@ public: [[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes); + static ErrorOr from_string_builder(Badge, StringBuilder&); + [[nodiscard]] static String from_string_builder_without_validation(Badge, StringBuilder&); + // Creates a new String from a sequence of UTF-16 encoded code points. static ErrorOr from_utf16(Utf16View const&); diff --git a/AK/StringBase.cpp b/AK/StringBase.cpp index 62746b91a58..622266d1764 100644 --- a/AK/StringBase.cpp +++ b/AK/StringBase.cpp @@ -90,6 +90,19 @@ bool StringBase::operator==(StringBase const& other) const return bytes() == other.bytes(); } +void StringBase::replace_with_string_builder(StringBuilder& builder) +{ + if (builder.length() <= MAX_SHORT_STRING_BYTE_COUNT) { + return replace_with_new_short_string(builder.length(), [&](Bytes buffer) { + builder.string_view().bytes().copy_to(buffer); + }); + } + + destroy_string(); + + m_data = &StringData::create_from_string_builder(builder).leak_ref(); +} + ErrorOr StringBase::replace_with_uninitialized_buffer(size_t byte_count) { if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) diff --git a/AK/StringBase.h b/AK/StringBase.h index a105f5f6173..e6086be2f72 100644 --- a/AK/StringBase.h +++ b/AK/StringBase.h @@ -92,6 +92,8 @@ protected: callback(buffer); } + void replace_with_string_builder(StringBuilder&); + // This is not a trivial operation with storage, so it does not belong here. Unfortunately, it // is impossible to implement it without access to StringData. ErrorOr substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const; diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 33306fed2e4..d6746edd633 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,16 +19,33 @@ namespace AK { +static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData); + +static ErrorOr create_buffer(size_t capacity) +{ + StringBuilder::Buffer buffer; + + if (capacity > StringBuilder::inline_capacity) + TRY(buffer.try_ensure_capacity(STRING_BASE_PREFIX_SIZE + capacity)); + + TRY(buffer.try_resize(STRING_BASE_PREFIX_SIZE)); + return buffer; +} + ErrorOr StringBuilder::create(size_t initial_capacity) { - StringBuilder builder; - TRY(builder.m_buffer.try_ensure_capacity(initial_capacity)); - return builder; + auto buffer = TRY(create_buffer(initial_capacity)); + return StringBuilder { move(buffer) }; } StringBuilder::StringBuilder(size_t initial_capacity) + : m_buffer(MUST(create_buffer(initial_capacity))) +{ +} + +StringBuilder::StringBuilder(Buffer buffer) + : m_buffer(move(buffer)) { - m_buffer.ensure_capacity(initial_capacity); } inline ErrorOr StringBuilder::will_append(size_t size) @@ -47,12 +65,12 @@ inline ErrorOr StringBuilder::will_append(size_t size) size_t StringBuilder::length() const { - return m_buffer.size(); + return m_buffer.size() - STRING_BASE_PREFIX_SIZE; } bool StringBuilder::is_empty() const { - return m_buffer.is_empty(); + return length() == 0; } void StringBuilder::trim(size_t count) @@ -122,14 +140,18 @@ ByteString StringBuilder::to_byte_string() const return ByteString((char const*)data(), length()); } -ErrorOr StringBuilder::to_string() const +ErrorOr StringBuilder::to_string() { - return String::from_utf8(string_view()); + if (m_buffer.is_inline()) + return String::from_utf8(string_view()); + return String::from_string_builder({}, *this); } -String StringBuilder::to_string_without_validation() const +String StringBuilder::to_string_without_validation() { - return String::from_utf8_without_validation(string_view().bytes()); + if (m_buffer.is_inline()) + return String::from_utf8_without_validation(string_view().bytes()); + return String::from_string_builder_without_validation({}, *this); } FlyString StringBuilder::to_fly_string_without_validation() const @@ -144,22 +166,22 @@ ErrorOr StringBuilder::to_fly_string() const u8* StringBuilder::data() { - return m_buffer.data(); + return m_buffer.data() + STRING_BASE_PREFIX_SIZE; } u8 const* StringBuilder::data() const { - return m_buffer.data(); + return m_buffer.data() + STRING_BASE_PREFIX_SIZE; } StringView StringBuilder::string_view() const { - return StringView { data(), m_buffer.size() }; + return m_buffer.span().slice(STRING_BASE_PREFIX_SIZE); } void StringBuilder::clear() { - m_buffer.clear(); + m_buffer.resize(STRING_BASE_PREFIX_SIZE); } ErrorOr StringBuilder::try_append_code_point(u32 code_point) @@ -272,4 +294,14 @@ ErrorOr StringBuilder::try_append_escaped_for_json(StringView string) return {}; } +auto StringBuilder::leak_buffer_for_string_construction(Badge) -> Optional +{ + if (auto buffer = m_buffer.leak_outline_buffer({}); buffer.has_value()) { + clear(); + return buffer; + } + + return {}; +} + } diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h index 2dfff324249..5b83bcf0e6c 100644 --- a/AK/StringBuilder.h +++ b/AK/StringBuilder.h @@ -18,6 +18,7 @@ class StringBuilder { public: static constexpr size_t inline_capacity = 256; + using Buffer = Detail::ByteBuffer; using OutputType = ByteString; static ErrorOr create(size_t initial_capacity = inline_capacity); @@ -61,8 +62,8 @@ public: [[nodiscard]] ByteString to_byte_string() const; - [[nodiscard]] String to_string_without_validation() const; - ErrorOr to_string() const; + [[nodiscard]] String to_string_without_validation(); + ErrorOr to_string(); [[nodiscard]] FlyString to_fly_string_without_validation() const; ErrorOr to_fly_string() const; @@ -95,12 +96,16 @@ public: return {}; } + Optional leak_buffer_for_string_construction(Badge); + private: + explicit StringBuilder(Buffer); + ErrorOr will_append(size_t); u8* data(); u8 const* data() const; - Detail::ByteBuffer m_buffer; + Buffer m_buffer; }; } diff --git a/AK/StringData.h b/AK/StringData.h index 26c0aefea07..c029afd6413 100644 --- a/AK/StringData.h +++ b/AK/StringData.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace AK::Detail { @@ -20,25 +21,39 @@ public: static ErrorOr> create_uninitialized(size_t byte_count, u8*& buffer) { VERIFY(byte_count); - void* slot = malloc(allocation_size_for_string_data(byte_count)); - if (!slot) { + + auto capacity = allocation_size_for_string_data(byte_count); + void* slot = malloc(capacity); + if (!slot) return Error::from_errno(ENOMEM); - } - auto new_string_data = adopt_ref(*new (slot) StringData(byte_count)); + + auto new_string_data = adopt_ref(*new (slot) StringData(byte_count, capacity)); buffer = const_cast(new_string_data->bytes().data()); return new_string_data; } + static NonnullRefPtr create_from_string_builder(StringBuilder& builder) + { + auto byte_count = builder.length(); + VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT); + + auto buffer = builder.leak_buffer_for_string_construction({}); + VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined. + + return adopt_ref(*new (buffer->buffer.data()) StringData(byte_count, buffer->capacity)); + } + static ErrorOr> create_substring(StringData const& superstring, size_t start, size_t byte_count) { // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT); - void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData)); - if (!slot) { + auto capacity = sizeof(StringData) + sizeof(StringData::SubstringData); + void* slot = malloc(capacity); + if (!slot) return Error::from_errno(ENOMEM); - } - return adopt_ref(*new (slot) StringData(superstring, start, byte_count)); + + return adopt_ref(*new (slot) StringData(superstring, start, byte_count, capacity)); } struct SubstringData { @@ -48,7 +63,7 @@ public: void operator delete(void* ptr) { - kfree_sized(ptr, allocation_size_for_string_data(static_cast(ptr)->m_byte_count)); + kfree_sized(ptr, static_cast(ptr)->m_capacity); } ~StringData() @@ -99,13 +114,15 @@ private: return sizeof(StringData) + (sizeof(char) * length); } - explicit StringData(size_t byte_count) + StringData(size_t byte_count, size_t capacity) : m_byte_count(byte_count) + , m_capacity(capacity) { } - StringData(StringData const& superstring, size_t start, size_t byte_count) + StringData(StringData const& superstring, size_t start, size_t byte_count, size_t capacity) : m_byte_count(byte_count) + , m_capacity(capacity) , m_substring(true) { auto& data = const_cast(substring_data()); @@ -125,6 +142,8 @@ private: } u32 m_byte_count { 0 }; + u32 m_capacity { 0 }; + mutable unsigned m_hash { 0 }; mutable bool m_has_hash { false }; bool m_substring { false };