LibLocale: Add Segmenter.h

This is aspirationally in LibLocale in hopes that it does
locale-dependent segmentation one day.

For now, it just wraps LibUnicode's Segmentation.h.

Everything except Segmenter.cpp is from the first commit of
LadybirdBrowser/ladybird#218, by trflynn.

Co-authored-by: Tim Flynn <trflynn89@serenityos.org>
This commit is contained in:
Nico Weber 2024-11-26 09:52:18 -05:00
parent d77876c6aa
commit 71ea3c53a6
7 changed files with 372 additions and 0 deletions

View file

@ -178,6 +178,7 @@ source_set("LibLocale") {
"NumberFormat.cpp",
"PluralRules.cpp",
"RelativeTimeFormat.cpp",
"Segmenter.cpp",
]
deps = [
"//AK",

View file

@ -1,6 +1,7 @@
set(TEST_SOURCES
TestDateTimeFormat.cpp
TestLocale.cpp
TestSegmenter.cpp
)
foreach(source IN LISTS TEST_SOURCES)

View file

@ -0,0 +1,128 @@
/*
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <AK/Array.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Vector.h>
#include <LibLocale/Segmenter.h>
// Segments `string` at grapheme granularity and expects the reported boundaries to be exactly
// `expected_boundaries`, in order.
template<size_t N>
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    auto grapheme_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
    auto text = MUST(String::from_utf8(string));

    // Record every boundary; the walk is never cut short.
    Vector<size_t> actual_boundaries;
    grapheme_segmenter->for_each_boundary(move(text), [&actual_boundaries](size_t offset) {
        actual_boundaries.append(offset);
        return IterationDecision::Continue;
    });

    ReadonlySpan<size_t> expected { expected_boundaries };
    EXPECT_EQ(actual_boundaries, expected);
}
// Grapheme cluster boundaries for ASCII, line endings, Hangul syllable sequences and emoji.
// All expected values are UTF-8 byte offsets into the input (e.g. a 3-byte Hangul jamo moves
// the next boundary by 3).
TEST_CASE(grapheme_segmentation)
{
    // An empty string must not report any boundary, so the callback may never run.
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
    segmenter->for_each_boundary(String {}, [&](auto i) {
        dbgln("{}", i);
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // LF, LF+CR (two clusters) and CR+LF (a single cluster).
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul: a leading consonant joins with the following jamo / syllable.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji ZWJ sequences below contain invisible code points (U+200D ZERO WIDTH JOINER,
    // U+FE0F VARIATION SELECTOR-16). They are spelled out as escapes so the input provably has
    // the byte length the expected offsets rely on, regardless of how an editor renders it.

    // Family: man ZWJ woman ZWJ girl ZWJ boy = 4*4 + 3*3 = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // Couple with heart: woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone
    // = 4+4 + 3 + 3+3 + 3 + 4+4 = 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
// Grapheme clusters around Indic conjuncts: consonant + linker + consonant sequences.
//
// Code points used (each encodes to 3 bytes in UTF-8, so every expected offset is a multiple of 3):
//   U+0915 DEVANAGARI LETTER KA      U+0916 DEVANAGARI LETTER KHA
//   U+094D DEVANAGARI SIGN VIRAMA    (the linker)
//   U+09BC BENGALI SIGN NUKTA        U+09CD BENGALI SIGN VIRAMA   (extending marks)
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
// A lone consonant, and consonants without a linker between them, are separate clusters.
test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });
// Consonant + virama + consonant is expected to stay in one cluster.
test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });
// The same holds with extending marks placed before and/or after the linker.
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });
test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
// Segments `string` at word granularity and expects the reported boundaries to be exactly
// `expected_boundaries`, in order.
template<size_t N>
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
{
    auto word_segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
    auto text = MUST(String::from_utf8(string));

    // Record every boundary; the walk is never cut short.
    Vector<size_t> actual_boundaries;
    word_segmenter->for_each_boundary(move(text), [&actual_boundaries](size_t offset) {
        actual_boundaries.append(offset);
        return IterationDecision::Continue;
    });

    ReadonlySpan<size_t> expected { expected_boundaries };
    EXPECT_EQ(actual_boundaries, expected);
}
// Word boundaries for plain words, whitespace runs, line endings, emoji, numbers and punctuation.
// All expected values are UTF-8 byte offsets into the input.
TEST_CASE(word_segmentation)
{
    // An empty string must not report any boundary, so the callback may never run.
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
    segmenter->for_each_boundary(String {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // Two consecutive spaces form a single segment (offsets 2..4); the input must therefore
    // contain two spaces, otherwise it duplicates the previous case with different expectations.
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji ZWJ sequences below contain invisible code points (U+200D ZERO WIDTH JOINER,
    // U+FE0F VARIATION SELECTOR-16). They are spelled out as escapes so the input provably has
    // the byte length the expected offsets rely on, regardless of how an editor renders it.

    // Family: man ZWJ woman ZWJ girl ZWJ boy = 4*4 + 3*3 = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });

    // Couple with heart: woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone
    // = 4+4 + 3 + 3+3 + 3 + 4+4 = 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // Numbers, dotted tokens and apostrophes stay inside one word.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // Typographic punctuation: the curly quotes are 3 bytes each, and the apostrophe in "can't"
    // is U+2019 RIGHT SINGLE QUOTATION MARK (also 3 bytes), which is what places the boundary
    // after that word at offset 35. It is written as an escape so it cannot be silently lost.
    test_word_segmentation(
        "The quick (“brown”) fox can\u2019t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}

View file

@ -15,6 +15,7 @@ set(SOURCES
NumberFormat.cpp
PluralRules.cpp
RelativeTimeFormat.cpp
Segmenter.cpp
)
serenity_lib(LibLocale locale)

View file

@ -45,6 +45,8 @@ enum class Weekday : u8;
enum class WeekendEndRegion : u8;
enum class WeekendStartRegion : u8;
class Segmenter;
struct CalendarFormat;
struct CalendarPattern;
struct CalendarRangePattern;

View file

@ -0,0 +1,177 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <LibLocale/Locale.h>
#include <LibLocale/Segmenter.h>
#include <LibUnicode/Segmentation.h>
namespace Locale {
// Maps the lowercase granularity name ("grapheme", "sentence" or "word") to its enumerator.
// Any other input is a caller bug and trips VERIFY_NOT_REACHED().
SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
{
    struct NamedGranularity {
        StringView name;
        SegmenterGranularity granularity;
    };

    static constexpr NamedGranularity known_granularities[] = {
        { "grapheme"sv, SegmenterGranularity::Grapheme },
        { "sentence"sv, SegmenterGranularity::Sentence },
        { "word"sv, SegmenterGranularity::Word },
    };

    for (auto const& candidate : known_granularities) {
        if (segmenter_granularity == candidate.name)
            return candidate.granularity;
    }

    VERIFY_NOT_REACHED();
}
// Returns the lowercase name of a granularity ("grapheme", "sentence" or "word").
// A value outside the enumeration trips VERIFY_NOT_REACHED().
StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
{
    using enum SegmenterGranularity;

    // No default label on purpose: the compiler can then flag a newly added enumerator.
    switch (segmenter_granularity) {
    case Grapheme:
        return "grapheme"sv;
    case Sentence:
        return "sentence"sv;
    case Word:
        return "word"sv;
    }

    VERIFY_NOT_REACHED();
}
// Locale-agnostic Segmenter that forwards every query to LibUnicode's segmentation routines,
// selecting the grapheme / sentence / word variant from the configured granularity.
//
// The text to segment is held as a view. Text handed over as a String is kept alive in
// m_string_storage; text handed over as a Utf16View / Utf32View is stored as given, so
// (presumably, these being view types) the caller must keep the underlying buffer alive
// for as long as boundaries are queried.
class SegmenterImpl final : public Segmenter {
public:
    explicit SegmenterImpl(SegmenterGranularity segmenter_granularity)
        : Segmenter(segmenter_granularity)
    {
    }

    virtual ~SegmenterImpl() override = default;

    // Creates a fresh segmenter with the same granularity. Neither the segmented text nor the
    // current boundary is carried over to the clone.
    virtual NonnullOwnPtr<Segmenter> clone() const override
    {
        return make<SegmenterImpl>(m_segmenter_granularity);
    }

    // Takes ownership of `text` and makes it the segmented text.
    virtual void set_segmented_text(String text) override
    {
        // The view is taken from the member, not the parameter, so it refers to storage we own.
        m_string_storage = move(text);
        m_segmented_text = Utf8View { m_string_storage.code_points() };

        // A boundary computed against the previous text is meaningless for the new one.
        m_current_boundary = 0;
    }

    // Makes `text` the segmented text without copying the underlying code units.
    virtual void set_segmented_text(Utf16View const& text) override
    {
        m_segmented_text = text;
        m_current_boundary = 0;
    }

    // UTF-32 counterpart of the setter above. Not part of the Segmenter interface, so it is
    // only reachable through a SegmenterImpl directly.
    void set_segmented_text(Utf32View const& text)
    {
        m_segmented_text = text;
        m_current_boundary = 0;
    }

    // The boundary most recently found by previous_boundary() / next_boundary(), or 0 if no
    // search has succeeded since the text was last set.
    virtual size_t current_boundary() override
    {
        return m_current_boundary;
    }

    // Finds the closest boundary before `boundary` (or at `boundary`, when inclusive). On success
    // the result also becomes the current boundary; on failure the current boundary is unchanged.
    virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
    {
        // The LibUnicode lookup is treated as an exclusive search (that is what the +1 adjustment
        // implies), so an inclusive search is an exclusive one starting one position later.
        // Saturate instead of wrapping around to 0, which would make the search find nothing.
        constexpr auto max_index = ~static_cast<size_t>(0);
        if (inclusive == Inclusive::Yes && boundary != max_index)
            ++boundary;

        auto new_boundary = m_segmented_text.visit([&](auto const& text) { return previous_segmentation_boundary(text, boundary); });
        return remember_boundary(new_boundary);
    }

    // Finds the closest boundary after `boundary` (or at `boundary`, when inclusive). On success
    // the result also becomes the current boundary; on failure the current boundary is unchanged.
    virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
    {
        if (inclusive == Inclusive::Yes) {
            // "At or after index 0" is simply the first boundary of the text. Decrementing here
            // would wrap around to the largest size_t and make the search come back empty.
            if (boundary == 0)
                return remember_boundary(first_boundary());

            // Same exclusive-search reasoning as in previous_boundary(), mirrored.
            --boundary;
        }

        auto new_boundary = m_segmented_text.visit([&](auto const& text) { return next_segmentation_boundary(text, boundary); });
        return remember_boundary(new_boundary);
    }

    // The for_each_boundary() overloads segment the text passed in; they neither read nor modify
    // the text installed via set_segmented_text() or the current boundary.
    virtual void for_each_boundary(String text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text.code_points(), move(callback));
    }

    virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text, move(callback));
    }

    virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
    {
        for_each_segmentation_boundary(text, move(callback));
    }

    virtual bool is_current_boundary_word_like() const override
    {
        // FIXME: Implement one day.
        return false;
    }

private:
    // Stores a successful search result as the current boundary and passes the result through.
    Optional<size_t> remember_boundary(Optional<size_t> new_boundary)
    {
        if (new_boundary.has_value())
            m_current_boundary = new_boundary.value();
        return new_boundary;
    }

    // Returns the first boundary of the segmented text, or nothing if no boundary is reported.
    Optional<size_t> first_boundary()
    {
        Optional<size_t> first;

        m_segmented_text.visit([&](auto const& text) {
            for_each_segmentation_boundary(text, [&first](size_t candidate) {
                first = candidate;
                return IterationDecision::Break;
            });
        });

        return first;
    }

    // Dispatches a "previous boundary" lookup to the LibUnicode routine for our granularity.
    template<class T>
    Optional<size_t> previous_segmentation_boundary(T const& text, size_t boundary)
    {
        switch (segmenter_granularity()) {
        case SegmenterGranularity::Grapheme:
            return Unicode::previous_grapheme_segmentation_boundary(text, boundary);
        case SegmenterGranularity::Sentence:
            return Unicode::previous_sentence_segmentation_boundary(text, boundary);
        case SegmenterGranularity::Word:
            return Unicode::previous_word_segmentation_boundary(text, boundary);
        }
        VERIFY_NOT_REACHED();
    }

    // Dispatches a "next boundary" lookup to the LibUnicode routine for our granularity.
    template<class T>
    Optional<size_t> next_segmentation_boundary(T const& text, size_t boundary)
    {
        switch (segmenter_granularity()) {
        case SegmenterGranularity::Grapheme:
            return Unicode::next_grapheme_segmentation_boundary(text, boundary);
        case SegmenterGranularity::Sentence:
            return Unicode::next_sentence_segmentation_boundary(text, boundary);
        case SegmenterGranularity::Word:
            return Unicode::next_word_segmentation_boundary(text, boundary);
        }
        VERIFY_NOT_REACHED();
    }

    // Dispatches a full boundary walk to the LibUnicode routine for our granularity.
    template<class T>
    void for_each_segmentation_boundary(T const& text, SegmentationCallback callback)
    {
        switch (segmenter_granularity()) {
        case SegmenterGranularity::Grapheme:
            Unicode::for_each_grapheme_segmentation_boundary(text, move(callback));
            break;
        case SegmenterGranularity::Sentence:
            Unicode::for_each_sentence_segmentation_boundary(text, move(callback));
            break;
        case SegmenterGranularity::Word:
            Unicode::for_each_word_segmentation_boundary(text, move(callback));
            break;
        }
    }

    size_t m_current_boundary { 0 };

    // Backing storage for m_segmented_text when the text was supplied as a String.
    String m_string_storage;
    Variant<Utf8View, Utf16View, Utf32View> m_segmented_text { Utf8View {} };
};
// Convenience overload: creates a segmenter for the default locale.
NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
{
    auto const locale = default_locale();
    return create(locale, segmenter_granularity);
}
// Creates a segmenter with the requested granularity. The locale is accepted for API
// compatibility but does not influence segmentation yet.
NonnullOwnPtr<Segmenter> Segmenter::create(StringView locale, SegmenterGranularity segmenter_granularity)
{
    // FIXME: Implement locale-specific segmentation.
    static_cast<void>(locale);

    auto segmenter = make<SegmenterImpl>(segmenter_granularity);
    return segmenter;
}
}

View file

@ -0,0 +1,62 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Function.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/Optional.h>
#include <AK/StringView.h>
namespace Locale {
// The unit a Segmenter splits text into. The enumerators correspond to the strings
// "grapheme", "sentence" and "word" handled by segmenter_granularity_from_string() and
// segmenter_granularity_to_string().
enum class SegmenterGranularity {
Grapheme,
Sentence,
Word,
};
SegmenterGranularity segmenter_granularity_from_string(StringView);
StringView segmenter_granularity_to_string(SegmenterGranularity);
// Abstract interface for finding segment boundaries (graphemes, sentences or words) in text.
// Instances are obtained through the create() factories.
class Segmenter {
public:
    // Selects whether a boundary search may consider the queried index itself.
    // NOTE(review): exact semantics are defined by the implementation - confirm there.
    enum class Inclusive {
        No,
        Yes,
    };

    // Invoked with each boundary offset; returns an IterationDecision to continue or stop the walk.
    using SegmentationCallback = Function<IterationDecision(size_t)>;

    virtual ~Segmenter() = default;

    // Factories. The first overload takes no locale; the second accepts one explicitly.
    static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity);
    static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);

    // The granularity this segmenter was created with.
    SegmenterGranularity segmenter_granularity() const
    {
        return m_segmenter_granularity;
    }

    // Produces a new, independently owned segmenter.
    virtual NonnullOwnPtr<Segmenter> clone() const = 0;

    // Installs the text that previous_boundary() / next_boundary() operate on.
    virtual void set_segmented_text(String) = 0;
    virtual void set_segmented_text(Utf16View const&) = 0;

    virtual size_t current_boundary() = 0;

    // Boundary searches relative to `index`; an empty Optional means no boundary was found.
    virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0;
    virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0;

    // Walks the boundaries of the given text, handing each one to the callback.
    virtual void for_each_boundary(String, SegmentationCallback) = 0;
    virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
    virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;

    virtual bool is_current_boundary_word_like() const = 0;

protected:
    // Only subclasses may construct a Segmenter; everyone else goes through create().
    explicit Segmenter(SegmenterGranularity segmenter_granularity)
        : m_segmenter_granularity(segmenter_granularity)
    {
    }

    SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
};
}