mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-22 09:21:57 -05:00
LibLocale: Add Segmenter.h
This is aspirationally in LibLocale in hopes that it does locale-dependent segmentation one day. For now, it just wraps LibUnicode's Segmentation.h. Everything except Segmenter.cpp is from the first commit of LadybirdBrowser/ladybird#218, by trflynn. Co-authored-by: Tim Flynn <trflynn89@serenityos.org>
This commit is contained in:
parent
d77876c6aa
commit
71ea3c53a6
7 changed files with 372 additions and 0 deletions
|
@ -178,6 +178,7 @@ source_set("LibLocale") {
|
|||
"NumberFormat.cpp",
|
||||
"PluralRules.cpp",
|
||||
"RelativeTimeFormat.cpp",
|
||||
"Segmenter.cpp",
|
||||
]
|
||||
deps = [
|
||||
"//AK",
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
set(TEST_SOURCES
|
||||
TestDateTimeFormat.cpp
|
||||
TestLocale.cpp
|
||||
TestSegmenter.cpp
|
||||
)
|
||||
|
||||
foreach(source IN LISTS TEST_SOURCES)
|
||||
|
|
128
Tests/LibLocale/TestSegmenter.cpp
Normal file
128
Tests/LibLocale/TestSegmenter.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTest/TestCase.h>
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibLocale/Segmenter.h>
|
||||
|
||||
template<size_t N>
|
||||
static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);
|
||||
|
||||
segmenter->for_each_boundary(MUST(String::from_utf8(string)), [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Grapheme cluster boundaries for ASCII, line breaks, Hangul syllable sequences and emoji.
// All expected values are UTF-8 byte offsets into the input string.
TEST_CASE(grapheme_segmentation)
{
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Grapheme);

    // An empty string has no boundaries, so the callback must never be invoked.
    segmenter->for_each_boundary(String {}, [&](auto i) {
        dbgln("{}", i);
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_grapheme_segmentation("a"sv, { 0u, 1u });
    test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
    test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });

    // "\r\n" stays together as one cluster; "\n\r" does not.
    test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
    test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
    test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });

    // Hangul: each of these code points is 3 bytes in UTF-8.
    test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
    test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
    test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });

    test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji sequences below depend on invisible code points (ZERO WIDTH JOINER U+200D and
    // VARIATION SELECTOR-16 U+FE0F). They are written as escapes so that editors and tooling cannot
    // silently drop them; the expected offsets only add up when they are present.
    //
    // Family: man ZWJ woman ZWJ girl ZWJ boy = 4 + 3 + 4 + 3 + 4 + 3 + 4 = 25 bytes.
    test_grapheme_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // Couple: woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone
    // = 4 + 4 + 3 + 3 + 3 + 3 + 4 + 4 = 28 bytes.
    test_grapheme_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });
}
|
||||
|
||||
// Grapheme boundaries around Indic conjuncts. Code points used (each is 3 bytes in UTF-8, which is
// why every expected offset is a multiple of 3):
//   U+0915 / U+0916  Devanagari letters KA / KHA
//   U+094D           Devanagari sign virama
//   U+09BC / U+09CD  Bengali sign nukta / Bengali sign virama
// NOTE(review): this presumably exercises UAX #29 rule GB9c (no break inside
// consonant + linker + consonant sequences); confirm against the LibUnicode implementation.
TEST_CASE(grapheme_segmentation_indic_conjunct_break)
{
    // Without a virama between them, consonants are separate clusters.
    test_grapheme_segmentation("\u0915"sv, { 0u, 3u });
    test_grapheme_segmentation("\u0915a"sv, { 0u, 3u, 4u });
    test_grapheme_segmentation("\u0915\u0916"sv, { 0u, 3u, 6u });

    // consonant + virama + consonant forms a single cluster.
    test_grapheme_segmentation("\u0915\u094D\u0916"sv, { 0u, 9u });

    // One nukta/virama pair before or after the Devanagari virama.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u0916"sv, { 0u, 15u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u0916"sv, { 0u, 15u });

    // Two nukta/virama pairs in every position relative to the Devanagari virama.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u0916"sv, { 0u, 21u });
    test_grapheme_segmentation("\u0915\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 21u });

    // Three pairs.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u0916"sv, { 0u, 27u });
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 27u });

    // Four pairs, two on each side of the Devanagari virama.
    test_grapheme_segmentation("\u0915\u09BC\u09CD\u09BC\u09CD\u094D\u09BC\u09CD\u09BC\u09CD\u0916"sv, { 0u, 33u });
}
|
||||
|
||||
template<size_t N>
|
||||
static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
|
||||
{
|
||||
Vector<size_t> boundaries;
|
||||
auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);
|
||||
|
||||
segmenter->for_each_boundary(MUST(String::from_utf8(string)), [&](auto boundary) {
|
||||
boundaries.append(boundary);
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
|
||||
}
|
||||
|
||||
// Word boundaries for plain words, whitespace runs, emoji, numbers, and quoted/apostrophised text.
// All expected values are UTF-8 byte offsets into the input string.
TEST_CASE(word_segmentation)
{
    auto segmenter = Locale::Segmenter::create(Locale::SegmenterGranularity::Word);

    // An empty string has no boundaries, so the callback must never be invoked.
    segmenter->for_each_boundary(String {}, [&](auto) {
        VERIFY_NOT_REACHED();
        return IterationDecision::Break;
    });

    test_word_segmentation("a"sv, { 0u, 1u });
    test_word_segmentation("ab"sv, { 0u, 2u });
    test_word_segmentation("abc"sv, { 0u, 3u });

    test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
    // TWO spaces between the words (6 bytes in total): a run of spaces is a single segment.
    // With only one space this input would be identical to the case above and could not end at 6.
    test_word_segmentation("ab  cd"sv, { 0u, 2u, 4u, 6u });
    test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
    test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
    test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });

    test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });

    // The emoji sequences below depend on invisible code points (ZERO WIDTH JOINER U+200D and
    // VARIATION SELECTOR-16 U+FE0F). They are written as escapes so that editors and tooling cannot
    // silently drop them; the expected offsets only add up when they are present.
    //
    // Family: man ZWJ woman ZWJ girl ZWJ boy = 4 + 3 + 4 + 3 + 4 + 3 + 4 = 25 bytes.
    test_word_segmentation("a\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466b"sv, { 0u, 1u, 26u, 27u });
    // Couple: woman + skin tone, ZWJ, heavy black heart + VS16, ZWJ, man + skin tone
    // = 4 + 4 + 3 + 3 + 3 + 3 + 4 + 4 = 28 bytes.
    test_word_segmentation("a\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F468\U0001F3FBb"sv, { 0u, 1u, 29u, 30u });

    // A '.' between digits or letters does not split the word.
    test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
    test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
    test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });

    // An apostrophe inside a word does not split it; surrounding quotes are their own segments.
    test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
    test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });

    // Curly quotes and the typographic apostrophe are 3 bytes each in UTF-8.
    test_word_segmentation(
        "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
        { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
}
|
|
@ -15,6 +15,7 @@ set(SOURCES
|
|||
NumberFormat.cpp
|
||||
PluralRules.cpp
|
||||
RelativeTimeFormat.cpp
|
||||
Segmenter.cpp
|
||||
)
|
||||
|
||||
serenity_lib(LibLocale locale)
|
||||
|
|
|
@ -45,6 +45,8 @@ enum class Weekday : u8;
|
|||
enum class WeekendEndRegion : u8;
|
||||
enum class WeekendStartRegion : u8;
|
||||
|
||||
class Segmenter;
|
||||
|
||||
struct CalendarFormat;
|
||||
struct CalendarPattern;
|
||||
struct CalendarRangePattern;
|
||||
|
|
177
Userland/Libraries/LibLocale/Segmenter.cpp
Normal file
177
Userland/Libraries/LibLocale/Segmenter.cpp
Normal file
|
@ -0,0 +1,177 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <LibLocale/Locale.h>
|
||||
#include <LibLocale/Segmenter.h>
|
||||
#include <LibUnicode/Segmentation.h>
|
||||
|
||||
namespace Locale {
|
||||
|
||||
// Maps "grapheme", "sentence" or "word" to the matching SegmenterGranularity.
// Any other input is treated as a programming error and hits VERIFY_NOT_REACHED().
SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity)
{
    struct NamedGranularity {
        StringView name;
        SegmenterGranularity granularity;
    };

    static constexpr NamedGranularity known_granularities[] = {
        { "grapheme"sv, SegmenterGranularity::Grapheme },
        { "sentence"sv, SegmenterGranularity::Sentence },
        { "word"sv, SegmenterGranularity::Word },
    };

    for (auto const& candidate : known_granularities) {
        if (segmenter_granularity == candidate.name)
            return candidate.granularity;
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Returns the lowercase name of a granularity: "grapheme", "sentence" or "word".
// A value outside the enumeration hits VERIFY_NOT_REACHED().
StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity)
{
    // No default label on purpose, so the compiler can flag enumerators that are not handled here.
    switch (segmenter_granularity) {
    case SegmenterGranularity::Word:
        return "word"sv;
    case SegmenterGranularity::Sentence:
        return "sentence"sv;
    case SegmenterGranularity::Grapheme:
        return "grapheme"sv;
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
class SegmenterImpl : public Segmenter {
|
||||
public:
|
||||
SegmenterImpl(SegmenterGranularity segmenter_granularity)
|
||||
: Segmenter(segmenter_granularity)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~SegmenterImpl() override = default;
|
||||
|
||||
virtual NonnullOwnPtr<Segmenter> clone() const override
|
||||
{
|
||||
return make<SegmenterImpl>(m_segmenter_granularity);
|
||||
}
|
||||
|
||||
virtual void set_segmented_text(String text) override
|
||||
{
|
||||
m_string_storage = move(text);
|
||||
m_segmented_text = Utf8View { m_string_storage.code_points() };
|
||||
}
|
||||
|
||||
virtual void set_segmented_text(Utf16View const& text) override
|
||||
{
|
||||
m_segmented_text = text;
|
||||
}
|
||||
|
||||
void set_segmented_text(Utf32View const& text)
|
||||
{
|
||||
m_segmented_text = text;
|
||||
}
|
||||
|
||||
virtual size_t current_boundary() override
|
||||
{
|
||||
return m_current_boundary;
|
||||
}
|
||||
|
||||
virtual Optional<size_t> previous_boundary(size_t boundary, Inclusive inclusive) override
|
||||
{
|
||||
if (inclusive == Inclusive::Yes)
|
||||
++boundary;
|
||||
auto new_boundary = m_segmented_text.visit([&](auto const& text) { return previous_segmentation_boundary(text, boundary); });
|
||||
if (new_boundary.has_value())
|
||||
m_current_boundary = new_boundary.value();
|
||||
return new_boundary;
|
||||
}
|
||||
|
||||
virtual Optional<size_t> next_boundary(size_t boundary, Inclusive inclusive) override
|
||||
{
|
||||
if (inclusive == Inclusive::Yes)
|
||||
--boundary;
|
||||
auto new_boundary = m_segmented_text.visit([&](auto const& text) { return next_segmentation_boundary(text, boundary); });
|
||||
if (new_boundary.has_value())
|
||||
m_current_boundary = new_boundary.value();
|
||||
return new_boundary;
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(String text, SegmentationCallback callback) override
|
||||
{
|
||||
for_each_segmentation_boundary(text.code_points(), move(callback));
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(Utf16View const& text, SegmentationCallback callback) override
|
||||
{
|
||||
for_each_segmentation_boundary(text, move(callback));
|
||||
}
|
||||
|
||||
virtual void for_each_boundary(Utf32View const& text, SegmentationCallback callback) override
|
||||
{
|
||||
for_each_segmentation_boundary(text, move(callback));
|
||||
}
|
||||
|
||||
virtual bool is_current_boundary_word_like() const override
|
||||
{
|
||||
// FIXME: Implement one day.
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
template<class T>
|
||||
Optional<size_t> previous_segmentation_boundary(T const& text, size_t boundary)
|
||||
{
|
||||
switch (segmenter_granularity()) {
|
||||
case SegmenterGranularity::Grapheme:
|
||||
return Unicode::previous_grapheme_segmentation_boundary(text, boundary);
|
||||
case SegmenterGranularity::Sentence:
|
||||
return Unicode::previous_sentence_segmentation_boundary(text, boundary);
|
||||
case SegmenterGranularity::Word:
|
||||
return Unicode::previous_word_segmentation_boundary(text, boundary);
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
template<class T>
|
||||
Optional<size_t> next_segmentation_boundary(T const& text, size_t boundary)
|
||||
{
|
||||
switch (segmenter_granularity()) {
|
||||
case SegmenterGranularity::Grapheme:
|
||||
return Unicode::next_grapheme_segmentation_boundary(text, boundary);
|
||||
case SegmenterGranularity::Sentence:
|
||||
return Unicode::next_sentence_segmentation_boundary(text, boundary);
|
||||
case SegmenterGranularity::Word:
|
||||
return Unicode::next_word_segmentation_boundary(text, boundary);
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void for_each_segmentation_boundary(T const& text, SegmentationCallback callback)
|
||||
{
|
||||
switch (segmenter_granularity()) {
|
||||
case SegmenterGranularity::Grapheme:
|
||||
Unicode::for_each_grapheme_segmentation_boundary(text, move(callback));
|
||||
break;
|
||||
case SegmenterGranularity::Sentence:
|
||||
Unicode::for_each_sentence_segmentation_boundary(text, move(callback));
|
||||
break;
|
||||
case SegmenterGranularity::Word:
|
||||
Unicode::for_each_word_segmentation_boundary(text, move(callback));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
size_t m_current_boundary { 0 };
|
||||
String m_string_storage;
|
||||
Variant<Utf8View, Utf16View, Utf32View> m_segmented_text { Utf8View {} };
|
||||
};
|
||||
|
||||
// Convenience overload: creates a segmenter for whatever default_locale() returns.
NonnullOwnPtr<Segmenter> Segmenter::create(SegmenterGranularity segmenter_granularity)
{
    auto locale = default_locale();
    return create(locale, segmenter_granularity);
}
|
||||
|
||||
// Creates a segmenter with the given granularity. The locale is accepted but currently ignored.
NonnullOwnPtr<Segmenter> Segmenter::create([[maybe_unused]] StringView locale, SegmenterGranularity segmenter_granularity)
{
    // FIXME: Implement locale-specific segmentation.
    return make<SegmenterImpl>(segmenter_granularity);
}
|
||||
|
||||
}
|
62
Userland/Libraries/LibLocale/Segmenter.h
Normal file
62
Userland/Libraries/LibLocale/Segmenter.h
Normal file
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Function.h>
|
||||
#include <AK/NonnullOwnPtr.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/StringView.h>
|
||||
|
||||
namespace Locale {
|
||||
|
||||
enum class SegmenterGranularity {
|
||||
Grapheme,
|
||||
Sentence,
|
||||
Word,
|
||||
};
|
||||
SegmenterGranularity segmenter_granularity_from_string(StringView);
|
||||
StringView segmenter_granularity_to_string(SegmenterGranularity);
|
||||
|
||||
class Segmenter {
|
||||
public:
|
||||
static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity);
|
||||
static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);
|
||||
virtual ~Segmenter() = default;
|
||||
|
||||
SegmenterGranularity segmenter_granularity() const { return m_segmenter_granularity; }
|
||||
|
||||
virtual NonnullOwnPtr<Segmenter> clone() const = 0;
|
||||
|
||||
virtual void set_segmented_text(String) = 0;
|
||||
virtual void set_segmented_text(Utf16View const&) = 0;
|
||||
|
||||
virtual size_t current_boundary() = 0;
|
||||
|
||||
enum class Inclusive {
|
||||
No,
|
||||
Yes,
|
||||
};
|
||||
virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0;
|
||||
virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0;
|
||||
|
||||
using SegmentationCallback = Function<IterationDecision(size_t)>;
|
||||
virtual void for_each_boundary(String, SegmentationCallback) = 0;
|
||||
virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
|
||||
virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;
|
||||
|
||||
virtual bool is_current_boundary_word_like() const = 0;
|
||||
|
||||
protected:
|
||||
explicit Segmenter(SegmenterGranularity segmenter_granularity)
|
||||
: m_segmenter_granularity(segmenter_granularity)
|
||||
{
|
||||
}
|
||||
|
||||
SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
|
||||
};
|
||||
|
||||
}
|
Loading…
Reference in a new issue