ladybird/Userland/Libraries/LibGUI/INILexer.h
Sam Atkins ae6a84c261 LibGUI: Lex INI files as Utf8
Iterating byte by byte meant that the column positions assigned to INI
tokens would be off if there were any multi-byte codepoints. Using a
Utf8View means these positions refer to whole codepoints instead, and
the column positions match what GUI::TextEditor expects. :^)

Fixes #12706.
2023-01-11 10:24:51 +01:00

70 lines
1.2 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2020, Hüseyin Aslıtürk <asliturk@hotmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Utf8View.h>
namespace GUI {
// X-macro listing every kind of INI token.
// To use it, define __TOKEN(name) to whatever should be generated per entry,
// expand FOR_EACH_TOKEN_TYPE, then #undef __TOKEN again. IniToken below does
// this twice: once to generate the Type enumerators and once to generate the
// switch cases in to_string().
// This macro is #undef'd at the end of the header, so it is only usable here.
#define FOR_EACH_TOKEN_TYPE \
__TOKEN(Unknown) \
__TOKEN(Comment) \
__TOKEN(Whitespace) \
__TOKEN(Section) \
__TOKEN(LeftBracket) \
__TOKEN(RightBracket) \
__TOKEN(Name) \
__TOKEN(Value) \
__TOKEN(Equal)
// A (line, column) location in the text being lexed.
//
// Both fields default to zero so that a default-constructed IniPosition, and
// therefore the m_start / m_end members of a default-constructed IniToken,
// hold a well-defined value rather than indeterminate memory.
// The type remains an aggregate, so `IniPosition { line, column }` still works.
struct IniPosition {
    size_t line { 0 };
    size_t column { 0 };
};
// A single lexed INI token: its kind plus the start and end positions recorded for it.
struct IniToken {
    // One enumerator per entry in FOR_EACH_TOKEN_TYPE.
    enum class Type {
#define __TOKEN(token_name) token_name,
        FOR_EACH_TOKEN_TYPE
#undef __TOKEN
    };

    // Kind of this token; Unknown unless set otherwise.
    Type m_type { Type::Unknown };
    // Start and end positions recorded for this token.
    IniPosition m_start;
    IniPosition m_end;

    // Returns the name of m_type's enumerator as a string literal, e.g. "Comment".
    char const* to_string() const
    {
        switch (m_type) {
#define __TOKEN(token_name) \
    case Type::token_name:  \
        return #token_name;
            FOR_EACH_TOKEN_TYPE
#undef __TOKEN
        }
        // Every enumerator returns above, so this is only hit if m_type holds
        // a value that is not a declared enumerator.
        VERIFY_NOT_REACHED();
    }
};
// Lexer that turns INI text into a list of IniToken values.
// NOTE(review): the input is held as a Utf8View and walked with a
// Utf8CodePointIterator; per the change description this is so that token
// columns count whole code points rather than bytes. The implementation is not
// in this header, so confirm against the .cpp.
class IniLexer {
public:
// Constructs a lexer for the given text. The constructor is not `explicit`,
// so a StringView converts to an IniLexer implicitly.
IniLexer(StringView);
// Returns the tokens produced for the input.
Vector<IniToken> lex();
private:
// Returns a u32 for the item `offset` positions ahead (default: the current
// one). Being const, it does not advance the lexer.
// NOTE(review): presumably the value is a code point, and some sentinel is
// returned past the end of input; verify in the implementation.
u32 peek(size_t offset = 0) const;
// Presumably returns the current code point and advances m_iterator and
// m_position past it; verify in the implementation.
u32 consume();
// The text being lexed, viewed as UTF-8.
Utf8View m_input;
// Read cursor, presumably into m_input. It is declared after m_input and
// is therefore initialized after it; keep this order.
Utf8CodePointIterator m_iterator;
// Line/column tracked while lexing; starts at { 0, 0 }.
IniPosition m_position { 0, 0 };
};
}
#undef FOR_EACH_TOKEN_TYPE