~andreafeletto/public-inbox

zig-scfg: Parse unicode codepoints as per spec v1 SUPERSEDED

Gabriel Sanches: 1
 Parse unicode codepoints as per spec

 2 files changed, 19 insertions(+), 2 deletions(-)
#927792 .build.yml success
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~andreafeletto/public-inbox/patches/38411/mbox | git am -3
Learn more about email & git

[PATCH zig-scfg] Parse unicode codepoints as per spec Export this patch

---
 scfg.zig          |  6 ++++++
 src/Tokenizer.zig | 15 +++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/scfg.zig b/scfg.zig
index 0754316..d2abcbd 100644
--- a/scfg.zig
+++ b/scfg.zig
@@ -70,6 +70,8 @@ test "parse: directives with a block" {
    const source =
        \\model A2 {
        \\  speed 250
+        \\  type 自動車
+        \\  misc ?
        \\}
        \\model A3 {
        \\  speed 270
@@ -87,6 +89,10 @@ test "parse: directives with a block" {
    try testing.expectEqualStrings("A2", root[0].params[0]);
    try testing.expectEqualStrings("speed", root[0].blocks[0][0].name);
    try testing.expectEqualStrings("250", root[0].blocks[0][0].params[0]);
+    try testing.expectEqualStrings("type", root[0].blocks[0][1].name);
+    try testing.expectEqualStrings("自動車", root[0].blocks[0][1].params[0]);
+    try testing.expectEqualStrings("misc", root[0].blocks[0][2].name);
+    try testing.expectEqualStrings("?", root[0].blocks[0][2].params[0]);

    try testing.expectEqualStrings("model", root[1].name);
    try testing.expectEqual(@as(usize, 1), root[1].params.len);
diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig
index e0e724e..94f6b05 100644
--- a/src/Tokenizer.zig
+++ b/src/Tokenizer.zig
@@ -1,4 +1,5 @@
const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;

const Tokenizer = @This();
@@ -48,9 +49,15 @@ pub fn next(self: *Tokenizer) Token {

    while (true) : (self.index += 1) {
        const char = self.source[self.index];
+        const codepoint = unicode.utf8Decode(&[_]u8{char}) catch {
+            token.tag = .invalid;
+            token.loc.end = self.index;
+            self.index += 1;
+            return token;
+        };

        switch (state) {
-            .start => switch (char) {
+            .start => switch (codepoint) {
                0 => {
                    break;
                },
@@ -69,7 +76,11 @@ pub fn next(self: *Tokenizer) Token {
                    state = .dquote_string;
                    token.tag = .dquote_string;
                },
-                'a'...'z', 'A'...'Z', '0'...'9', '_' => {
+                0x21, 0x23...0x26, 0x28...0x5B, 0x5D...0x7A, 0x7C, 0x7E, 0x80...0x10FFFF => {
+                    // Valid codepoint above...
+                    const step = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+
+                    self.index += step - 1;
                    state = .bare_string;
                    token.tag = .bare_string;
                },
-- 
2.39.1
zig-scfg/patches/.build.yml: SUCCESS in 34s

[Parse unicode codepoints as per spec][0] from [Gabriel Sanches][1]

[0]: https://lists.sr.ht/~andreafeletto/public-inbox/patches/38411
[1]: mailto:gabriel@gsr.dev

✓ #927792 SUCCESS zig-scfg/patches/.build.yml https://builds.sr.ht/~andreafeletto/job/927792
Thanks for the patch, but I ended up choosing a more radical approach in order to allow multibyte codepoints.