[PATCH zig-scfg] Parse unicode codepoints as per spec
Export this patch
---
scfg.zig | 6 ++++++
src/Tokenizer.zig | 15 +++++++++++++--
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/scfg.zig b/scfg.zig
index 0754316..d2abcbd 100644
--- a/scfg.zig
+++ b/scfg.zig
@@ -70,6 +70,8 @@ test "parse: directives with a block" {
const source =
\\model A2 {
\\ speed 250
+ \\ type 自動車
+ \\ misc ?
\\}
\\model A3 {
\\ speed 270
@@ -87,6 +89,10 @@ test "parse: directives with a block" {
try testing.expectEqualStrings("A2", root[0].params[0]);
try testing.expectEqualStrings("speed", root[0].blocks[0][0].name);
try testing.expectEqualStrings("250", root[0].blocks[0][0].params[0]);
+ try testing.expectEqualStrings("type", root[0].blocks[0][1].name);
+ try testing.expectEqualStrings("自動車", root[0].blocks[0][1].params[0]);
+ try testing.expectEqualStrings("misc", root[0].blocks[0][2].name);
+ try testing.expectEqualStrings("?", root[0].blocks[0][2].params[0]);
try testing.expectEqualStrings("model", root[1].name);
try testing.expectEqual(@as(usize, 1), root[1].params.len);
diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig
index e0e724e..94f6b05 100644
--- a/src/Tokenizer.zig
+++ b/src/Tokenizer.zig
@@ -1,4 +1,5 @@
const std = @import("std");
+const unicode = std.unicode;
const testing = std.testing;
const Tokenizer = @This();
@@ -48,9 +49,15 @@ pub fn next(self: *Tokenizer) Token {
while (true) : (self.index += 1) {
const char = self.source[self.index];
+ const codepoint = unicode.utf8Decode(&[_]u8{char}) catch {
+ token.tag = .invalid;
+ token.loc.end = self.index;
+ self.index += 1;
+ return token;
+ };
switch (state) {
- .start => switch (char) {
+ .start => switch (codepoint) {
0 => {
break;
},
@@ -69,7 +76,11 @@ pub fn next(self: *Tokenizer) Token {
state = .dquote_string;
token.tag = .dquote_string;
},
- 'a'...'z', 'A'...'Z', '0'...'9', '_' => {
+ 0x21, 0x23...0x26, 0x28...0x5B, 0x5D...0x7A, 0x7C, 0x7E, 0x80...0x10FFFF => {
+ // Valid codepoint above...
+ const step = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
+
+ self.index += step - 1;
state = .bare_string;
token.tag = .bare_string;
},
--
2.39.1
zig-scfg/patches/.build.yml: SUCCESS in 34s
[Parse unicode codepoints as per spec][0] from [Gabriel Sanches][1]
[0]: https://lists.sr.ht/~andreafeletto/public-inbox/patches/38411
[1]: mailto:gabriel@gsr.dev
✓ #927792 SUCCESS zig-scfg/patches/.build.yml https://builds.sr.ht/~andreafeletto/job/927792
Thanks for the patch, but I ended up taking a more radical approach in order to support multi-byte codepoints.