4 2

[PATCH v2] Parse arithmetic expressions with shifts

Cristian Adrián Ontivero
Details
Message ID
<20190125085755.5290-1-cristianontivero@gmail.com>
Download raw message
Patch +118 -5
We introduce the function arithmetic_word() to parse arithmetic
expressions instead of reusing the general word(), and generalize
word_list() to receive a pointer to function, so that word_list() may be
used to parse a list of whatever type of word we need.

This fixes #51, and enables properly parsing parenthesized expressions
inside arithmetic expressions, e.g. $(((2+1)-1)).
---

This is mostly a working proof of concept. There is (as might be expected) quite
a bit of similarity between word() and arithmetic_word(), but I think that it
would be better in the long term to remove the "end" parameter, and have a
couple of different *_word() functions that are called whenever appropriate.

As discussed, the alternative would be adding a mrsh_word_type parameter to
word() and word_list(), but I think that this would eventually lead to a word()
function with a lot of if-statements for each distinct context, and might lead
to a more complex word() function (although preventing the repetition intrinsic
to the *_word() functions alternative).

What do you think?

 include/parser.h |   3 ++
 parser/word.c    | 120 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/include/parser.h b/include/parser.h
index 56e549f..d4b32b9 100644
--- a/include/parser.h
+++ b/include/parser.h
@@ -63,6 +63,8 @@ struct mrsh_parser {
 	void *alias_user_data;
 };
 
+typedef struct mrsh_word * (*word_fn)(struct mrsh_parser *, char end);
+
 size_t parser_peek(struct mrsh_parser *state, char *buf, size_t size);
 char parser_peek_char(struct mrsh_parser *state);
 size_t parser_read(struct mrsh_parser *state, char *buf, size_t size);
@@ -90,5 +92,6 @@ size_t peek_word(struct mrsh_parser *state, char end);
 struct mrsh_word *expect_dollar(struct mrsh_parser *state);
 struct mrsh_word *back_quotes(struct mrsh_parser *state);
 struct mrsh_word *word(struct mrsh_parser *state, char end);
+struct mrsh_word *arithmetic_word(struct mrsh_parser *state, char end);
 
 #endif
diff --git a/parser/word.c b/parser/word.c
index 3abede3..17b327a 100644
--- a/parser/word.c
+++ b/parser/word.c
@@ -173,7 +173,7 @@ char *read_token(struct mrsh_parser *state, size_t len,
 	return tok;
 }
 
-static struct mrsh_word *word_list(struct mrsh_parser *state, char end) {
+static struct mrsh_word *word_list(struct mrsh_parser *state, char end, word_fn fn) {
 	struct mrsh_array children = {0};
 
 	while (true) {
@@ -181,7 +181,7 @@ static struct mrsh_word *word_list(struct mrsh_parser *state, char end) {
 			break;
 		}
 
-		struct mrsh_word *child = word(state, end);
+		struct mrsh_word *child = fn(state, end);
 		if (child == NULL) {
 			break;
 		}
@@ -309,7 +309,7 @@ static struct mrsh_word_parameter *expect_parameter_expression(
 			return NULL;
 		}
 		op_range.end = state->pos;
-		arg = word_list(state, '}');
+		arg = word_list(state, '}', word);
 	}
 
 	struct mrsh_position rbrace_pos = state->pos;
@@ -355,7 +355,7 @@ static struct mrsh_word_arithmetic *expect_word_arithmetic(
 	c = parser_read_char(state);
 	assert(c == '(');
 
-	struct mrsh_word *body = word_list(state, ')');
+	struct mrsh_word *body = word_list(state, ')', arithmetic_word);
 	if (body == NULL) {
 		if (!mrsh_parser_error(state, NULL)) {
 			parser_set_error(state, "expected an arithmetic expression");
@@ -695,7 +695,117 @@ struct mrsh_word *word(struct mrsh_parser *state, char end) {
 	}
 }
 
+/* TODO remove end parameter when no *_word function takes it */
+struct mrsh_word *arithmetic_word(struct mrsh_parser *state, char end) {
+	if (!symbol(state, TOKEN)) {
+		return NULL;
+	}
+
+	char c = parser_peek_char(state);
+	if (is_operator_start(c)) {
+		return NULL;
+	}
+
+	char next[3] = {0};
+	if (c == ')') {
+		parser_peek(state, next, sizeof(*next) * 2);
+		if (!strcmp(next, "))")) {
+			return NULL;
+		}
+	}
+
+	struct mrsh_array children = {0};
+	struct mrsh_buffer buf = {0};
+	struct mrsh_position child_begin = {0};
+
+	while (true) {
+		if (!mrsh_position_valid(&child_begin)) {
+			child_begin = state->pos;
+		}
+
+		parser_peek(state, next, sizeof(*next) * 2);
+		c = next[0];
+		if (c == '\0' || c == '\n' || !strcmp(next, "))")) {
+			break;
+		}
+
+		if (c == '$') {
+			push_buffer_word_string(state, &children, &buf, &child_begin);
+			struct mrsh_word *t = expect_dollar(state);
+			if (t == NULL) {
+				return NULL;
+			}
+			mrsh_array_add(&children, t);
+			continue;
+		}
+
+		if (c == '`') {
+			push_buffer_word_string(state, &children, &buf, &child_begin);
+			struct mrsh_word *t = back_quotes(state);
+			if (t == NULL) {
+				return NULL;
+			}
+			mrsh_array_add(&children, t);
+			continue;
+		}
+
+		// Quoting
+		if (c == '\'') {
+			push_buffer_word_string(state, &children, &buf, &child_begin);
+			struct mrsh_word *t = single_quotes(state);
+			if (t == NULL) {
+				return NULL;
+			}
+			mrsh_array_add(&children, t);
+			continue;
+		}
+		if (c == '"') {
+			push_buffer_word_string(state, &children, &buf, &child_begin);
+			struct mrsh_word *t = double_quotes(state);
+			if (t == NULL) {
+				return NULL;
+			}
+			mrsh_array_add(&children, t);
+			continue;
+		}
+
+		if (c == '\\') {
+			// Unquoted backslash
+			parser_read_char(state);
+			c = parser_peek_char(state);
+			if (c == '\n') {
+				// Continuation line
+				read_continuation_line(state);
+				continue;
+			}
+		} else if (is_operator_start(c) || isblank(c)) {
+			if (strcmp(next, "<<") && strcmp(next, ">>")) {
+				break;
+			}
+			parser_read_char(state);
+			mrsh_buffer_append_char(&buf, c);
+		}
+
+		parser_read_char(state);
+		mrsh_buffer_append_char(&buf, c);
+	}
+
+	push_buffer_word_string(state, &children, &buf, &child_begin);
+	mrsh_buffer_finish(&buf);
+
+	consume_symbol(state);
+
+	if (children.len == 1) {
+		struct mrsh_word *word = children.data[0];
+		mrsh_array_finish(&children); // TODO: don't allocate this array
+		return word;
+	} else {
+		struct mrsh_word_list *wl = mrsh_word_list_create(&children, false);
+		return &wl->word;
+	}
+}
+
 struct mrsh_word *mrsh_parse_word(struct mrsh_parser *state) {
 	parser_begin(state);
-	return word_list(state, 0);
+	return word_list(state, 0, word);
 }
-- 
2.20.1
Details
Message ID
<U7ts9Yh_Z3XOiaWl6AhxSN67-FxIMtbaYeTlvyyyw5DQAS4ikfnS4TYaWhLzU3L__Qchoi8zRWD-ikFm8bmHtmYtoDUqVmfVKpSKHCpQSn0=@emersion.fr>
In-Reply-To
<20190125085755.5290-1-cristianontivero@gmail.com> (view parent)
Download raw message
On Friday, January 25, 2019 9:57 AM, Cristian Adrián Ontivero <cristianontivero@gmail.com> wrote:
> We introduce the function arithmetic_word() to parse arithmetic
> expressions instead of reusing the general word(), and generalize
> word_list() to receive a pointer to function, so that word_list() may be
> used to parse a list of whatever type of word we need.
>
> This fixes #51, and enables properly parsing parenthesized expressions
> inside arithmetic expressions, e.g. $(((2+1)-1)).
> ---
>
> This is mostly a working proof of concept. There is (as might be expected) quite
> a bit of similarity between word() and arithmetic_word(), but I think that it
> would be better in the long term to remove the "end" parameter, and have a
> couple of different *_word() functions that are called whenever appropriate.
>
> As discussed, the alternative would be adding a mrsh_word_type parameter to
> word() and word_list(), but I think that this would eventually lead to a word()
> function with a lot of if-statements for each distinct context, and might lead
> to a more complex word() function (although preventing the repetition intrinsic
> to the *_word() functions alternative).
>
> What do you think?

+1, I like it. Yeah, I agree, the `end` parameter should go away.

Is there anything that prevents this from being merged?

>
>  include/parser.h |   3 ++
>  parser/word.c    | 120 +++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 118 insertions(+), 5 deletions(-)
>
> diff --git a/include/parser.h b/include/parser.h
> index 56e549f..d4b32b9 100644
> --- a/include/parser.h
> +++ b/include/parser.h
> @@ -63,6 +63,8 @@ struct mrsh_parser {
>  	void *alias_user_data;
>  };
>
> +typedef struct mrsh_word * (*word_fn)(struct mrsh_parser *, char end);

Nitpick: the convention I've used so far is to use the `_func` prefix
for function types.
Details
Message ID
<Zh2ZvpHXdKekPe69u6rJWuchJnUzUolJhF4IfzIjZIZFviCQd22c1VGgmSbyKI3FkiZChevzpiIF0SlmeLq5ODLWUeP6-FUHK7ie8zV6S8s=@emersion.fr>
In-Reply-To
<U7ts9Yh_Z3XOiaWl6AhxSN67-FxIMtbaYeTlvyyyw5DQAS4ikfnS4TYaWhLzU3L__Qchoi8zRWD-ikFm8bmHtmYtoDUqVmfVKpSKHCpQSn0=@emersion.fr> (view parent)
Download raw message
On Monday, January 28, 2019 9:48 PM, Simon Ser <contact@emersion.fr> wrote:
> Nitpick: the convention I've used so far is to use the`_func` prefix
> for function types.

I mean suffix, not prefix (thanks Drew!).
Cristian Ontivero
Details
Message ID
<CALvFPysnBQn=1ADtJHnXgj-n-pWGBS631smtWH=_o_j7zeAprw@mail.gmail.com>
In-Reply-To
<U7ts9Yh_Z3XOiaWl6AhxSN67-FxIMtbaYeTlvyyyw5DQAS4ikfnS4TYaWhLzU3L__Qchoi8zRWD-ikFm8bmHtmYtoDUqVmfVKpSKHCpQSn0=@emersion.fr> (view parent)
Download raw message
> Is there anything that prevents this from being merged?

Not really, as far as I could test, it all works, but I wanted to get
your opinion on it first.

> Nitpick: the convention I've used so far is to use the `_func` prefix
> for function types.

My bad, I realized after sending the patch. But now that you mention it, I had
something I wanted to bring up for discussion: the suffix currently used is
`func_t`, but POSIX reserves the `_t` suffix (for a discussion, see
for instance:
https://stackoverflow.com/a/231807/3599459 ). Ultimately, I'll follow whatever
naming convention you see fit, but I wanted to know if you had some particular
reason for disregarding that.
Details
Message ID
<gFhFagtIrfSiLWCJK75hpXbfwoHTnGGXj_OKB7KaoehbSJa6mkUP32yLWP8t-TIlXUOURW1OM_SusCXClleyaoXAzDsSxe87s9OGMsSSQ-0=@emersion.fr>
In-Reply-To
<CALvFPysnBQn=1ADtJHnXgj-n-pWGBS631smtWH=_o_j7zeAprw@mail.gmail.com> (view parent)
Download raw message
On Monday, January 28, 2019 10:24 PM, Cristian Ontivero <cristianontivero@gmail.com> wrote:
> > Is there anything that prevents this from being merged?
>
> Not really, as far as I could test, it all works, but I wanted to get
> your opinion on it first.

Pushed, with a minor fix:

To git.sr.ht:~emersion/mrsh
   a0141f7..7b4f77f  master -> master

Thanks!

> > Nitpick: the convention I've used so far is to use the `_func` prefix
> > for function types.
>
> My bad, I realized after sending the patch. But now that you mention it, I had
> something I wanted to bring up for discussion: the suffix currently used is
> `func_t`, but POSIX reserves the `_t` suffix (for a discussion, see
> for instance:
> https://stackoverflow.com/a/231807/3599459 ). Ultimately, I'll follow whatever
> naming convention you see fit, but I wanted to know if you had some particular
> reason for disregarding that.

Oh, I wasn't aware of this. Hmm, that's a little annoying. I like adding _t
suffixes to make the difference between a type name and a function name or a
variable name. I've been doing this because many, many other libraries do so
too (wayland, pixman, etc). But I guess "everybody cheats" is not a good
enough excuse!

I'd be okay with merging a patch that s/_func_t/_func/ though. I don't typedef
other types like structs, so these should be the only problematic cases.