~sircmpwn/hare-dev

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch
28 5

[RFC PATCH] hare.ini proof-of-concept

Details
Message ID
<20230929094553.13966-1-sir@cmpwn.com>
DKIM signature
missing
Download raw message
Patch: +164 -8
---
This is a basic proof of concept for how simple hare.ini support might
start shaping up. This is sufficient to automatically link to system
libraries when importing bindings, which is one of the use-cases for
which hare.ini was proposed:

	$ cat pixman/hare.ini 
	[link]
	libs=$(pkg-config --libs --static pixman-1)
	$ cat main.ha 
	use fmt;
	use pixman;
	use types::c;

	export fn main() void = {
		const ver = pixman::version_string();
		const ver = c::tostr(ver)!;
		fmt::printfln("pixman version {}", ver)!;
	};
	$ hare run main.ha 
	pixman version 0.42.2

 cmd/hare/build.ha       | 46 +++++++++++++++++++--
 cmd/hare/build/ini.ha   | 89 +++++++++++++++++++++++++++++++++++++++++
 cmd/hare/build/types.ha |  2 +-
 cmd/hare/build/util.ha  |  1 -
 hare/module/srcs.ha     | 16 ++++++++
 hare/module/types.ha    |  6 +++
 scripts/gen-stdlib      |  3 +-
 sort/search.ha          |  3 ++
 stdlib.mk               |  6 ++-
 9 files changed, 164 insertions(+), 8 deletions(-)
 create mode 100644 cmd/hare/build/ini.ha

diff --git a/cmd/hare/build.ha b/cmd/hare/build.ha
index fa55878d..a4b51dfe 100644
--- a/cmd/hare/build.ha
+++ b/cmd/hare/build.ha
@@ -76,7 +76,7 @@ fn build(name: str, cmd: *getopt::command) (void | error) = {
		case 'L' =>
			append(ctx.libdirs, opt.1);
		case 'l' =>
			append(ctx.libs, opt.1);
			append(ctx.libs, fmt::asprintf("-l{}", opt.1));
		case 'N' =>
			ast::ident_free(ctx.ns);
			ctx.ns = [];
@@ -126,6 +126,8 @@ fn build(name: str, cmd: *getopt::command) (void | error) = {
		os::exit(os::status::FAILURE);
	};

	const input = if (len(cmd.args) == 0) os::getcwd() else cmd.args[0];

	ctx.arch = arch.qbe_name;
	ctx.cmds = ["",
		os::tryenv("HAREC", "harec"),
@@ -134,14 +136,52 @@ fn build(name: str, cmd: *getopt::command) (void | error) = {
		os::tryenv("LD", arch.ld_cmd),
	];
	set_arch_tags(&ctx.ctx.tags, arch);

	let libc = false;
	if (len(ctx.libs) > 0) {
		libc = true;
		merge_tags(&ctx.ctx.tags, "+libc")?;
		ctx.cmds[build::stage::BIN] = os::tryenv("CC", arch.cc_cmd);
	};

	const input = if (len(cmd.args) == 0) os::getcwd() else cmd.args[0];

	ctx.mods = build::gather(&ctx, os::realpath(input)?)?;
	for (let i = 0z; i < len(ctx.mods); i += 1) {
		const ini = ctx.mods[i].srcs.ini;
		if (ini != "") {
			build::applyconfig(&ctx, &ctx.mods[i], ini)?;
		};
	};

	// If hare.ini added any external libraries, we have to re-gather
	// modules with +libc
	//
	// XXX: This is a bit of a hack. The main issue here is that the list of
	// dependencies might change between +libc and -libc, and if so, maybe
	// we will eliminate all of the hare.ini files which switched on +libc
	// in the first place. Attempting to resolve this recursively is
	// basically impossible so just forget about it and stick with linking
	// to libc if at any point a configuration is produced which would link
	// to libc.
	//
	// Possible solutions for the broader set of hacks:
	//
	// 1. We want to get rid of tag-based file selection and move to
	//    hare.ini for configuring file selection. This problem might be
	//    reduced at that point.
	// 2. We could use a separate pass for finding hare.ini files, before we
	//    gather the rest of the sources, then add +libc before doing the
	//    second step.
	//
	// Both of these solutions would be cleaner, but we might still end up
	// with the problem that the dependency tree changes once +libc is
	// turned on.
	if (len(ctx.libs) > 0 && !libc) {
		merge_tags(&ctx.ctx.tags, "+libc")?;
		ctx.cmds[build::stage::BIN] = os::tryenv("CC", arch.cc_cmd);
		module::free_slice(ctx.mods);
		ctx.mods = build::gather(&ctx, os::realpath(input)?)?;
	};

	append(ctx.hashes, [[void...]...], len(ctx.mods));

	let built = build::execute(&ctx)?;
diff --git a/cmd/hare/build/ini.ha b/cmd/hare/build/ini.ha
new file mode 100644
index 00000000..98cc0d6c
--- /dev/null
+++ b/cmd/hare/build/ini.ha
@@ -0,0 +1,89 @@
use encoding::utf8;
use fmt;
use format::ini;
use fs;
use hare::module;
use io;
use os;
use os::exec;
use sort;
use strings;
use wordexp;

// Applies configuration from a hare.ini file for a given module to the build
// context.
export fn applyconfig(
	ctx: *context,
	mod: *module::module,
	path: str,
) (void | error) = {
	const in = os::open(path)?;
	defer io::close(in)!;
	const scan = ini::scan(in);
	defer ini::finish(&scan);

	for (true) {
		const (sec, key, val) = match (ini::next(&scan)) {
		case let entry: ini::entry =>
			yield entry;
		case io::EOF =>
			break;
		case =>
			return module::invalid_ini;
		};

		const func = match (sort::search(
			conf_sections,
			size((str, *opaque)),
			&sec, &conf_cmp,
		)) {
		case void =>
			continue;
		case let i: size =>
			yield conf_sections[i].1;
		};
		func(ctx, key, val)?;
	};
};

// sorted
const conf_sections = [
	("link", &conf_link),
];

fn conf_link(ctx: *context, key: str, value: str) (void | error) = {
	const func = match (sort::search(
		conf_link_keys,
		size((str, *opaque)),
		&key, &conf_cmp,
	)) {
	case void =>
		return;
	case let i: size =>
		yield conf_link_keys[i].1;
	};
	func(ctx, value)?;
};

// sorted
const conf_link_keys = [
	("libs", &conf_link_libs),
];

fn conf_link_libs(ctx: *context, value: str) (void | error) = {
	match (wordexp::wordexp(value, wordexp::flag::SHOWERR)) {
	case let err: (io::error | exec::error) =>
		return err;
	case (utf8::invalid | wordexp::we_error) =>
		return module::invalid_ini;
	case let words: []str =>
		append(ctx.libs, words...);
		free(words);
	};
};

fn conf_cmp(a: const *opaque, b: const *opaque) int = {
	const key = a: const *str;
	const val = b: const *(str, *opaque);
	return strings::compare(*key, val.0);
};
diff --git a/cmd/hare/build/types.ha b/cmd/hare/build/types.ha
index 34461b6a..b436d436 100644
--- a/cmd/hare/build/types.ha
+++ b/cmd/hare/build/types.ha
@@ -94,7 +94,7 @@ export fn ctx_finish(ctx: *context) void = {
	};
	free(ctx.defines);
	free(ctx.libdirs);
	free(ctx.libs);
	strings::freeall(ctx.libs);
	ast::ident_free(ctx.ns);
	free(ctx.version);
	module::free_slice(ctx.mods);
diff --git a/cmd/hare/build/util.ha b/cmd/hare/build/util.ha
index f421699c..673d1e3b 100644
--- a/cmd/hare/build/util.ha
+++ b/cmd/hare/build/util.ha
@@ -239,7 +239,6 @@ fn get_args(ctx: *context, tmp: str, flags: []str, t: *task) []str = {
			append(args, strings::dup("-Wl,--no-gc-sections"));
		};
		for (let i = 0z; i < len(ctx.libs); i += 1) {
			append(args, strings::dup("-l"));
			append(args, strings::dup(ctx.libs[i]));
		};
		yield []: []str;
diff --git a/hare/module/srcs.ha b/hare/module/srcs.ha
index 07e093a1..ed2bd52f 100644
--- a/hare/module/srcs.ha
+++ b/hare/module/srcs.ha
@@ -27,6 +27,8 @@ export type srcset = struct {
	// source files. These are sorted alphabetically, and are the set of
	// tags that should be used to find this module in the cache.
	seentags: []str,
	// hare.ini, if present, or ""
	ini: str,
	// hare source files (.ha)
	ha: []str,
	// assembly source files (.s)
@@ -45,6 +47,7 @@ export fn finish_srcset(srcs: *srcset) void = {
	strings::freeall(srcs.s);
	strings::freeall(srcs.o);
	strings::freeall(srcs.sc);
	free(srcs.ini);
};

// Find the on-disk path and set of source files for a given module. The path is
@@ -117,6 +120,9 @@ fn path_find(ctx: *context, buf: *path::buffer) (srcset | error) = {
			yield &res.o;
		case "sc" =>
			yield &res.sc;
		case "ini" =>
			res.ini = srcs[i].3[0];
			continue;
		case => abort();
		};
		append(out, srcs[i].3[0]);
@@ -203,6 +209,16 @@ fn _findsrcs(
	};

	if (fs::isfile(stat.mode)) {
		if (path::peek(buf) as str == "hare.ini") {
			append(srcs, (
				strings::dup("hare"),
				strings::dup("ini"),
				0,
				alloc([strings::dup(path::string(buf))]),
			));
			return;
		};

		let ext = match (path::pop_ext(buf)) {
		case void =>
			return;
diff --git a/hare/module/types.ha b/hare/module/types.ha
index c7e3bbbd..17fc9d66 100644
--- a/hare/module/types.ha
+++ b/hare/module/types.ha
@@ -17,6 +17,9 @@ export type tag_has_dot = !void;
// Generic badly formatted tag error.
export type tag_bad_format = !void;

// An invalid hare.ini file is present
export type invalid_ini = !void;

// A dependency cycle error.
export type dep_cycle = ![]str;

@@ -42,6 +45,7 @@ export type error = !(
	tag_has_dot |
	tag_bad_format |
	errcontext |
	invalid_ini |
);

// A container struct for context, used by [[gather]].
@@ -124,6 +128,8 @@ fn _strerror(e: error, buf: *memio::stream) void = {
		memio::concat(buf, ctx.0, ": ")!;
		_strerror(*ctx.1, buf);
		return;
	case invalid_ini =>
		yield "Invalid hare.ini file";
	};
	memio::concat(buf, s)!;
};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
index 2382b56c..79f1aa0c 100755
--- a/scripts/gen-stdlib
+++ b/scripts/gen-stdlib
@@ -210,12 +210,13 @@ bytes() {
cmd_hare_build() {
	gen_srcs cmd::hare::build \
		gather.ha \
		ini.ha \
		queue.ha \
		types.ha \
		util.ha
	gen_ssa cmd::hare::build encoding::hex crypto::sha256 errors fmt fs \
		hare::ast hare::module hare::unparse hash io memio os os::exec path \
		sort strings shlex unix::tty
		sort strings shlex unix::tty format::ini wordexp
}

crypto() {
diff --git a/sort/search.ha b/sort/search.ha
index 5ae78ffd..c31f97a4 100644
--- a/sort/search.ha
+++ b/sort/search.ha
@@ -3,6 +3,9 @@

// Performs a binary search over a sorted slice. If the key is found, index of
// the matching item in the slice is returned. Otherwise, void is returned.
//
// The compare function is called with the key as the first argument, and the
// value to test against in the second argument.
export fn search(
	in: []const opaque,
	sz: size,
diff --git a/stdlib.mk b/stdlib.mk
index f5ddf29b..d864c663 100644
--- a/stdlib.mk
+++ b/stdlib.mk
@@ -968,11 +968,12 @@ $(HARECACHE)/bytes/bytes-any.ssa: $(stdlib_bytes_any_srcs) $(stdlib_rt) $(stdlib
# cmd::hare::build (+any)
stdlib_cmd_hare_build_any_srcs = \
	$(STDLIB)/cmd/hare/build/gather.ha \
	$(STDLIB)/cmd/hare/build/ini.ha \
	$(STDLIB)/cmd/hare/build/queue.ha \
	$(STDLIB)/cmd/hare/build/types.ha \
	$(STDLIB)/cmd/hare/build/util.ha

$(HARECACHE)/cmd/hare/build/cmd_hare_build-any.ssa: $(stdlib_cmd_hare_build_any_srcs) $(stdlib_rt) $(stdlib_encoding_hex_$(PLATFORM)) $(stdlib_crypto_sha256_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_fmt_$(PLATFORM)) $(stdlib_fs_$(PLATFORM)) $(stdlib_hare_ast_$(PLATFORM)) $(stdlib_hare_module_$(PLATFORM)) $(stdlib_hare_unparse_$(PLATFORM)) $(stdlib_hash_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_memio_$(PLATFORM)) $(stdlib_os_$(PLATFORM)) $(stdlib_os_exec_$(PLATFORM)) $(stdlib_path_$(PLATFORM)) $(stdlib_sort_$(PLATFORM)) $(stdlib_strings_$(PLATFORM)) $(stdlib_shlex_$(PLATFORM)) $(stdlib_unix_tty_$(PLATFORM))
$(HARECACHE)/cmd/hare/build/cmd_hare_build-any.ssa: $(stdlib_cmd_hare_build_any_srcs) $(stdlib_rt) $(stdlib_encoding_hex_$(PLATFORM)) $(stdlib_crypto_sha256_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_fmt_$(PLATFORM)) $(stdlib_fs_$(PLATFORM)) $(stdlib_hare_ast_$(PLATFORM)) $(stdlib_hare_module_$(PLATFORM)) $(stdlib_hare_unparse_$(PLATFORM)) $(stdlib_hash_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_memio_$(PLATFORM)) $(stdlib_os_$(PLATFORM)) $(stdlib_os_exec_$(PLATFORM)) $(stdlib_path_$(PLATFORM)) $(stdlib_sort_$(PLATFORM)) $(stdlib_strings_$(PLATFORM)) $(stdlib_shlex_$(PLATFORM)) $(stdlib_unix_tty_$(PLATFORM)) $(stdlib_format_ini_$(PLATFORM)) $(stdlib_wordexp_$(PLATFORM))
	@printf 'HAREC \t$@\n'
	@mkdir -p $(HARECACHE)/cmd/hare/build
	@$(stdlib_env) $(HAREC) $(HARECFLAGS) -o $@ -Ncmd::hare::build \
@@ -3452,11 +3453,12 @@ $(TESTCACHE)/bytes/bytes-any.ssa: $(testlib_bytes_any_srcs) $(testlib_rt) $(test
# cmd::hare::build (+any)
testlib_cmd_hare_build_any_srcs = \
	$(STDLIB)/cmd/hare/build/gather.ha \
	$(STDLIB)/cmd/hare/build/ini.ha \
	$(STDLIB)/cmd/hare/build/queue.ha \
	$(STDLIB)/cmd/hare/build/types.ha \
	$(STDLIB)/cmd/hare/build/util.ha

$(TESTCACHE)/cmd/hare/build/cmd_hare_build-any.ssa: $(testlib_cmd_hare_build_any_srcs) $(testlib_rt) $(testlib_encoding_hex_$(PLATFORM)) $(testlib_crypto_sha256_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_fs_$(PLATFORM)) $(testlib_hare_ast_$(PLATFORM)) $(testlib_hare_module_$(PLATFORM)) $(testlib_hare_unparse_$(PLATFORM)) $(testlib_hash_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_memio_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_os_exec_$(PLATFORM)) $(testlib_path_$(PLATFORM)) $(testlib_sort_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_shlex_$(PLATFORM)) $(testlib_unix_tty_$(PLATFORM))
$(TESTCACHE)/cmd/hare/build/cmd_hare_build-any.ssa: $(testlib_cmd_hare_build_any_srcs) $(testlib_rt) $(testlib_encoding_hex_$(PLATFORM)) $(testlib_crypto_sha256_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_fs_$(PLATFORM)) $(testlib_hare_ast_$(PLATFORM)) $(testlib_hare_module_$(PLATFORM)) $(testlib_hare_unparse_$(PLATFORM)) $(testlib_hash_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_memio_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_os_exec_$(PLATFORM)) $(testlib_path_$(PLATFORM)) $(testlib_sort_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_shlex_$(PLATFORM)) $(testlib_unix_tty_$(PLATFORM)) $(testlib_format_ini_$(PLATFORM)) $(testlib_wordexp_$(PLATFORM))
	@printf 'HAREC \t$@\n'
	@mkdir -p $(TESTCACHE)/cmd/hare/build
	@$(testlib_env) $(HAREC) $(TESTHARECFLAGS) -o $@ -Ncmd::hare::build \
-- 
2.42.0
Details
Message ID
<CVVF401ZB4NP.1R9DVBTA6UJV7@willowbarraco.fr>
In-Reply-To
<20230929094553.13966-1-sir@cmpwn.com> (view parent)
DKIM signature
missing
Download raw message
On Fri Sep 29, 2023 at 11:44 AM CEST, Drew DeVault wrote:
> ---
> This is a basic proof of concept for how simple hare.ini support might
> start shaping up. This is sufficient to automatically link to system
> libraries when importing bindings, which is one of the use-cases for
> which hare.ini was proposed:
>
> 	$ cat pixman/hare.ini 
> 	[link]
> 	libs=$(pkg-config --libs --static pixman-1)
> 	$ cat main.ha 
> 	use fmt;
> 	use pixman;
> 	use types::c;
>
> 	export fn main() void = {
> 		const ver = pixman::version_string();

Is there some pixman/*.ha code with @symbol? How this method is defined?
Details
Message ID
<CVVF4WY2SO59.1OYCHCED21N44@taiga>
In-Reply-To
<CVVF401ZB4NP.1R9DVBTA6UJV7@willowbarraco.fr> (view parent)
DKIM signature
missing
Download raw message
On Fri Sep 29, 2023 at 2:59 PM CEST, Willow Barraco wrote:
> Is there some pixman/*.ha code with @symbol? How this method is defined?

Yes
Details
Message ID
<CVVQAGERU0WI.2TAU9K58TYCNE@d2evs.net>
In-Reply-To
<20230929094553.13966-1-sir@cmpwn.com> (view parent)
DKIM signature
missing
Download raw message
i like the idea of hare.ini as a whole, but i think this needs more work

i think we should also have a global tags= directive rather than
implicitly adding +libc when libs are specified:

tags=+libc

[link]
libcmd=pkg-config foo

which'll allow us to do [link+tag1+tag2] without having weirdness
because of the possibility of eg. [link-libc]

this should be processed in hare::module, allowing us to avoid rerunning
dependency resolution in cmd/hare. [foo+bar+baz] should also be handled
in hare::module, but i don't have strong opinions on whether [link]
should be handled either in hare::module (adding a link: str field to
modules) or in cmd/hare

i think we should also remove -l, since hare.ini obsoletes it, and we
should also get rid of the special case for README in favor of requiring
a hare.ini when there're no hare sources

lastly, i think we can avoid wordexp:

tags=+libc

[link]
# or maybe lib=-lfoo? open to either. note that this wouldn't get
# shlexed, in order to require the use of multiple lib= directives for
# multiple libs. i'd also be fine having a libs= which requires -l and
# does shlex stuff, but i kinda prefer lib= for consistency with libcmd=
# (which does need to be singular because that's how we avoid
# reimplementing a shell parser)
lib=foo

# but the output of this command would be
libcmd=pkg-config bar

# idk if we actually want this, but it's an option
libvar=BAZ

i really really would rather not have a wordexp in the stdlib, and this
design avoids creating our own special alternate format while still
allowing everything we want to be possible here. it should also be
pretty easy to implement

On Fri Sep 29, 2023 at 9:44 AM UTC, Drew DeVault wrote:
> diff --git a/cmd/hare/build/ini.ha b/cmd/hare/build/ini.ha
> @@ -0,0 +1,89 @@
-%<-
> +	for (true) {
> +		const (sec, key, val) = match (ini::next(&scan)) {
> +		case let entry: ini::entry =>
> +			yield entry;
> +		case io::EOF =>
> +			break;
> +		case =>
> +			return module::invalid_ini;
> +		};
> +
> +		const func = match (sort::search(
> +			conf_sections,
> +			size((str, *opaque)),
> +			&sec, &conf_cmp,
> +		)) {
> +		case void =>
> +			continue;
> +		case let i: size =>
> +			yield conf_sections[i].1;
> +		};
> +		func(ctx, key, val)?;
> +	};

i don't think this is warranted:

switch (sec) {
case "link" => conf_link(ctx, key, val)?;
case =>
	return sec: unrecognized_section;
};

ditto for conf_link itself. i'd kinda prefer to throw errors on
unrecognized sections, but i'd also be fine ignoring them like you did
Details
Message ID
<CVVQF6WBG87H.3QRIA68ISADPB@notmylaptop>
In-Reply-To
<CVVQAGERU0WI.2TAU9K58TYCNE@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
On Fri Sep 29, 2023 at 5:44 PM EDT, Ember Sawady wrote:
> lastly, i think we can avoid wordexp:
>
> tags=+libc
>
> [link]
> # or maybe lib=-lfoo? open to either. note that this wouldn't get
> # shlexed, in order to require the use of multiple lib= directives for
> # multiple libs. i'd also be fine having a libs= which requires -l and
> # does shlex stuff, but i kinda prefer lib= for consistency with libcmd=
> # (which does need to be singular because that's how we avoid
> # reimplementing a shell parser)
> lib=foo
>
> # but the output of this command would be
> libcmd=pkg-config bar
>
> # idk if we actually want this, but it's an option
> libvar=BAZ
>
> i really really would rather not have a wordexp in the stdlib, and this
> design avoids creating our own special alternate format while still
> allowing everything we want to be possible here. it should also be
> pretty easy to implement

Very strong +1 to this
Details
Message ID
<CWLO9ZVOYDQK.VV8BKWBQOXPU@taiga>
In-Reply-To
<CVVQAGERU0WI.2TAU9K58TYCNE@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
On Fri Sep 29, 2023 at 11:44 PM CEST, Ember Sawady wrote:
> i really really would rather not have a wordexp in the stdlib, and this
> design avoids creating our own special alternate format while still
> allowing everything we want to be possible here. it should also be
> pretty easy to implement

I really do think that wordexp is better here, but if there's a lot of
resistance I'm not going to push the matter.

I think a lot of the grief around wordexp is neatly solved by defining
it as using the system shell, rather than being POSIX compatible, which
honestly gives a better user experience on future systems like Plan 9
and Gaia.
Details
Message ID
<CWLRMBI69PZW.3ACHNS7JHMWCF@torresjrjr.com>
In-Reply-To
<CVVQAGERU0WI.2TAU9K58TYCNE@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
On Fri Sep 29, 2023 at 10:44 PM BST, Ember Sawady wrote:
> lastly, i think we can avoid wordexp:
>
> tags=+libc
>
> [link]
> # or maybe lib=-lfoo? open to either. note that this wouldn't get
> # shlexed, in order to require the use of multiple lib= directives for
> # multiple libs. i'd also be fine having a libs= which requires -l and
> # does shlex stuff, but i kinda prefer lib= for consistency with libcmd=
> # (which does need to be singular because that's how we avoid
> # reimplementing a shell parser)
> lib=foo
>
> # but the output of this command would be
> libcmd=pkg-config bar
>
> # idk if we actually want this, but it's an option
> libvar=BAZ
>
> i really really would rather not have a wordexp in the stdlib, and this
> design avoids creating our own special alternate format while still
> allowing everything we want to be possible here. it should also be
> pretty easy to implement

I propose (as did on IRC):

	[link]
	libs=-lfoo -lbar

	# environment variable $BAZ
	libs:env=BAZ

	# system shell -c 'pkg-config qux'
	libs:cmd=pkg-config qux

The ':' helps suggest more strongly that these three INI-keys are
related and gather internally into one "libs" key.
Details
Message ID
<CWLYMJ7A08IJ.3230HW9BKQVOT@d2evs.net>
In-Reply-To
<CWLRMBI69PZW.3ACHNS7JHMWCF@torresjrjr.com> (view parent)
DKIM signature
missing
Download raw message
On Mon Oct 30, 2023 at 12:16 PM UTC, Byron Torres wrote:
> I propose (as did on IRC):
>
> 	[link]
> 	libs=-lfoo -lbar
>
> 	# environment variable $BAZ
> 	libs:env=BAZ
>
> 	# system shell -c 'pkg-config qux'
> 	libs:cmd=pkg-config qux
>
> The ':' helps suggest more strongly that these three INI-keys are
> related and gather internally into one "libs" key.

echoing what i said on irc, i quite like this, and i think it's better
than what i initially proposed. i'm not sure libs:env is necessary,
since i expect that it won't be used all that much, and it can be
emulated with `libs:cmd=printf %s "$BAZ"`, but if other people prefer to
have it, i'm not opposed
Details
Message ID
<CWMEQ87NSH4J.287V5ERIRS5BH@taiga>
In-Reply-To
<CWLYMJ7A08IJ.3230HW9BKQVOT@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
On Mon Oct 30, 2023 at 6:45 PM CET, Ember Sawady wrote:
> echoing what i said on irc, i quite like this, and i think it's better
> than what i initially proposed. i'm not sure libs:env is necessary,
> since i expect that it won't be used all that much, and it can be
> emulated with `libs:cmd=printf %s "$BAZ"`, but if other people prefer to
> have it, i'm not opposed

So who's going to expand $BAZ here? Are we running it through sh -c? Are
we splitting the output on spaces?
Details
Message ID
<CWRY142YVUEW.1WFFVAKXLLETR@d2evs.net>
In-Reply-To
<CWMEQ87NSH4J.287V5ERIRS5BH@taiga> (view parent)
DKIM signature
missing
Download raw message
On Tue Oct 31, 2023 at 6:23 AM UTC, Drew DeVault wrote:
> On Mon Oct 30, 2023 at 6:45 PM CET, Ember Sawady wrote:
> > echoing what i said on irc, i quite like this, and i think it's better
> > than what i initially proposed. i'm not sure libs:env is necessary,
> > since i expect that it won't be used all that much, and it can be
> > emulated with `libs:cmd=printf %s "$BAZ"`, but if other people prefer to
> > have it, i'm not opposed
>
> So who's going to expand $BAZ here? Are we running it through sh -c? Are
> we splitting the output on spaces?

hm, good point. i think running it through sh -c makes the most sense,
maybe it should be libs:sh in order to make it clearer that that's what
it's doing? the alternative would be to just shlex and exec it
ourselves, but tbh i'm less of a fan of that
Details
Message ID
<CWSFKET2AN8P.246JWZWHQIQ8G@taiga>
In-Reply-To
<CWRY142YVUEW.1WFFVAKXLLETR@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
Again, I think wordexp is the obvious solution here.
Details
Message ID
<CWSGEXNO1W92.3U43CEMS3T8W8@notmylaptop>
In-Reply-To
<CWSFKET2AN8P.246JWZWHQIQ8G@taiga> (view parent)
DKIM signature
missing
Download raw message
wordexp doesn't really help here though, at least not with libs:cmd (or
libs:sh I guess). I think it makes sense to just use the shell for this,
but since the command is actually being executed by the shell and not
just split, there's not really any reason to use wordexp on its own.

I think splitting libs up strikes a good balance: it's possible to run
commands in the system shell if necessary, but the shell isn't used
universally. IMO the shell is the right tool for the job when the job
is, executing commands. Outside of that context, the word splitting
semantics of the shell just aren't desirable, since there's other better
options. I say the other options are better because it's still not
possible to correctly implement wordexp, assuming wordexp just uses the
system shell (which I agree is the best approach). The best that can be
done are janky hacks like in musl.
Details
Message ID
<CWSRZACZRGAK.XIVY6YVRNH2@d2evs.net>
In-Reply-To
<CWSGEXNO1W92.3U43CEMS3T8W8@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
the other nice thing about libs:sh is that it could be generalized to
all keys, rather than just being for libs

one other thing that occurs to me: libs (without :sh) does still want to
be split, likely with shlex, since this:

libs=-lfoo -lbar

wants to be passed as two args to ld. the output of the shell command
would also need to be shlexed, so this would probably just be handled in
the logic for [link]libs - *:sh could be converted to s/:sh// elsewhere
in the code

so, the full algorithm is:
- for each key in the hare.ini file
  - if the key ends in ":sh", get rid of the :sh on the key and replace
    the value with sh -c "$value". this could also be :cmd, don't have
    strong opinions
  - otherwise, if the key ends in ":env" and we decide that doing this
    is a good idea, get rid of the :env and replace the value with
    ${value}
  - look up the key + section and run the relevant function for it (or
    error out on unrecognized keys)
    - for [link]libs, shlex the value and append the results to the link
      array on the relevant module
    - for tags, call merge_tags(&ctx.ctx.tags, value)
    - probably other things, but these are the two keys i can think of
      off the top of my head
Details
Message ID
<CWSUMT4T1F4R.2QP1E93SPJZNC@notmylaptop>
In-Reply-To
<CWSRZACZRGAK.XIVY6YVRNH2@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
On Tue Nov 7, 2023 at 1:02 PM EST, Ember Sawady wrote:
> one other thing that occurs to me: libs (without :sh) does still want to
> be split, likely with shlex, since this:
>
> libs=-lfoo -lbar
>
> wants to be passed as two args to ld. the output of the shell command
> would also need to be shlexed, so this would probably just be handled in
> the logic for [link]libs - *:sh could be converted to s/:sh// elsewhere
> in the code

Is shlex really necessary here? I feel like it makes more sense to just
split based on whitespace, since I don't think quoting or escaping will
ever be needed here.
Details
Message ID
<CWSUX7TQK7J5.2K1UPDEC8X77B@d2evs.net>
In-Reply-To
<CWSUMT4T1F4R.2QP1E93SPJZNC@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Tue Nov 7, 2023 at 8:06 PM UTC, Sebastian wrote:
> On Tue Nov 7, 2023 at 1:02 PM EST, Ember Sawady wrote:
> > one other thing that occurs to me: libs (without :sh) does still want to
> > be split, likely with shlex, since this:
> >
> > libs=-lfoo -lbar
> >
> > wants to be passed as two args to ld. the output of the shell command
> > would also need to be shlexed, so this would probably just be handled in
> > the logic for [link]libs - *:sh could be converted to s/:sh// elsewhere
> > in the code
>
> Is shlex really necessary here? I feel like it makes more sense to just
> split based on whitespace, since I don't think quoting or escaping will
> ever be needed here.

fair enough. i'm fine with either splitting on spaces or doing a shlex
Details
Message ID
<CWT9HJHM9C9M.10C6L31JIBUPC@taiga>
In-Reply-To
<CWSUX7TQK7J5.2K1UPDEC8X77B@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
It sounds like a lot of hacks to avoid what wordexp is designed to do
for fairly arbitrary reasons. IMO wordexp is a spot-on tool for this
use-case.
Details
Message ID
<CWT9TM5YNT4N.27DBJKR3FURXU@notmylaptop>
In-Reply-To
<CWT9HJHM9C9M.10C6L31JIBUPC@taiga> (view parent)
DKIM signature
missing
Download raw message
How are they hacks?
Details
Message ID
<CWT9V766ZJSX.3AA0EQBMWEQEI@taiga>
In-Reply-To
<CWT9TM5YNT4N.27DBJKR3FURXU@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
libs:sh=echo "foo bar" baz

This outputs `foo bar baz` and the quoting is lost. It gets run through
shlex and the quotes are lost, so it's split into [foo, bar, baz]. If
you want it to be [foo bar, baz] you'd need the less intuitive

libs:sh=echo '"foo bar"' baz

Footguns abound.

Or we could use wordexp, which just does the right thing.
Details
Message ID
<CWT9Z7I9GTGC.32463BSBJGH5O@notmylaptop>
In-Reply-To
<CWT9V766ZJSX.3AA0EQBMWEQEI@taiga> (view parent)
DKIM signature
missing
Download raw message
For the use cases in hare.ini I can't think of any time that this would
matter though. Are spaces ever needed for libs?

I'm not sure shlex should be used at all here tbh. If we can make the
assumption that spaces are always used as field separators then there's
not much gained from any approach besides splitting whitespace.
Details
Message ID
<CWTA1VL5L1LY.3P3U92I370GKB@taiga>
In-Reply-To
<CWT9Z7I9GTGC.32463BSBJGH5O@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Wed Nov 8, 2023 at 9:08 AM CET, Sebastian wrote:
> For the use cases in hare.ini I can't think of any time that this would
> matter though. Are spaces ever needed for libs?

Yes, if they were installed in a prefix with spaces in the path. Imagine
you have a sysroot at ~/My Projects/sysroots/whatever, for example.
pkg-config should output something like

"-L~/My Projects/sysroots/whatever/usr/lib" -lwhatever
Details
Message ID
<CWTA8ZAF9AI0.3577LVAO1GQF5@notmylaptop>
In-Reply-To
<CWTA1VL5L1LY.3P3U92I370GKB@taiga> (view parent)
DKIM signature
missing
Download raw message
Ah, true, fair enough. I'm still not a fan of wordexp (and would still
like to hear alternative solutions tbh) but I see how it solves problems
like this.

With regard to wordexp: I'd be much more open to it if I could see it
being implemented in Hare in a way that isn't a complete hack. And maybe
that isn't possible; maybe musl's implementation is really the best that
can be done. But that's really my main concern about wordexp in general.

If we put wordexp in the stdlib I also think we should deviate from
POSIX and not support flags like NOCMD, since it provides a false sense
of security, and using wordexp to read untrusted user input is always a
bug.
Details
Message ID
<CWTAIDOKKMXV.2GTD18NX5LMAE@d2evs.net>
In-Reply-To
<CWTA8ZAF9AI0.3577LVAO1GQF5@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
if we do end up going with wordexp, i'd really prefer for it to be
purely internal to the code that handles hare.ini, since it's not a good
interface to encourage using
Details
Message ID
<CWTAJKYS7HSW.3J2M9MBV43FFY@taiga>
In-Reply-To
<CWTAIDOKKMXV.2GTD18NX5LMAE@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
Can you guys explain why wordexp bothers you so?
Details
Message ID
<CWTAPB3F8MGN.3J6X8J1ONV126@notmylaptop>
In-Reply-To
<CWTAJKYS7HSW.3J2M9MBV43FFY@taiga> (view parent)
DKIM signature
missing
Download raw message
I'd be fine with wordexp being a user-facing stdlib module if we end up
using it for hare.ini. My main problems with wordexp as an interface are
actually problems with POSIX sh, but these issues don't matter as much
if wordexp just uses the system shell.

I think that most of the time wordexp isn't the right tool for the job,
since shell expansion is often just a bad idea, and a footgun in the
case of poorly-designed shells like POSIX sh. But I suppose it sometimes
can be useful, so I'm not blanketly opposed to exporting it.

If we don't end up using wordexp for hare.ini (which, again, I'd still
like to see alternatives, just because I dislike shell syntax and its
associated footguns), then I don't think it should be in the stdlib.
Details
Message ID
<CWTC0XO9B35I.2YHXRL55A2B20@taiga>
In-Reply-To
<CWTAPB3F8MGN.3J6X8J1ONV126@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Wed Nov 8, 2023 at 9:42 AM CET, Sebastian wrote:
> I think that most of the time wordexp isn't the right tool for the job,
> since shell expansion is often just a bad idea, and a footgun in the
> case of poorly-designed shells like POSIX sh. But I suppose it sometimes
> can be useful, so I'm not blanketly opposed to exporting it.

To put more words to why I think wordexp is appropriate here... Unix is
driven by commands, and commands are driven by the shell. The Hare build
system, and build systems generally, are tools for orchestrating Unix
commands and applying the Unix philosophy; many small single-purpose
tools (compilers, linkers, pkg-config, etc) tied together to build a
program. Each of these tools provides little places to poke at their
behavior, and it's desirable to let users poke at them to get their
builds done. The best tool on Unix for tying together a bunch of
programs and fucking with their command lines and the environment and so
on is the shell. It provides access to every knob you'd conceivably
want or need to turn. Thus, wordexp.

POSIX shell syntax & semantics leaves some things to be desired, but the
pinning of wordexp semantics to the system shell makes it the intuitive
choice for integrating with the way the system in question manages
commands, which is going to match how your makefiles and support scripts
and everything else works and fit neatly into the best tooling for the
job on a given system. This approach to wordexp also lets us avoid
implementing a fucking POSIX shell in the stdlib, keeps the
implementation to a couple hundred lines, and future-proofs us for the
operating systems which have a less shit shell.

We can either embrace the warts of the shell (POSIX, as it might be), or
introduce new warts of our own. I feel like the alternatives proposed
for hare.ini are warty, namely in the conspicious absence of shell
semantics and/or hacky workarounds to re-introduce the parts of shell
that we want to have (generally producing an unappealing result IMHO).

> If we don't end up using wordexp for hare.ini (which, again, I'd still
> like to see alternatives, just because I dislike shell syntax and its
> associated footguns), then I don't think it should be in the stdlib.

fwiw wordexp has been on the stdlib roadmap as a blocker for Hare 1.0
since the day the roadmap was written. I have always worked under the
assumption that it was desirable for Hare.

https://harelang.org/roadmap/
Details
Message ID
<CWZ2PGKQW9EW.CEEF7JJZO2EF@d2evs.net>
In-Reply-To
<CWTC0XO9B35I.2YHXRL55A2B20@taiga> (view parent)
DKIM signature
missing
Download raw message
the issue i have with wordexp is that it's trying to build an idiomatic
hare interface to the shell over the sh(1p) cli, which fundamentally
doesn't provide a comprehensive enough interface to be able to reliably
and maintainably implement wordexp

"pipe this through sh -c" + "use this literal text" isn't quite as
elegant to the end-user, sure, but i don't want to have to maintain a
wordexp implementation in the stdlib

(i've also never been a huge fan of having wordexp on the roadmap, and
i'd be perfectly fine updating the roadmap to remove it from there)
Details
Message ID
<CWZL71Y0IZN3.2AG0SK05BXFIS@cmpwn.com>
In-Reply-To
<CWZ2PGKQW9EW.CEEF7JJZO2EF@d2evs.net> (view parent)
DKIM signature
missing
Download raw message
I mean, it's not that bad, right? Have you seen my latest patch for it?
Details
Message ID
<CWZO64Q1M7GO.270GP1KVCVTIF@notmylaptop>
In-Reply-To
<CWZL71Y0IZN3.2AG0SK05BXFIS@cmpwn.com> (view parent)
DKIM signature
missing
Download raw message
On Wed Nov 15, 2023 at 1:11 PM EST, Drew DeVault wrote:
> I mean, it's not that bad, right? Have you seen my latest patch for it?

It's... pretty bad. Not by fault of your patch specifically (though
there are quite a few problems[1] with your patch), but rather that any
sort of wordexp interface implemented without explicit support from the
shell itself (which /bin/sh doesn't provide) is underpowered. There's no
way to differentiate between any kinds of errors returned from the
shell; they're all treated as syntax errors. What happens if the shell
runs out of memory, for instance? POSIX wordexp is fundamentally broken
by design in this regard (and as such, your patch is too, since it is
modelled after POSIX, though this can be slightly remedied by simply
deviating from POSIX, which you're already doing anyway by using the
system shell).

[1]: https://lists.sr.ht/~sircmpwn/hare-dev/<20230927092830.10949-1-sir%40cmpwn.com>#<CVVQ2J02FY3Z.1RSQLYJHN2NVN@notmylaptop>

In addition, the lack of explicit support from the shell makes it very
easy to "break out" of wordexp and run arbitrary shell commands. This is
true even when NOCMD is supplied, as I demonstrated in my feedback to
your patch:

	$((${#-"("})); echo pwned

musl is also vulnerable to this. Fixing this issue (as well as all other
similar exploits) without reimplementing the word expansion semantics of
the shell yourself is, as far as I can tell, impossible.

I guess you could argue that this isn't actually a problem, since
wordexp shouldn't be used for untrusted input anyway. And to an extent I
agree with this. But IMO this is sufficient justification for the
removal of NOCMD entirely, since it provides a false sense of security
by claiming to implement behavior which is impossible-by-design to
actually correctly implement. But if wordexp can't reliably verify that
input is safe, then what difference does wordexp make from just exec'ing
`/bin/sh -c 'printf ...'`? In the context of libc, wordexp(3) sorta acts
as a glorified system(3).

And again, it's not just your wordexp implementation. musl's
implementation of wordexp is also just, broken. ${var} doesn't work when
WRDE_NOCMD is used. Back when I was implementing wordexp and testing the
behavior of glibc vs musl, I looked at what it would take to fix the
bugs in musl's implementation, and came to the conclusion that it's not
worth it since the entire thing would need to be rewritten, and even
then it would still be broken, just slightly less so.

I could go on and on, but the point I'm trying to make is that shelling
out in wordexp is literally not possible to do correctly. If we use this
approach in the stdlib, wordexp will have a *lot* of asterisks and
caveats alongside it. And I believe that, with these caveats, wordexp
doesn't meet the quality standards that I'd like to be the baseline of
the stdlib. extlib *maybe*, but even then I'm not sure it's worth it.
And of course, a wordexp implementation which doesn't defer to the
system shell and reimplements everything in Hare is too complex for the
stdlib, and cements POSIX sh's flaws. So I just can't think of a good
option.

Again, I'm not blanket opposed to wordexp. I just don't think
implementing wordexp in a high quality not-hacky way is possible. If you
can prove me wrong, then I'd be *much* more open to including wordexp in
the stdlib.

I do see the value that wordexp would provide for hare.ini. Should we
decide that, despite all of the design flaws of wordexp, it's still the
best choice for hare.ini, maybe we should just make it internal to
cmd/hare (I previously said the opposite; that it should be in the
stdlib if it's implemented for hare.ini, but I've since changed my mind:
if wordexp can't be implemented Correctly, then 1. there's no point in
having an implementation that isn't dead simple and easily copyable, and
2. we shouldn't encourage its use).
Details
Message ID
<CX3KXJU37EY5.2RTDHL139M8M@taiga>
In-Reply-To
<CWZO64Q1M7GO.270GP1KVCVTIF@notmylaptop> (view parent)
DKIM signature
missing
Download raw message
On Wed Nov 15, 2023 at 9:31 PM CET, Sebastian wrote:
> It's... pretty bad. Not by fault of your patch specifically (though
> there are quite a few problems[1] with your patch), but rather that any
> sort of wordexp interface implemented without explicit support from the
> shell itself (which /bin/sh doesn't provide) is underpowered. There's no
> way to differentiate between any kinds of errors returned from the
> shell; they're all treated as syntax errors. What happens if the shell
> runs out of memory, for instance? POSIX wordexp is fundamentally broken
> by design in this regard (and as such, your patch is too, since it is
> modelled after POSIX, though this can be slightly remedied by simply
> deviating from POSIX, which you're already doing anyway by using the
> system shell).

Setting aside issues in my patch (next version will fix), I don't think
it's that bad. Why is it necessary to distinguish OOM from other cases?
I think it's fine that we just tell the user "shell expansion failed".
If they're using shell expansion in a case where an OOM is a likely
outcome then that's between them and God, not us.

> In addition, the lack of explicit support from the shell makes it very
> easy to "break out" of wordexp and run arbitrary shell commands. This is
> true even when NOCMD is supplied, as I demonstrated in my feedback to
> your patch:
>
> 	$((${#-"("})); echo pwned

I agree, we should just remove NOCMD and tell people that all uses of
this module should be for input which is trusted to run arbitrary
commands. Simplifies the implementation a lot if we do this, too.

> But if wordexp can't reliably verify that input is safe, then what
> difference does wordexp make from just exec'ing `/bin/sh -c 'printf
> ...'`? In the context of libc, wordexp(3) sorta acts as a glorified
> system(3).

Well, as you have illustriously indicated for us, the actual process of
doing word expansion and splitting it up appropriately is somewhat
involved.

> I could go on and on, but the point I'm trying to make is that shelling
> out in wordexp is literally not possible to do correctly. If we use this
> approach in the stdlib, wordexp will have a *lot* of asterisks and
> caveats alongside it. And I believe that, with these caveats, wordexp
> doesn't meet the quality standards that I'd like to be the baseline of
> the stdlib. extlib *maybe*, but even then I'm not sure it's worth it.
> And of course, a wordexp implementation which doesn't defer to the
> system shell and reimplements everything in Hare is too complex for the
> stdlib, and cements POSIX sh's flaws. So I just can't think of a good
> option.

I trimmed down my implementation a lot, abandoned NOCMD and much of the
code associated with "enforcing" it, and the new patch including tests
and docs is +156/-0 and appropriately caveats it against use with
untrusted user input.

Honestly, at that point I think it's just fine. It's a small module that
does something useful that is not really well supported for by other
approaches.
Reply to thread Export thread (mbox)