From 100253bec3f02b7f22fe385ace802e66f7b7f88c Mon Sep 17 00:00:00 2001 From: Joe Cooper Date: Wed, 20 May 2026 15:55:07 -0500 Subject: [PATCH] Fix quirks in trunc and split_quoted_string --- t/web-lib-funcs-paths.t | 14 ++++++------ t/web-lib-funcs-strings.t | 45 +++++++++++++++++---------------------- web-lib-funcs.pl | 19 ++++++++++------- 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/t/web-lib-funcs-paths.t b/t/web-lib-funcs-paths.t index d1d542bd4..a2d9b6837 100644 --- a/t/web-lib-funcs-paths.t +++ b/t/web-lib-funcs-paths.t @@ -124,12 +124,14 @@ subtest 'split_quoted_string' => sub { ['unbalanced', '"quote'], 'unterminated quote is taken as a bare token'); - # Pure-whitespace input drops everything because no branch tolerates - # a leading-whitespace prefix. Surface this as current behaviour — - # arguably a bug, but documenting it here protects us from a silent - # behaviour change. - is_deeply([main::split_quoted_string(' spaces between ')], [], - 'leading whitespace short-circuits the tokenizer (current behaviour)'); + # Leading and trailing whitespace tolerated around tokens. + is_deeply([main::split_quoted_string(' spaces between ')], + ['spaces', 'between'], + 'leading whitespace tolerated, interior whitespace splits tokens'); + is_deeply([main::split_quoted_string("\tfoo\n")], ['foo'], + 'tabs and newlines treated as whitespace too'); + is_deeply([main::split_quoted_string(' ')], [], + 'pure-whitespace input → empty list'); }; # quote_path — OS-dependent shell quoting. diff --git a/t/web-lib-funcs-strings.t b/t/web-lib-funcs-strings.t index 01639045b..6f3adb238 100644 --- a/t/web-lib-funcs-strings.t +++ b/t/web-lib-funcs-strings.t @@ -170,39 +170,32 @@ subtest 'trim' => sub { # trunc — truncate to a "whole word" within a max length. # -# The implementation cuts at maxlen, then pops one char unconditionally -# and continues popping only while the popped char is whitespace; trailing -# whitespace is then trimmed. This pins current behaviour, which has two -# notable edge cases worth flagging: -# -# * `trunc("hello world foo", 11)` returns "hello worl", losing the -# final 'd' even though substr(0, 11) cleanly ends on a word boundary. -# * `trunc("hello world", 5)` returns "hell" rather than "hello". -# -# These pass today; a future fix to trunc will break these and prompt -# re-review. +# Contract: if the cut lands inside a word, back up to the previous +# whitespace; if no whitespace precedes (the first word is itself longer +# than maxlen), return the partial first word rather than empty. subtest 'trunc' => sub { # Early-exit when input already fits. is(main::trunc('short', 99), 'short', 'no-op when input shorter than max'); is(main::trunc('exact5', 6), 'exact5', 'no-op when input equals max'); - # Truncation lands at a partial word — pops the partial word back to - # whitespace, then trims trailing whitespace. - is(main::trunc('a b c', 4), 'a', 'cuts back through partial word'); - # substr(0,8) = "foo bar ", pop one (always), pop "r" — non-ws so stop. - # Result: "foo ba" (last word "baz" partial → chopped one char short). - is(main::trunc('foo bar baz', 8), 'foo ba', - 'partial word loses one extra char (current behaviour)'); + # Cut landed at a word boundary — keep the substring intact. + is(main::trunc('hello world foo', 11), 'hello world', + 'cut at word boundary keeps last whole word'); + is(main::trunc('hello world', 5), 'hello', + 'cut at word boundary returns first whole word'); - # Edge case: substr cleanly ends on a word boundary. Current behaviour - # still pops one char; pin it. - is(main::trunc('hello world foo', 11), 'hello worl', - 'always pops at least one char even at word boundary (current behaviour)'); - is(main::trunc('hello world', 5), 'hell', - 'always pops at least one char (current behaviour)'); + # Cut landed mid-word — back up to the previous whitespace. + is(main::trunc('foo bar baz', 8), 'foo bar', + 'mid-word cut backs up to previous whitespace'); + is(main::trunc('a b c', 4), 'a b', + 'mid-word cut backs up past a one-char word'); - # Truncating to 1 leaves nothing after the mandatory pop. - is(main::trunc('abc', 1), '', 'maxlen=1 returns empty'); + # First word longer than maxlen and no preceding whitespace — fall + # back to the partial word rather than empty. + is(main::trunc('hellothere', 5), 'hello', + 'long first word with no boundary returns partial'); + is(main::trunc('abc', 1), 'a', + 'maxlen=1 returns the first char when no boundary exists'); }; # indexof — first-index lookup with `eq`. diff --git a/web-lib-funcs.pl b/web-lib-funcs.pl index 9cb93fe27..e98b2feee 100755 --- a/web-lib-funcs.pl +++ b/web-lib-funcs.pl @@ -624,11 +624,14 @@ sub trunc if (length($_[0]) <= $_[1]) { return $_[0]; } -my $str = substr($_[0],0,$_[1]); -my $c; -do { - $c = chop($str); - } while($c !~ /\S/); +my $str = substr($_[0], 0, $_[1]); +# If the cut landed inside a word (next char in the original is +# non-whitespace), back the partial word out — but only when there's +# a word boundary inside $str to back up to. If the first word is +# longer than maxlen, return that partial word rather than empty. +if (substr($_[0], $_[1], 1) =~ /\S/ && $str =~ /\s/) { + $str =~ s/\S+$//; + } $str =~ s/\s+$//; return $str; } @@ -12484,9 +12487,9 @@ sub split_quoted_string { my ($str) = @_; my @rv; -while($str =~ /^"([^"]*)"\s*([\000-\377]*)$/ || - $str =~ /^'([^']*)'\s*([\000-\377]*)$/ || - $str =~ /^(\S+)\s*([\000-\377]*)$/) { +while($str =~ /^\s*"([^"]*)"\s*([\000-\377]*)$/ || + $str =~ /^\s*'([^']*)'\s*([\000-\377]*)$/ || + $str =~ /^\s*(\S+)\s*([\000-\377]*)$/) { push(@rv, $1); $str = $2; }