Files
webmin/t/web-lib-funcs-strings.t
2026-05-20 15:55:07 -05:00

229 lines
10 KiB
Perl

#!/usr/bin/perl
# Unit tests for the pure string helpers in web-lib-funcs.pl.
#
# web-lib-funcs.pl is a pure library (no `unless(caller)` guard), so a bare
# `require` loads it without side effects. Helpers covered here touch no
# globals beyond their args, so no stubbing or %gconfig setup is needed.
#
# Assertions pin the current contract — encoded byte shape, round-trip
# identity, structural invariants — not the prettiest possible output.
use strict;
use warnings;
use Test::More;
use File::Basename qw(dirname);
use File::Spec;
my $script = File::Spec->rel2abs(
File::Spec->catfile(dirname(__FILE__), '..', 'web-lib-funcs.pl'));
require $script;
# html_escape
#
# Always escapes &, <, >, ", ', =. The optional $nodblamp flag uses a
# lookahead so existing entity references survive untouched; without it,
# every & becomes &amp;, including already-escaped input.
subtest 'html_escape' => sub {
is(main::html_escape('<b>&"\'='),
'&lt;b&gt;&amp;&quot;&#39;&#61;',
'all six dangerous characters are escaped');
is(main::html_escape(undef), '', 'undef input → empty string');
is(main::html_escape(''), '', 'empty input → empty string');
is(main::html_escape('plain text 123'), 'plain text 123',
'whitespace and alphanumerics pass through');
# Default mode double-escapes existing entities — this is the current
# contract; the nodblamp flag opts into the smarter behaviour.
is(main::html_escape('&amp;'), '&amp;amp;', 'default mode double-escapes &amp;');
is(main::html_escape('&amp;', 1), '&amp;', 'nodblamp preserves existing &amp;');
is(main::html_escape('&#65;', 1), '&#65;', 'nodblamp preserves numeric entity');
# Note: nodblamp's lookahead matches any &<letters>; as an entity, so
# made-up names like &x; are treated as entities and not re-escaped.
is(main::html_escape('&x;', 1), '&x;', 'nodblamp preserves arbitrary &word; shape');
is(main::html_escape('& ', 1), '&amp; ', 'nodblamp escapes lone &');
# Adversarial XSS payload — none of the dangerous chars survive raw.
my $xss = main::html_escape(q{<script>alert("x")</script>'=&});
unlike($xss, qr/[<>"'=]/, 'no raw HTML-significant characters remain');
unlike($xss, qr/&(?!(amp|lt|gt|quot|#39|#61);)/,
'every & starts a known entity');
};
# html_unescape — inverse of html_escape for the canonical entity set.
subtest 'html_unescape' => sub {
is(main::html_unescape('&lt;b&gt;&amp;&quot;&#39;&#61;'),
'<b>&"\'=', 'canonical entity set round-trips');
is(main::html_unescape('a&nbsp;b'), 'a b',
'&nbsp; decodes to a regular space');
is(main::html_unescape(undef), '', 'undef → empty');
is(main::html_unescape(''), '', 'empty → empty');
is(main::html_unescape('no entities here'), 'no entities here',
'plain text passes through unchanged');
};
# html_strip — remove tags, optionally replacing with a sentinel.
subtest 'html_strip' => sub {
is(main::html_strip('<b>hello</b>'), 'hello',
'simple tags removed');
is(main::html_strip('<a href="x">y</a>'), 'y',
'attribute-bearing tag removed');
# Quoted attribute that contains a >, which would otherwise break a
# naive regexp — the implementation accounts for this.
is(main::html_strip('<a href=">">y</a>'), 'y',
'quoted > inside attribute does not end tag early');
is(main::html_strip('plain text'), 'plain text',
'plain text untouched');
is(main::html_strip('<b>x</b>', '|'), '|x|',
'replacement string substituted for each tag');
};
# quote_escape — only ' and " (and lone &) are escaped; existing entities
# (&xxx; or &#NN;) are preserved.
subtest 'quote_escape' => sub {
is(main::quote_escape(undef), '', 'undef → empty');
is(main::quote_escape('a&b'), 'a&amp;b', 'lone & escaped');
is(main::quote_escape('a&'), 'a&amp;', 'trailing & escaped');
is(main::quote_escape('a&amp;b'), 'a&amp;b', 'existing &amp; preserved');
is(main::quote_escape('a&#39;b'), 'a&#39;b', 'numeric entity preserved');
is(main::quote_escape(q{a"b'c}), 'a&quot;b&#39;c',
'both quote styles escaped by default');
is(main::quote_escape(q{a"b'c}, q{"}), q{a&quot;b'c},
'only-quote="\"" escapes only double quotes');
is(main::quote_escape(q{a"b'c}, q{'}), q{a"b&#39;c},
'only-quote="\'" escapes only single quotes');
};
# quote_literal_escape — escape for inclusion in a Perl string literal.
subtest 'quote_literal_escape' => sub {
is(main::quote_literal_escape(undef), '', 'undef → empty');
is(main::quote_literal_escape(''), '', 'empty → empty');
# Default (single-quoted target): only \ and ' need escaping.
is(main::quote_literal_escape(q{it's}), q{it\'s}, 'single quote escaped');
is(main::quote_literal_escape(q{a\\b}), q{a\\\\b}, 'backslash doubled');
is(main::quote_literal_escape(q{a"b$c@d}), q{a"b$c@d},
'double-quote / sigils NOT escaped in single-quoted target');
# Double-quoted target: also escape ", $, @ (because they interpolate).
is(main::quote_literal_escape(q{a"b$c@d}, q{"}),
q{a\"b\$c\@d},
'" $ @ all escaped in double-quoted target');
is(main::quote_literal_escape(q{a\\b}, q{"}), q{a\\\\b},
'backslash doubled in double-quoted target too');
};
# quote_javascript — hex-escape the unsafe characters for a JS string literal.
subtest 'quote_javascript' => sub {
is(main::quote_javascript(q{a"b}), 'a\x22b', 'double quote → \x22');
is(main::quote_javascript(q{a'b}), 'a\x27b', 'single quote → \x27');
is(main::quote_javascript('a<b>c'), 'a\x3cb\x3ec', '< and > escaped');
is(main::quote_javascript('a&b'), 'a\x26b', '& escaped');
is(main::quote_javascript('a\\b'), 'a\x5cb', 'backslash escaped');
is(main::quote_javascript('plain text 123'), 'plain text 123',
'safe characters pass through');
};
# urlize / un_urlize — percent-encoding round-trip.
subtest 'urlize / un_urlize' => sub {
# urlize encodes anything that is not [A-Za-z0-9].
is(main::urlize('abc123'), 'abc123', 'alphanumerics pass through');
is(main::urlize(' '), '%20', 'space encoded');
is(main::urlize('/'), '%2F', 'slash encoded');
is(main::urlize("\n"), '%0A', 'newline encoded');
is(main::urlize(chr(0xff)), '%FF', 'high-bit byte encoded');
is(main::urlize('a b/c'), 'a%20b%2Fc', 'mixed input');
# un_urlize: by default, '+' becomes ' ' (form-encoded). Pass the
# second arg true to preserve '+' literally.
is(main::un_urlize('a+b'), 'a b', '+ decoded as space by default');
is(main::un_urlize('a+b', 1), 'a+b', '+ preserved with plus-literal flag');
is(main::un_urlize('%20'), ' ', '%20 decoded');
is(main::un_urlize('%c3%a9'), "\xc3\xa9",
'lowercase hex decoded (UTF-8 bytes for é)');
# Round-trip through a binary string.
for my $s ('plain', 'a b/c', "binary\x00\x01\xff",
'<script>alert(1)</script>') {
# urlize never emits +, so the no-plus mode is safe here.
is(main::un_urlize(main::urlize($s), 1), $s,
"round-trip preserves ".length($s)." bytes");
}
};
# trim — symmetric or asymmetric whitespace stripping.
#
# Second arg controls which end:
# undef/0 → both
# -1 → right only
# 1 → left only
subtest 'trim' => sub {
is(main::trim(' hi '), 'hi', 'both ends by default');
is(main::trim(' hi ', -1), ' hi', '-1 strips trailing only');
is(main::trim(' hi ', 1), 'hi ', '1 strips leading only');
is(main::trim('nochange'), 'nochange', 'no-op on tidy input');
is(main::trim(''), '', 'empty stays empty');
is(main::trim("\t\nhi\r\n"), 'hi', 'tabs and newlines counted as whitespace');
};
# trunc — truncate to a "whole word" within a max length.
#
# Contract: if the cut lands inside a word, back up to the previous
# whitespace; if no whitespace precedes (the first word is itself longer
# than maxlen), return the partial first word rather than empty.
subtest 'trunc' => sub {
# Early-exit when input already fits.
is(main::trunc('short', 99), 'short', 'no-op when input shorter than max');
is(main::trunc('exact5', 6), 'exact5', 'no-op when input equals max');
# Cut landed at a word boundary — keep the substring intact.
is(main::trunc('hello world foo', 11), 'hello world',
'cut at word boundary keeps last whole word');
is(main::trunc('hello world', 5), 'hello',
'cut at word boundary returns first whole word');
# Cut landed mid-word — back up to the previous whitespace.
is(main::trunc('foo bar baz', 8), 'foo bar',
'mid-word cut backs up to previous whitespace');
is(main::trunc('a b c', 4), 'a b',
'mid-word cut backs up past a one-char word');
# First word longer than maxlen and no preceding whitespace — fall
# back to the partial word rather than empty.
is(main::trunc('hellothere', 5), 'hello',
'long first word with no boundary returns partial');
is(main::trunc('abc', 1), 'a',
'maxlen=1 returns the first char when no boundary exists');
};
# indexof — first-index lookup with `eq`.
subtest 'indexof' => sub {
is(main::indexof('b', 'a', 'b', 'c'), 1, 'returns 0-based index');
is(main::indexof('a', 'a', 'b', 'c'), 0, 'first element');
is(main::indexof('z', 'a', 'b', 'c'), -1, 'missing → -1');
is(main::indexof('a'), -1, 'empty haystack → -1');
is(main::indexof('b', 'a', 'b', 'b'), 1, 'duplicates: first hit wins');
# Numeric needle compared stringwise (eq).
is(main::indexof(1, '0', '1', '2'), 1, 'numeric needle matches stringwise');
};
# indexoflc — case-insensitive variant.
subtest 'indexoflc' => sub {
is(main::indexoflc('B', 'a', 'b', 'c'), 1, 'uppercase needle, lowercase haystack');
is(main::indexoflc('a', 'A', 'B', 'C'), 0, 'lowercase needle, uppercase haystack');
is(main::indexoflc('z', 'a', 'b', 'c'), -1, 'missing → -1');
};
# uniquelc — dedupe by lowercase comparison, preserving first-seen case.
subtest 'uniquelc' => sub {
is_deeply([main::uniquelc('Foo', 'foo', 'FOO', 'Bar')],
['Foo', 'Bar'],
'first-seen case preserved, later case-variants dropped');
is_deeply([main::uniquelc()], [], 'empty input → empty list');
is_deeply([main::uniquelc('x')], ['x'], 'single element passes through');
};
done_testing();