#!/usr/bin/perl # Unit tests for the pure string helpers in web-lib-funcs.pl. # # web-lib-funcs.pl is a pure library (no `unless(caller)` guard), so a bare # `require` loads it without side effects. Helpers covered here touch no # globals beyond their args, so no stubbing or %gconfig setup is needed. # # Assertions pin the current contract — encoded byte shape, round-trip # identity, structural invariants — not the prettiest possible output. use strict; use warnings; use Test::More; use File::Basename qw(dirname); use File::Spec; my $script = File::Spec->rel2abs( File::Spec->catfile(dirname(__FILE__), '..', 'web-lib-funcs.pl')); require $script; # html_escape # # Always escapes &, <, >, ", ', =. The optional $nodblamp flag uses a # lookahead so existing entity references survive untouched; without it, # every & becomes &, including already-escaped input. subtest 'html_escape' => sub { is(main::html_escape('&"\'='), '<b>&"'=', 'all six dangerous characters are escaped'); is(main::html_escape(undef), '', 'undef input → empty string'); is(main::html_escape(''), '', 'empty input → empty string'); is(main::html_escape('plain text 123'), 'plain text 123', 'whitespace and alphanumerics pass through'); # Default mode double-escapes existing entities — this is the current # contract; the nodblamp flag opts into the smarter behaviour. is(main::html_escape('&'), '&amp;', 'default mode double-escapes &'); is(main::html_escape('&', 1), '&', 'nodblamp preserves existing &'); is(main::html_escape('A', 1), 'A', 'nodblamp preserves numeric entity'); # Note: nodblamp's lookahead matches any &; as an entity, so # made-up names like &x; are treated as entities and not re-escaped. is(main::html_escape('&x;', 1), '&x;', 'nodblamp preserves arbitrary &word; shape'); is(main::html_escape('& ', 1), '& ', 'nodblamp escapes lone &'); # Adversarial XSS payload — none of the dangerous chars survive raw. my $xss = main::html_escape(q{'=&}); unlike($xss, qr/[<>"'=]/, 'no raw HTML-significant characters remain'); unlike($xss, qr/&(?!(amp|lt|gt|quot|#39|#61);)/, 'every & starts a known entity'); }; # html_unescape — inverse of html_escape for the canonical entity set. subtest 'html_unescape' => sub { is(main::html_unescape('<b>&"'='), '&"\'=', 'canonical entity set round-trips'); is(main::html_unescape('a b'), 'a b', '  decodes to a regular space'); is(main::html_unescape(undef), '', 'undef → empty'); is(main::html_unescape(''), '', 'empty → empty'); is(main::html_unescape('no entities here'), 'no entities here', 'plain text passes through unchanged'); }; # html_strip — remove tags, optionally replacing with a sentinel. subtest 'html_strip' => sub { is(main::html_strip('hello'), 'hello', 'simple tags removed'); is(main::html_strip('y'), 'y', 'attribute-bearing tag removed'); # Quoted attribute that contains a >, which would otherwise break a # naive regexp — the implementation accounts for this. is(main::html_strip('y'), 'y', 'quoted > inside attribute does not end tag early'); is(main::html_strip('plain text'), 'plain text', 'plain text untouched'); is(main::html_strip('x', '|'), '|x|', 'replacement string substituted for each tag'); }; # quote_escape — only ' and " (and lone &) are escaped; existing entities # (&xxx; or &#NN;) are preserved. subtest 'quote_escape' => sub { is(main::quote_escape(undef), '', 'undef → empty'); is(main::quote_escape('a&b'), 'a&b', 'lone & escaped'); is(main::quote_escape('a&'), 'a&', 'trailing & escaped'); is(main::quote_escape('a&b'), 'a&b', 'existing & preserved'); is(main::quote_escape('a'b'), 'a'b', 'numeric entity preserved'); is(main::quote_escape(q{a"b'c}), 'a"b'c', 'both quote styles escaped by default'); is(main::quote_escape(q{a"b'c}, q{"}), q{a"b'c}, 'only-quote="\"" escapes only double quotes'); is(main::quote_escape(q{a"b'c}, q{'}), q{a"b'c}, 'only-quote="\'" escapes only single quotes'); }; # quote_literal_escape — escape for inclusion in a Perl string literal. subtest 'quote_literal_escape' => sub { is(main::quote_literal_escape(undef), '', 'undef → empty'); is(main::quote_literal_escape(''), '', 'empty → empty'); # Default (single-quoted target): only \ and ' need escaping. is(main::quote_literal_escape(q{it's}), q{it\'s}, 'single quote escaped'); is(main::quote_literal_escape(q{a\\b}), q{a\\\\b}, 'backslash doubled'); is(main::quote_literal_escape(q{a"b$c@d}), q{a"b$c@d}, 'double-quote / sigils NOT escaped in single-quoted target'); # Double-quoted target: also escape ", $, @ (because they interpolate). is(main::quote_literal_escape(q{a"b$c@d}, q{"}), q{a\"b\$c\@d}, '" $ @ all escaped in double-quoted target'); is(main::quote_literal_escape(q{a\\b}, q{"}), q{a\\\\b}, 'backslash doubled in double-quoted target too'); }; # quote_javascript — hex-escape the unsafe characters for a JS string literal. subtest 'quote_javascript' => sub { is(main::quote_javascript(q{a"b}), 'a\x22b', 'double quote → \x22'); is(main::quote_javascript(q{a'b}), 'a\x27b', 'single quote → \x27'); is(main::quote_javascript('ac'), 'a\x3cb\x3ec', '< and > escaped'); is(main::quote_javascript('a&b'), 'a\x26b', '& escaped'); is(main::quote_javascript('a\\b'), 'a\x5cb', 'backslash escaped'); is(main::quote_javascript('plain text 123'), 'plain text 123', 'safe characters pass through'); }; # urlize / un_urlize — percent-encoding round-trip. subtest 'urlize / un_urlize' => sub { # urlize encodes anything that is not [A-Za-z0-9]. is(main::urlize('abc123'), 'abc123', 'alphanumerics pass through'); is(main::urlize(' '), '%20', 'space encoded'); is(main::urlize('/'), '%2F', 'slash encoded'); is(main::urlize("\n"), '%0A', 'newline encoded'); is(main::urlize(chr(0xff)), '%FF', 'high-bit byte encoded'); is(main::urlize('a b/c'), 'a%20b%2Fc', 'mixed input'); # un_urlize: by default, '+' becomes ' ' (form-encoded). Pass the # second arg true to preserve '+' literally. is(main::un_urlize('a+b'), 'a b', '+ decoded as space by default'); is(main::un_urlize('a+b', 1), 'a+b', '+ preserved with plus-literal flag'); is(main::un_urlize('%20'), ' ', '%20 decoded'); is(main::un_urlize('%c3%a9'), "\xc3\xa9", 'lowercase hex decoded (UTF-8 bytes for é)'); # Round-trip through a binary string. for my $s ('plain', 'a b/c', "binary\x00\x01\xff", '') { # urlize never emits +, so the no-plus mode is safe here. is(main::un_urlize(main::urlize($s), 1), $s, "round-trip preserves ".length($s)." bytes"); } }; # trim — symmetric or asymmetric whitespace stripping. # # Second arg controls which end: # undef/0 → both # -1 → right only # 1 → left only subtest 'trim' => sub { is(main::trim(' hi '), 'hi', 'both ends by default'); is(main::trim(' hi ', -1), ' hi', '-1 strips trailing only'); is(main::trim(' hi ', 1), 'hi ', '1 strips leading only'); is(main::trim('nochange'), 'nochange', 'no-op on tidy input'); is(main::trim(''), '', 'empty stays empty'); is(main::trim("\t\nhi\r\n"), 'hi', 'tabs and newlines counted as whitespace'); }; # trunc — truncate to a "whole word" within a max length. # # Contract: if the cut lands inside a word, back up to the previous # whitespace; if no whitespace precedes (the first word is itself longer # than maxlen), return the partial first word rather than empty. subtest 'trunc' => sub { # Early-exit when input already fits. is(main::trunc('short', 99), 'short', 'no-op when input shorter than max'); is(main::trunc('exact5', 6), 'exact5', 'no-op when input equals max'); # Cut landed at a word boundary — keep the substring intact. is(main::trunc('hello world foo', 11), 'hello world', 'cut at word boundary keeps last whole word'); is(main::trunc('hello world', 5), 'hello', 'cut at word boundary returns first whole word'); # Cut landed mid-word — back up to the previous whitespace. is(main::trunc('foo bar baz', 8), 'foo bar', 'mid-word cut backs up to previous whitespace'); is(main::trunc('a b c', 4), 'a b', 'mid-word cut backs up past a one-char word'); # First word longer than maxlen and no preceding whitespace — fall # back to the partial word rather than empty. is(main::trunc('hellothere', 5), 'hello', 'long first word with no boundary returns partial'); is(main::trunc('abc', 1), 'a', 'maxlen=1 returns the first char when no boundary exists'); }; # indexof — first-index lookup with `eq`. subtest 'indexof' => sub { is(main::indexof('b', 'a', 'b', 'c'), 1, 'returns 0-based index'); is(main::indexof('a', 'a', 'b', 'c'), 0, 'first element'); is(main::indexof('z', 'a', 'b', 'c'), -1, 'missing → -1'); is(main::indexof('a'), -1, 'empty haystack → -1'); is(main::indexof('b', 'a', 'b', 'b'), 1, 'duplicates: first hit wins'); # Numeric needle compared stringwise (eq). is(main::indexof(1, '0', '1', '2'), 1, 'numeric needle matches stringwise'); }; # indexoflc — case-insensitive variant. subtest 'indexoflc' => sub { is(main::indexoflc('B', 'a', 'b', 'c'), 1, 'uppercase needle, lowercase haystack'); is(main::indexoflc('a', 'A', 'B', 'C'), 0, 'lowercase needle, uppercase haystack'); is(main::indexoflc('z', 'a', 'b', 'c'), -1, 'missing → -1'); }; # uniquelc — dedupe by lowercase comparison, preserving first-seen case. subtest 'uniquelc' => sub { is_deeply([main::uniquelc('Foo', 'foo', 'FOO', 'Bar')], ['Foo', 'Bar'], 'first-seen case preserved, later case-variants dropped'); is_deeply([main::uniquelc()], [], 'empty input → empty list'); is_deeply([main::uniquelc('x')], ['x'], 'single element passes through'); }; done_testing();