Subject: More UTF-8 support and fixes for w3mman2html.cgi Author: Piotr P. Karwasz, Justin B Rye Origin: https://bugs.launchpad.net/ubuntu/+source/w3m/+bug/680202 Origin: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=771004 diff --git a/scripts/w3mman/w3mman2html.cgi.in b/scripts/w3mman/w3mman2html.cgi.in index f430307..2cd00f9 100644 --- a/scripts/w3mman/w3mman2html.cgi.in +++ b/scripts/w3mman/w3mman2html.cgi.in @@ -34,7 +34,6 @@ Content-Type: text/html EOF $keyword =~ s:([^-\w\200-\377.,])::g; open(F, "$MAN -k $keyword 2> /dev/null |"); - @line = (); while() { chop; $_ = &html_quote($_); @@ -126,12 +125,14 @@ while() { s/\&/\&/g; s/\/\>/g; + # non ASCII UTF-8 codepoint + my $utf8="[\300-\337][\200-\277]|[\340-\357][\200-\277]{2}|[\360-\367][\200-\277]{3}|[\370-\373][\200-\277]{4}|[\374\375][\200-\277]{5}"; - s@([\200-\377].)(\010{1,2}\1)+@$1@g; + s@($utf8)(\010\1)+@$1@g; s@(\&\w+;|.)(\010\1)+@$1@g; - s@__\010{1,2}((\)?[\200-\377].(\)?)@$1@g; + s@_\010((\)?($utf8)(\)?)@$1@g; s@_\010((\)?(\&\w+\;|.)(\)?)@$1@g; - s@((\)?[\200-\377].(\)?)\010{1,2}__@$1@g; + s@((\)?($utf8)(\)?)\010_@$1@g; s@((\)?(\&\w+\;|.)(\)?)\010_@$1@g; s@.\010(.)@$1@g; @@ -156,7 +157,7 @@ EOF } s@(http|ftp)://[\w.\-/~]+[\w/]@$&@g; - s@(\W)(mailto:)?(\w[\w.\-]*\@\w[\w.\-]*\.[\w.\-]*\w)@$1$2$3@g; + s@\b(mailto:|)(\w[\w.\-]*\@\w[\w.\-]*\.[\w.\-]*\w)@$1$2@g; s@(\W)(\~?/[\w.][\w.\-/~]*)@$1 . &file_ref($2)@ge; s@(include(<\/?[bu]\>|\s)*\<)([\w.\-/]+)@$1 . &include_ref($3)@ge; if ($prev && m@^\s*(\<[bu]\>)*(\w[\w.\-]*)(\)*(\([\dm]\w*\))@) { @@ -220,7 +221,7 @@ sub is_command { local($p); (! -d && -x) || return 0; - if (! defined(%PATH)) { + if (! %PATH) { for $p (split(":", $ENV{'PATH'})) { $p =~ s@/+$@@; $PATH{$p} = 1;