Mercurial > notdcc
diff misc/man2html @ 0:c7f6b056b673
First import of vendor version
author | Peter Gervai <grin@grin.hu> |
---|---|
date | Tue, 10 Mar 2009 13:49:58 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/man2html Tue Mar 10 13:49:58 2009 +0100 @@ -0,0 +1,625 @@ +#! /usr/bin/perl +##---------------------------------------------------------------------------## +## File: +## @(#) man2html 1.2 97/08/12 12:57:30 @(#) +## Author: +## Earl Hood, ehood@medusa.acs.uci.edu +## Description: +## man2html is a Perl program to convert formatted nroff output +## to HTML. +## +## Recommend command-line options based on platform: +## +## Platform Options +## --------------------------------------------------------------------- +## c2mp <None, the defaults should be okay> +## hp9000s700/800 -leftm 1 -topm 8 +## sun4 -sun +## --------------------------------------------------------------------- +## +##---------------------------------------------------------------------------## +## Copyright (C) 1995-1997 Earl Hood, ehood@medusa.acs.uci.edu +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
## 02111-1307, USA
##---------------------------------------------------------------------------##

package Man2Html;

use Getopt::Long;

($PROG = $0) =~ s/.*\///;
$VERSION = "3.0.1";

## Input and output filehandles.  A caller may preset either one
## before this code runs; otherwise STDIN/STDOUT are used.
$InFH = \*STDIN unless $InFH;
$OutFH = \*STDOUT unless $OutFH;

## Backspace character: Used in overstriking detection
*bs = \"\b";

## Hash of section titles and their HTML tag wrapper.
## This list allows customization of what HTML tag is used for
## a given section head.
##
## The section title can be a regular expression. Therefore, one must
## be careful about quoting special characters.
##
%SectionHead = (

    '\S.*OPTIONS.*'         => '<H2>',
    'AUTHORS?'              => '<H2>',
    'BUGS'                  => '<H2>',
    'COMPATIBILITY'         => '<H2>',
    'DEPENDENCIES'          => '<H2>',
    'DESCRIPTION'           => '<H2>',
    'DIAGNOSTICS'           => '<H2>',
    'ENVIRONMENT'           => '<H2>',
    'ERRORS'                => '<H2>',
    'EXAMPLES'              => '<H2>',
    'EXTERNAL INFLUENCES'   => '<H2>',
    'FILES'                 => '<H2>',
    'LIMITATIONS'           => '<H2>',
    'NAME'                  => '<H2>',
    'NOTES?'                => '<H2>',
    'OPTIONS'               => '<H2>',
    'REFERENCES'            => '<H2>',
    'RETURN VALUE'          => '<H2>',
    'SECTION.*:'            => '<H2>',
    'SEE ALSO'              => '<H2>',
    'STANDARDS CONFORMANCE' => '<H2>',
    'STYLE CONVENTION'      => '<H2>',
    'SYNOPSIS'              => '<H2>',
    'SYNTAX'                => '<H2>',
    'WARNINGS'              => '<H2>',
    '\s+Section.*:'         => '<H3>',

);

## Fallback tag if above is not found
$HeadFallback = '<H2>';

## Other globals

$Bare     = 0;          # Skip printing HTML head/foot flag
$BTag     = 'B';        # Overstrike tag
$CgiUrl   = '';         # CGI URL expression
$Compress = 0;          # Do blank line compression flag
$K        = 0;          # Do keyword search processing flag
$NoDepage = 0;          # Do not strip page information
$NoHeads  = 0;          # Do no header detection flag
$SeeAlso  = 0;          # Do only SEE ALSO xrefs flag
$Solaris  = 0;          # Solaris keyword search processing flag
$Sun      = 0;          # Headers not overstriken flag
$Title    = 'FIX ME';   # Title
$UTag     = 'I';        # Underline tag
$ftsz     = 7;          # Bottom margin size
$hdsz     = 7;          # Top margin size
$leftm    = '';         # Left margin pad
$leftmsz  = 0;          # Left margin size
$pgsz     = 66;         # Size of page size
$txsz     = 52;         # Text body length size

#############################################################################
## Main Block
#############################################################################
{
    if (get_cli_opts()) {
        if ($K) {
            man_k();
        } else {
            do_it();
        }
    } else {
        usage();
    }
}

#############################################################################
## Subroutines
#############################################################################

##---------------------------------------------------------------------------
##	do_it() converts a formatted manpage read from $InFH into HTML
##	written to $OutFH.  The conversion loop is assembled as a string
##	(two fixed halves with a generated heading "switch" between them)
##	and then run with a string eval.
##
sub do_it {

    ## Define while loop and then eval it when used.  The reason
    ## is to avoid the regular expression reevaluation in the
    ## section head detection code.

    $doitcode =<<'EndOfDoItCode';

    my($line, $tmp, $i, $head, $preindent, $see_also, $do);

    $see_also = !$SeeAlso;
    print $OutFH "<!-- Manpage converted by man2html $VERSION -->\n";
    LOOP: while(!eof($InFH)) {
        $blank = 0;
        ## Discard the top margin of the page
        for ($i=0; $i < $hdsz; $i++) {
            last LOOP unless defined($_ = <$InFH>);
        }
        for ($i=0; $i < $txsz; $i++) {
            last LOOP unless defined($_ = <$InFH>);

            ## Check if compress consecutive blank lines
            if ($Compress and !/\S/) {
                if ($blank) { next; } else { $blank = 1; }
            } else {
                $blank = 0;
            }

            ## Try to check if line space is needed at page boundaries ##
            if (!$NoDepage && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
                /^(\s*)/; $tmp = length($1);
                if ($do) {
                    if ($tmp < $preindent) { print $OutFH "\n"; }
                } else {
                    $do = 1;
                }
                $preindent = $tmp;
            } else {
                $do = 0; $preindent = 0;
            }

            ## Interpret line
            $line = $_;
            entitize(\$_);          # Convert [&<>] to entity references

            ## Check for 'SEE ALSO' link only
            if (!$see_also && $CgiUrl && $SeeAlso) {
                ($tmp = $line) =~ s/.\010//go;
                if ($tmp =~ /^\s*SEE\s+ALSO\s*$/o) { $see_also = 1; }
                else { $see_also = 0; }
            }

            ## Create anchor links for manpage references
            s/((((.\010)+)?[\+_\.\w-])+\(((.\010)+)?
              \d((.\010)+)?\w?\))
             /make_xref($1)
             /geox if $see_also;

            ## Emphasize underlined words
            # s/((_\010[^_])+[\.\(\)_]?(_\010[^_])+\)?)/emphasize($1)/oge;
            # s/((_\010[^_])+([\.\(\)_]?(_\010[^_])+)?)/emphasize($1)/oge;
            #
            # The previous expressions were trying to be clever about
            # detecting underlined text which contain non-alphanumeric
            # characters.  nroff will not underline non-alphanumeric
            # characters in an underlined phrase, and the above was trying
            # to detect that.  It does not work all the time, and it
            # screws up other text, so a simplified expression is used.

            s/((_\010[^_])+)/emphasize($1)/oge;

            $secth = 0;
            ## Check for strong text and headings
            if ($Sun || /.\010./o) {
                if (!$NoHeads) {
                    $line =~ s/.\010//go;
                    $tmp = $HeadFallback;
EndOfDoItCode

    ## Create switch statement for detecting a heading
    ##
    $doitcode .= "HEADSW: {\n";
    foreach $head (keys %SectionHead) {
        $doitcode .= join("", "\$tmp = '$SectionHead{$head}', ",
                              "\$secth = 1, last HEADSW ",
                              "if \$line =~ /^$leftm$head/o;\n");
    }
    $doitcode .= "}\n";

    ## Rest of routine
    ##
    $doitcode .=<<'EndOfDoItCode';
                    if ($secth || $line =~ /^$leftm\S/o) {
                        ## chomp, not chop: a final line without a
                        ## newline must not lose its last character.
                        chomp $line;
                        $_ = $tmp . $line . $tmp;
                        ## Turn the trailing copy of the tag into an end tag
                        s%<([^>]*)>$%</$1>%;
                        $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
                    } else {
                        s/(((.\010)+.)+)/strongize($1)/oge;
                    }
                } else {
                    s/(((.\010)+.)+)/strongize($1)/oge;
                }
            }
            print $OutFH $_;
        }

        ## Discard the bottom margin of the page
        for ($i=0; $i < $ftsz; $i++) {
            last LOOP unless defined($_ = <$InFH>);
        }
    }
EndOfDoItCode


    ## Perform processing.

    printhead() unless $Bare;
    print $OutFH "<PRE>\n";
    eval $doitcode;			# $doitcode defined above
    ## The generated code embeds the %SectionHead keys as regular
    ## expressions; report a failure instead of emitting an empty page.
    warn "$PROG: manpage conversion failed: $@" if $@;
    print $OutFH "</PRE>\n";
    printtail() unless $Bare;
}

##---------------------------------------------------------------------------
##	get_cli_opts() parses the command line with GetOptions() and
##	copies the resulting $opt_* values into the global settings.
##	Returns 0 if option parsing fails or -help was given, else 1.
##
sub get_cli_opts {
    return 0 unless
    GetOptions(
        "bare",         # Leave out HTML, HEAD, BODY tags.
        "belem=s",      # HTML Element for overstriked text (def: "B")
        "botm=i",       # Number of lines for bottom margin (def: 7)
        "cgiurl=s",     # CGI URL for linking to other manpages
        "cgiurlexp=s",  # CGI URL Perl expr for linking to other manpages
        "compress",     # Compress consecutive blank lines
        "headmap=s",    # Filename of user section head map file
        "k",            # Process input from 'man -k' output.
        "leftm=i",      # Character width of left margin (def: 0)
        "nodepage",     # Do not remove pagination lines
        "noheads",      # Do not detect for section heads
        "pgsize=i",     # Number of lines in a page (def: 66)
        "seealso",      # Link to other manpages only in the SEE ALSO section
        "solaris",      # Parse 'man -k' output from a solaris system
        "sun",          # Section heads are not overstriked in input
        "title=s",      # Title of manpage (def: Not defined)
        "topm=i",       # Number of lines for top margin (def: 7)
        "uelem=s",      # HTML Element for underlined text (def: "I")

        "help"          # Short usage message
    );
    return 0 if defined($opt_help);

    $pgsz = $opt_pgsize || $pgsz;
    if (defined($opt_nodepage)) {
        ## Pagination lines are kept, so there are no margins to skip
        $hdsz = 0;
        $ftsz = 0;
    } else {
        $hdsz = $opt_topm if defined($opt_topm);
        $ftsz = $opt_botm if defined($opt_botm);
    }
    $txsz = $pgsz - ($hdsz + $ftsz);
    $leftmsz = $opt_leftm if defined($opt_leftm);
    $leftm = ' ' x $leftmsz;

    $Bare     = defined($opt_bare);
    $Compress = defined($opt_compress);
    $K        = defined($opt_k);
    $NoDepage = defined($opt_nodepage);
    $NoHeads  = defined($opt_noheads);
    $SeeAlso  = defined($opt_seealso);
    $Solaris  = defined($opt_solaris);
    $Sun      = defined($opt_sun);

    $Title = $opt_title || $Title;
    ## -cgiurlexp is used as a Perl expression as given; -cgiurl is
    ## wrapped so that it is evaluated as an interpolated string.
    $CgiUrl = $opt_cgiurlexp ||
                  ($opt_cgiurl ? qq{return "$opt_cgiurl"} : '');

    $BTag = $opt_belem || $BTag;
    $UTag = $opt_uelem || $UTag;
    $BTag =~ s/[<>]//g;
    $UTag =~ s/[<>]//g;

    if (defined($opt_headmap)) {
        require $opt_headmap or warn "Unable to read $opt_headmap\n";
    }
    1;
}

##---------------------------------------------------------------------------
##	printhead() writes the HTML document prologue, with $Title
##	interpolated, to $OutFH.
##
sub printhead {
    print $OutFH <<EndOfMeta;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
<HTML>
<HEAD>
    <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
    <TITLE>$Title</TITLE>
    <META http-equiv="Content-Style-Type" content="text/css">
    <STYLE type="text/css">
	BODY {background-color:white; color:black}
	ADDRESS {font-size:smaller}
	IMG.logo {width:6em; vertical-align:middle}
    </STYLE>
</HEAD>
<BODY>
EndOfMeta
}

##---------------------------------------------------------------------------
##	printtail() writes the HTML document epilogue to $OutFH.  The
##	here-document is not interpolated (it contains a literal "$Date").
##
sub printtail {
    print $OutFH <<'EndOfRef';
<HR>
<ADDRESS>
Man(1) output converted with
<a href="http://www.oac.uci.edu/indiv/ehood/man2html.html">man2html</a>
modified for the DCC $Date 2001/04/29 03:22:18 $
<BR>
<A HREF="http://www.dcc-servers.net/dcc/">
    <IMG SRC="http://logos.dcc-servers.net/border.png"
            class=logo ALT="DCC logo">
    </A>
<A HREF="http://validator.w3.org/check?uri=referer">
    <IMG class=logo ALT="Valid HTML 4.01 Strict"
        SRC="http://www.w3.org/Icons/valid-html401">
    </A>
</ADDRESS>
</BODY>
</HTML>
EndOfRef
}

##---------------------------------------------------------------------------
##	emphasize() strips the "char, backspace" overstrike pairs from
##	its argument and wraps what remains in the underline tag ($UTag).
##
sub emphasize {
    my($txt) = shift;
    $txt =~ s/.\010//go;
    $txt = "<$UTag>$txt</$UTag>";
    $txt;
}

##---------------------------------------------------------------------------
##	strongize() strips the "char, backspace" overstrike pairs from
##	its argument and wraps what remains in the overstrike tag ($BTag).
##
sub strongize {
    my($txt) = shift;
    $txt =~ s/.\010//go;
    $txt = "<$BTag>$txt</$BTag>";
    $txt;
}

##---------------------------------------------------------------------------
##	entitize() converts the HTML special characters (& < >) in the
##	string referenced by its argument into entity references, in-place.
##
sub entitize {
    my($txt) = shift;

    ## Check for special characters in overstrike text ##
    $$txt =~ s/_\010\&/strike('_', '&')/geo;
    $$txt =~ s/_\010</strike('_', '<')/geo;
    $$txt =~ s/_\010>/strike('_', '>')/geo;

    $$txt =~ s/(\&\010)+\&/strike('&', '&')/geo;
    $$txt =~ s/(<\010)+</strike('<', '<')/geo;
    $$txt =~ s/(>\010)+>/strike('>', '>')/geo;

    ## Check for special characters in regular text.  A special character
    ## next to a backspace belongs to an overstrike sequence and is left
    ## alone.  Zero-width assertions are used so that the neighbouring
    ## characters are not consumed: a special character at the very start
    ## of the text, or two of them separated by a single character
    ## ("a&b&c"), are all converted.
    $$txt =~ s/(?<!\010)([&<>])(?!\010)/htmlize2($1)/ge;
}

##---------------------------------------------------------------------------
##	escape special characters in a string, in-place
##
##	Takes a reference to the string; returns the escaped string.
##
sub htmlize {
    my($str) = shift;
    $$str = htmlize2($$str);
    $$str;
}

##---------------------------------------------------------------------------
##	htmlize2() is used by entitize.
##
##	Returns a copy of its argument with & < > replaced by the
##	corresponding entity references.  All three are handled in one
##	pass, so the "&" of a generated entity is never escaped again.
##
sub htmlize2 {
    my($str) = shift;
    my %entity_of = ('&' => 'amp', '<' => 'lt', '>' => 'gt');
    $str =~ s/([&<>])/&$entity_of{$1};/g;
    $str;
}

##---------------------------------------------------------------------------
##	strike converts HTML special characters in overstriked text
##	into entity references.  The entities are overstriked so
##	strongize() and emphasize() will recognize the entity to be
##	wrapped in tags.
##
##	Arguments: the overstriking character ('_' for underlined text,
##	anything else for bold text) and the special character to convert
##	('&', '<' or '>').  Returns the overstruck entity reference, or
##	undef (after a warning) for any other character.
##
sub strike {
    my($w, $char) = @_;
    my %name_of = ('&' => 'amp', '<' => 'lt', '>' => 'gt');
    my($ret);

    if (exists $name_of{$char}) {
        ## Overstrike every character of the entity reference:
        ## underlined text is "_", backspace, char; bold text is
        ## char, backspace, char.
        my @pieces = split(//, "&$name_of{$char};");
        $ret = join('',
                    map { ($w eq '_') ? "_$bs$_" : "$_$bs$_" } @pieces);
    } else {
        warn qq|Unrecognized character, "$char", passed to strike()\n|;
    }
    $ret;
}

##---------------------------------------------------------------------------
##	make_xref() converts a manpage crossreference into a hyperlink.
##
##	The reference is always set in bold.  It is made a link only when
##	a CGI URL expression ($CgiUrl) is defined.  That expression is
##	evaluated here, so it can use the lexicals $title, $section and
##	$subsection.
##
sub make_xref {
    my $str = shift;
    $str =~ s/.\010//go;		# Remove overstriking

    return qq|<B>$str</B>| unless $CgiUrl;

    my($title,$section,$subsection) =
        ($str =~ /([\+_\.\w-]+)\((\d)(\w?)\)/);

    $title =~ s/\+/%2B/g;		# "+" must be escaped in the URL
    my($href) = (eval $CgiUrl);
    return qq|<B><A HREF="$href">$str</A></B>|;
}

##---------------------------------------------------------------------------
##	man_k() process a keyword search.  The problem we have is there
##	is no standard for keyword search results from man.  Solaris
##	systems have a different enough format to warrant dealing
##	with it as a special case.  For other cases, we try our best.
##	Unfortunately, there are some lines of results that may be
##	skipped.
##
##	Reads 'man -k' output from $InFH, groups the entries by manual
##	section and writes one definition list per section to $OutFH.
##
sub man_k {
    my($line, $refs, $section, $subsection, $desc);
    my(%desc_for, %subsec_for);         # Sections 1-9, keyed by digit
    my(%SecN, %SecNsub, %SecNsec);      # Everything else

    printhead() unless $Bare;
    print $OutFH "<!-- Man keyword results converted by ",
                 "man2html $VERSION -->\n";

    while ($line = <$InFH>) {
        next if $line !~ /\(\d\w?\)\s+-\s/; # check if line can be handled
        ($refs,$section,$subsection,$desc) =
            $line =~ /^\s*(.*)\((\d)(\w?)\)\s*-\s*(.*)$/;

        if ($Solaris) {
            $refs =~ s/^\s*([\+_\.\w-]+)\s+([\+_\.\w-]+)\s*$/$1/;
                                        #  <topic> <manpage>
        } else {
            $refs =~ s/\s(and|or)\s/,/gi; # Convert and/or to commas
            $refs =~ s/^[^:\s]:\s*//;     # Remove prefixed whatis path
        }
        $refs =~ s/\s//g;               # Remove all whitespace
        $refs =~ s/,/, /g;              # Put space after comma
        htmlize(\$desc);                # Check for special chars in desc
        $desc =~ s/^(.)/\U$1/;          # Uppercase first letter in desc

        if ($section =~ /\A[1-9]\z/) {
            $desc_for{$section}{$refs}   = $desc;
            $subsec_for{$section}{$refs} = $subsection;
        } else {                        # Catch all
            $SecN{$refs}    = $desc;
            $SecNsec{$refs} = $section;
            $SecNsub{$refs} = $subsection;
        }
    }

    ## Sections with no entries print nothing
    foreach my $digit (1 .. 9) {
        print_mank_sec($desc_for{$digit} || {}, $digit,
                       $subsec_for{$digit} || {});
    }
    print_mank_sec(\%SecN, 'N', \%SecNsub, \%SecNsec);

    printtail() unless $Bare;
}

##---------------------------------------------------------------------------
##	print_mank_sec() prints out manpage cross-refs of a specific section.
##
##	Arguments: hash ref of description by reference list, the section
##	label, hash ref of subsection by reference list and, for the
##	catch-all section 'N' only, hash ref of the real section by
##	reference list.  The CGI URL expression ($CgiUrl), if any, is
##	evaluated here and can use $title, $section and $subsection.
##
sub print_mank_sec {
    my($sec, $sect, $secsub, $secsec) = @_;
    my($title, $subsection, $xref, @refs);
    my $section = $sect;

    my @array = sort keys %$sec;
    return unless @array;

    print $OutFH "<H2>Section $section</H2>\n",
                 "<DL COMPACT>\n";
    foreach my $item (@array) {
        @refs = split(/,/, $item);
        $section = $secsec->{$item} if $sect eq 'N';
        $subsection = $secsub->{$item};
        if ($CgiUrl) {
            ($title = $refs[0]) =~ s/\(\)//g; # watch out for extra ()'s
            $xref = eval $CgiUrl;
        }
        print $OutFH "<DT>\n";
        print $OutFH join(", ",
            map { $CgiUrl ? qq|<B><A HREF="$xref">$_</A></B>| : $_ } @refs);
        print $OutFH " ($section$subsection)\n",
                     "</DT><DD>\n",
                     $sec->{$item}, "</DD>\n";
    }
    print $OutFH "</DL>\n";
}

##---------------------------------------------------------------------------
##	usage() prints the usage message to $OutFH and exits with status 0.
##
sub usage {
    my $text = <<EndOfUsage;
Usage: $PROG [ options ] < infile > outfile
Options:
 -bare : Do not put in HTML, HEAD, BODY tags
 -belem <elem> : HTML Element for overstriked text (def: "B")
 -botm <#> : Number of lines for bottom margin (def: 7)
 -cgiurl <url> : URL for linking to other manpages
 -cgiurlexp <url> : Perl expression URL for linking to other manpages
 -compress : Compress consective blank lines
 -headmap <file> : Filename of user section head map file
 -help : This message
 -k : Process a keyword search result
 -leftm <#> : Character width of left margin (def: 0)
 -nodepage : Do not remove pagination lines
 -noheads : Turn off section head detection
 -pgsize <#> : Number of lines in a page (def: 66)
 -seealso : Link to other manpages only in the SEE ALSO section
 -solaris : Process keyword search result in Solaris format
 -sun : Section heads are not overstriked in input
 -title <string> : Title of manpage (def: Not defined)
 -topm <#> : Number of lines for top margin (def: 7)
 -uelem <elem> : HTML Element for underlined text (def: "I")

Description:
 $PROG takes formatted manpages from STDIN and converts it to HTML sent
 to STDOUT. The -topm and -botm arguments are the number of lines to the
 main body text and NOT to the running headers/footers.

Version:
 $VERSION
 Copyright (C) 1995-1997 Earl Hood, ehood\@medusa.acs.uci.edu
 $PROG comes with ABSOLUTELY NO WARRANTY and $PROG may be copied only
 under the terms of the GNU General Public License, which may be found in
 the $PROG distribution.

EndOfUsage
    print $OutFH $text;
    exit 0;
}