dist2/132html - nest-cam/4320010/pcre2 - Git at Google

 #! /usr/bin/perl -w

 # Script to turn PCRE2 man pages into HTML


 # Subroutine to handle font changes and other escapes

 sub do_line {
   # Convert one line of man page text to HTML and return the result.
   # The argument itself is left untouched; all work is done on a copy.
   my $html = shift;

   for ($html) {
     # Angle brackets are escaped first, so that the tags inserted by the
     # font substitutions below are not themselves escaped.
     s/</&#60;/g;
     s/>/&#62;/g;

     # \fI...\fR or \fI...\fP is italic; \fB...\fR or \fB...\fP is bold.
     s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
     s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;

     # \e is the man page spelling of a backslash.
     s"\\e"\\"g;

     # "(c)" becomes a copyright entity only directly after "Copyright ".
     s/(?<=Copyright )\(c\)/&copy;/g;
   }

   return $html;
 }

 # Subroutine to ensure not in a paragraph

 sub end_para {
   # If a paragraph is open, write its closing tags to TEMP (closing an open
   # literal section first), then reset the paragraph state flags.
   if ($inpara) {
     my $closing = "</P>\n";
     $closing = "</PRE>\n" . $closing if ($inpre);
     print TEMP $closing;
   }

   $inpre = 0;
   $inpara = 0;
   $wrotetext = 0;
 }

 # Subroutine to start a new paragraph

 sub new_para {
   # Close whatever paragraph is currently open, then open a fresh one on
   # TEMP and record that we are now inside a paragraph.
   end_para();
   print TEMP "<P>\n";
   $inpara = 1;
 }


 # Main program

 # Conversion state used by the subroutines and by the main loop.

 $ref = 1;                                    # number used in the next TOC/SEC anchors
 $toc = 0;                                    # becomes 1 when -toc is given
 $innf = $inpara = $inpre = $wrotetext = 0;   # output state flags, all off

 # Consume leading arguments that start with "-". Only -toc has an effect;
 # any other option is silently dropped.

 while (@ARGV && $ARGV[0] =~ /^-/) {
   my $option = shift @ARGV;
   $toc = 1 if $option eq "-toc";
 }

 # Initial output to STDOUT

 # The page name in the title and heading is $ARGV[0], the first argument
 # left over after option processing.

 print <<End ;
 <html>
 <head>
 <title>$ARGV[0] specification</title>
 </head>
 <body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
 <h1>$ARGV[0] man page</h1>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
 </p>
 <p>
 This page is part of the PCRE2 HTML documentation. It was generated
 automatically from the original man page. If there is any nonsense in it,
 please consult the man page, in case the conversion went wrong.
 <br>
 End

 # With -toc the contents list is written directly to STDOUT as headings are
 # found, so it is opened here, before any body text exists.

 print "<ul>\n" if ($toc);

 # The page body is collected in a temporary file named after the process ID.
 # The three-argument form of open keeps the mode separate from the file name,
 # and the failure message includes the system error.
 # NOTE(review): /tmp/$$ is a predictable name; the same path is reopened and
 # unlinked at the end of the script, so any change must be made in both places.

 open(TEMP, '>', "/tmp/$$") || die "Can't open /tmp/$$ for output: $!\n";

 # Main conversion loop. Each line of the man page source is read from STDIN.
 # The HTML for the page body is written to TEMP; with -toc, the contents
 # entries are printed directly to STDOUT as section headings are found.

 while (<STDIN>)
   {
   # Handle lines beginning with a dot

   if (/^\./)
     {
     # Some of the PCRE2 man pages used to contain instances of .br. However,
     # they should have all been removed because they cause trouble in some
     # (other) automated systems that translate man pages to HTML. Complain if
     # we find .br or .in (another macro that is deprecated).

     if (/^\.br/ || /^\.in/)
       {
       print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
       print STDERR "*** $_\n";
       die "*** Processing abandoned\n";
       }

     # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
     # $innf records whether we are currently between the two.

     elsif (/^\.nf/)
       {
       $innf = 1;
       }

     elsif (/^\.fi/)
       {
       $innf = 0;
       }

     # Handling .sp is subtle. If it is inside a literal section, do nothing if
     # the next line is a non literal text line; similarly, if not inside a
     # literal section, do nothing if a literal follows, unless we are inside
     # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
     # literal sections will do the spacing. Always skip if no previous output.

     elsif (/^\.sp/)
       {
       if ($wrotetext)
         {
         # Read one line ahead; a line starting with white space or a dot
         # is treated differently from ordinary text below.
         # NOTE(review): if .sp is the last input line this read yields undef.
         $_ = <STDIN>;
         if ($inpre)
           {
           print TEMP "\n" if (/^[\s.]/);
           }
         else
           {
           print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
           }
         redo;    # Now process the lookahead line we just read
         }
       }

     # .TP, .PP and .P all simply start a new paragraph. (The /^\.P/ test on
     # its own already matches .PP as well.)

     elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
       {
       &new_para();
       }

     # .SH is a section heading. The title may be enclosed in double quotes,
     # which the pattern leaves out of $2.

     elsif (/^\.SH\s*("?)(.*)\1/)
       {
       # Ignore the NAME section
       if ($2 =~ /^NAME\b/)
         {
         <STDIN>;         # Read and discard the line after the heading
         next;
         }

       &end_para();
       my($title) = &do_line($2);

       # With -toc, the contents entry goes to STDOUT and the heading in the
       # body becomes an anchor whose link always points back at #TOC1.
       if ($toc)
         {
         printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
           $ref, $ref);
         printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
           $ref);
         $ref++;
         }
       else
         {
         print TEMP "<br><b>\n$title\n</b><br>\n";
         }
       }

     # .SS is a subsection heading. It is always output as bold text and
     # never gets a contents entry.

     elsif (/^\.SS\s*("?)(.*)\1/)
       {
       &end_para();
       my($title) = &do_line($2);
       print TEMP "<br><b>\n$title\n</b><br>\n";
       }

     # .B and .I output the rest of the line in bold or italic, starting a
     # paragraph if none is open. Pairs of double quotes are removed.

     elsif (/^\.B\s*(.*)/)
       {
       &new_para() if (!$inpara);
       $_ = &do_line($1);
       s/"(.*?)"/$1/g;
       print TEMP "<b>$_</b>\n";
       $wrotetext = 1;
       }
     elsif (/^\.I\s*(.*)/)
       {
       &new_para() if (!$inpara);
       $_ = &do_line($1);
       s/"(.*?)"/$1/g;
       print TEMP "<i>$_</i>\n";
       $wrotetext = 1;
       }

     # A comment that starts "HREF" takes the next line as a name that
     # is turned into a hyperlink, using the text given, which might be
     # in a special font. If it ends in () or (digits) or punctuation, they
     # aren't part of the link.

     elsif (/^\.\\"\s*HREF/)
       {
       $_=<STDIN>;
       chomp;
       $_ = &do_line($_);
       $_ =~ s/\s+$//;

       # $1 becomes the link target: the text without any one-character tag
       # around it, a trailing "()" or "(digits)", or final punctuation.
       # NOTE(review): the result of this match is not checked, so a line
       # that does not fit the pattern leaves $1 unset by this statement.
       $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
       print TEMP "<a href=\"$1.html\">$_</a>\n";
       }

     # A comment that starts "HTML" inserts literal HTML

     elsif (/^\.\\"\s*HTML\s*(.*)/)
       {
       print TEMP $1;
       }

     # A comment that starts < inserts that HTML at the end of the
     # *next* input line - so as not to get a newline between them.

     elsif (/^\.\\"\s*(<.*>)/)
       {
       my($markup) = $1;
       $_=<STDIN>;
       chomp;
       $_ = &do_line($_);
       $_ =~ s/\s+$//;
       print TEMP "$_$markup\n";
       }

     # A comment that starts JOIN joins the next two lines together, with one
     # space between them. Then that line is processed. This is used in some
     # displays where two lines are needed for the "man" version. JOINSH works
     # the same, except that it assumes this is a shell command, so removes
     # continuation backslashes.

     elsif (/^\.\\"\s*JOIN(SH)?/)
       {
       my($one,$two);
       $one = <STDIN>;
       $two = <STDIN>;

       # $1 is the optional "SH" captured by the pattern above, so it is
       # defined only for JOINSH.
       $one =~ s/\s*\\e\s*$// if (defined($1));
       chomp($one);
       $two =~ s/^\s+//;
       $_ = "$one $two";
       redo;            # Process the joined lines
       }

     # .EX/.EE are used in the pcre2demo page to bracket the entire program.
     # It is copied after a <PRE> tag with each "\e" turned back into a
     # backslash and with &, < and > replaced by HTML entities.
     # NOTE(review): no matching </PRE> is written when .EE is reached.

     elsif (/^\.EX\s*$/)
       {
       print TEMP "<PRE>\n";
       while (<STDIN>)
         {
         last if /^\.EE\s*$/;
         s/\\e/\\/g;
         s/&/&amp;/g;
         s/</&lt;/g;
         s/>/&gt;/g;
         print TEMP;
         }
       }

     # Ignore anything not recognized

     next;
     }

   # Line does not begin with a dot. Replace blank lines with new paragraphs

   if (/^\s*$/)
     {
     &end_para() if ($wrotetext);
     next;
     }

   # Convert fonts changes and output an ordinary line. Ensure that indented
   # lines are marked as literal.

   $_ = &do_line($_);
   &new_para() if (!$inpara);

   # A line starting with white space opens a <pre> section if none is open;
   # the first line that does not start with white space closes it again.

   if (/^\s/)
     {
     if (!$inpre)
       {
       print TEMP "<pre>\n";
       $inpre = 1;
       }
     }
   elsif ($inpre)
     {
     print TEMP "</pre>\n";
     $inpre = 0;
     }

   # Add <br> to the end of a non-literal line if we are within .nf/.fi

   $_ .= "<br>\n" if (!$inpre && $innf);

   print TEMP;
   $wrotetext = 1;
   }

 # The TOC, if present, will have been written - terminate it

 print "</ul>\n" if ($toc);

 # Copy the remainder to the standard output. The body was collected in the
 # temporary file, so finish writing it, read it back, and follow it with the
 # closing link. The close of the write handle is checked because buffered
 # write errors only show up at that point; without the check a failed write
 # would give a silently truncated page.

 close(TEMP) || die "Can't finish writing /tmp/$$: $!\n";
 open(TEMP, '<', "/tmp/$$") || die "Can't open /tmp/$$ for input: $!\n";

 print while (<TEMP>);

 print <<End ;
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
 </p>
 End

 # The temporary file is no longer needed once it has been copied.

 close(TEMP);
 unlink("/tmp/$$");

 # End
	#! /usr/bin/perl -w

	# Script to turn PCRE2 man pages into HTML


	# Subroutine to handle font changes and other escapes

	sub do_line {
	  # Convert one line of man page text to HTML and return the result.
	  my($s) = $_[0];

	  # Escape angle brackets as numeric entities. This must happen before
	  # the font substitutions, which insert real tags. (Replacing "<" by
	  # itself, as this copy previously did, escapes nothing.)
	  $s =~ s/</&#60;/g;
	  $s =~ s/>/&#62;/g;

	  # \fI...\fR or \fI...\fP is italic; \fB...\fR or \fB...\fP is bold.
	  $s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
	  $s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;

	  # \e is the man page spelling of a backslash.
	  $s =~ s"\\e"\\"g;

	  # Use the named entity rather than a literal character, so that the
	  # output does not depend on the encoding of this script.
	  $s =~ s/(?<=Copyright )\(c\)/&copy;/g;
	  $s;
	}

	# Subroutine to ensure not in a paragraph

	sub end_para {
	  # If a paragraph is open, write its closing tags to TEMP (closing an
	  # open literal section first), then reset the paragraph state flags.
	  if ($inpara) {
	    my $closing = "</P>\n";
	    $closing = "</PRE>\n" . $closing if ($inpre);
	    print TEMP $closing;
	  }

	  $inpre = 0;
	  $inpara = 0;
	  $wrotetext = 0;
	}

	# Subroutine to start a new paragraph

	sub new_para {
	  # Close whatever paragraph is currently open, then open a fresh one on
	  # TEMP and record that we are now inside a paragraph.
	  end_para();
	  print TEMP "<P>\n";
	  $inpara = 1;
	}


	# Main program

	# Conversion state used by the subroutines and by the main loop.

	$ref = 1;                                    # number used in the next TOC/SEC anchors
	$toc = 0;                                    # becomes 1 when -toc is given
	$innf = $inpara = $inpre = $wrotetext = 0;   # output state flags, all off

	# Consume leading arguments that start with "-". Only -toc has an effect;
	# any other option is silently dropped.

	while (@ARGV && $ARGV[0] =~ /^-/) {
	  my $option = shift @ARGV;
	  $toc = 1 if $option eq "-toc";
	}

	# Initial output to STDOUT

	# The page name in the title and heading is $ARGV[0], the first argument
	# left over after option processing.

	print <<End ;
	<html>
	<head>
	<title>$ARGV[0] specification</title>
	</head>
	<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
	<h1>$ARGV[0] man page</h1>
	<p>
	Return to the <a href="index.html">PCRE2 index page</a>.
	</p>
	<p>
	This page is part of the PCRE2 HTML documentation. It was generated
	automatically from the original man page. If there is any nonsense in it,
	please consult the man page, in case the conversion went wrong.
	<br>
	End

	# With -toc the contents list is written directly to STDOUT as headings
	# are found, so it is opened here, before any body text exists.

	print "<ul>\n" if ($toc);

	# The page body is collected in a temporary file named after the process
	# ID. The "or" operator is written as a plain || (the backslash-escaped
	# form is not valid Perl), the three-argument form of open keeps the mode
	# separate from the file name, and the message includes the system error.
	# NOTE(review): /tmp/$$ is a predictable name; the same path is reopened
	# and unlinked at the end of the script.

	open(TEMP, '>', "/tmp/$$") || die "Can't open /tmp/$$ for output: $!\n";

	# Main conversion loop. Each line of the man page source is read from STDIN.
	# The HTML for the page body is written to TEMP; with -toc, the contents
	# entries are printed directly to STDOUT as section headings are found.

	while (<STDIN>)
	  {
	  # Handle lines beginning with a dot

	  if (/^\./)
	    {
	    # Some of the PCRE2 man pages used to contain instances of .br. However,
	    # they should have all been removed because they cause trouble in some
	    # (other) automated systems that translate man pages to HTML. Complain if
	    # we find .br or .in (another macro that is deprecated).

	    if (/^\.br/ || /^\.in/)
	      {
	      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
	      print STDERR "*** $_\n";
	      die "*** Processing abandoned\n";
	      }

	    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
	    # $innf records whether we are currently between the two.

	    elsif (/^\.nf/)
	      {
	      $innf = 1;
	      }

	    elsif (/^\.fi/)
	      {
	      $innf = 0;
	      }

	    # Handling .sp is subtle. If it is inside a literal section, do nothing if
	    # the next line is a non literal text line; similarly, if not inside a
	    # literal section, do nothing if a literal follows, unless we are inside
	    # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
	    # literal sections will do the spacing. Always skip if no previous output.

	    elsif (/^\.sp/)
	      {
	      if ($wrotetext)
	        {
	        # Read one line ahead; a line starting with white space or a dot
	        # is treated differently from ordinary text below.
	        $_ = <STDIN>;
	        if ($inpre)
	          {
	          print TEMP "\n" if (/^[\s.]/);
	          }
	        else
	          {
	          print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
	          }
	        redo;    # Now process the lookahead line we just read
	        }
	      }

	    # .TP, .PP and .P all simply start a new paragraph. (The /^\.P/ test on
	    # its own already matches .PP as well.)

	    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
	      {
	      &new_para();
	      }

	    # .SH is a section heading. The title may be enclosed in double quotes,
	    # which the pattern leaves out of $2. The quantifiers matter here:
	    # \s* allows any amount of white space and (.*) takes the whole title.

	    elsif (/^\.SH\s*("?)(.*)\1/)
	      {
	      # Ignore the NAME section
	      if ($2 =~ /^NAME\b/)
	        {
	        <STDIN>;         # Read and discard the line after the heading
	        next;
	        }

	      &end_para();
	      my($title) = &do_line($2);

	      # With -toc, the contents entry goes to STDOUT and the heading in the
	      # body becomes an anchor whose link always points back at #TOC1.
	      if ($toc)
	        {
	        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
	          $ref, $ref);
	        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
	          $ref);
	        $ref++;
	        }
	      else
	        {
	        print TEMP "<br><b>\n$title\n</b><br>\n";
	        }
	      }

	    # .SS is a subsection heading. It is always output as bold text and
	    # never gets a contents entry.

	    elsif (/^\.SS\s*("?)(.*)\1/)
	      {
	      &end_para();
	      my($title) = &do_line($2);
	      print TEMP "<br><b>\n$title\n</b><br>\n";
	      }

	    # .B and .I output the rest of the line in bold or italic, starting a
	    # paragraph if none is open. Pairs of double quotes are removed.

	    elsif (/^\.B\s*(.*)/)
	      {
	      &new_para() if (!$inpara);
	      $_ = &do_line($1);
	      s/"(.*?)"/$1/g;
	      print TEMP "<b>$_</b>\n";
	      $wrotetext = 1;
	      }
	    elsif (/^\.I\s*(.*)/)
	      {
	      &new_para() if (!$inpara);
	      $_ = &do_line($1);
	      s/"(.*?)"/$1/g;
	      print TEMP "<i>$_</i>\n";
	      $wrotetext = 1;
	      }

	    # A comment that starts "HREF" takes the next line as a name that
	    # is turned into a hyperlink, using the text given, which might be
	    # in a special font. If it ends in () or (digits) or punctuation, they
	    # aren't part of the link.

	    elsif (/^\.\\"\s*HREF/)
	      {
	      $_=<STDIN>;
	      chomp;
	      $_ = &do_line($_);
	      $_ =~ s/\s+$//;

	      # $1 becomes the link target: the text without any one-character tag
	      # around it, a trailing "()" or "(digits)", or final punctuation.
	      # NOTE(review): the result of this match is not checked, so a line
	      # that does not fit the pattern leaves $1 unset by this statement.
	      $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
	      print TEMP "<a href=\"$1.html\">$_</a>\n";
	      }

	    # A comment that starts "HTML" inserts literal HTML

	    elsif (/^\.\\"\s*HTML\s*(.*)/)
	      {
	      print TEMP $1;
	      }

	    # A comment that starts < inserts that HTML at the end of the
	    # *next* input line - so as not to get a newline between them.

	    elsif (/^\.\\"\s*(<.*>)/)
	      {
	      my($markup) = $1;
	      $_=<STDIN>;
	      chomp;
	      $_ = &do_line($_);
	      $_ =~ s/\s+$//;
	      print TEMP "$_$markup\n";
	      }

	    # A comment that starts JOIN joins the next two lines together, with one
	    # space between them. Then that line is processed. This is used in some
	    # displays where two lines are needed for the "man" version. JOINSH works
	    # the same, except that it assumes this is a shell command, so removes
	    # continuation backslashes.

	    elsif (/^\.\\"\s*JOIN(SH)?/)
	      {
	      my($one,$two);
	      $one = <STDIN>;
	      $two = <STDIN>;

	      # $1 is the optional "SH" captured by the pattern above, so it is
	      # defined only for JOINSH.
	      $one =~ s/\s*\\e\s*$// if (defined($1));
	      chomp($one);
	      $two =~ s/^\s+//;
	      $_ = "$one $two";
	      redo;            # Process the joined lines
	      }

	    # .EX/.EE are used in the pcre2demo page to bracket the entire program.
	    # It is copied after a <PRE> tag with each "\e" turned back into a
	    # backslash and with &, < and > replaced by HTML entities. The
	    # ampersand must be handled first, so that the entities produced for
	    # < and > are not escaped again.
	    # NOTE(review): no matching </PRE> is written when .EE is reached.

	    elsif (/^\.EX\s*$/)
	      {
	      print TEMP "<PRE>\n";
	      while (<STDIN>)
	        {
	        last if /^\.EE\s*$/;
	        s/\\e/\\/g;
	        s/&/&amp;/g;
	        s/</&lt;/g;
	        s/>/&gt;/g;
	        print TEMP;
	        }
	      }

	    # Ignore anything not recognized

	    next;
	    }

	  # Line does not begin with a dot. Replace blank lines with new paragraphs

	  if (/^\s*$/)
	    {
	    &end_para() if ($wrotetext);
	    next;
	    }

	  # Convert fonts changes and output an ordinary line. Ensure that indented
	  # lines are marked as literal.

	  $_ = &do_line($_);
	  &new_para() if (!$inpara);

	  # A line starting with white space opens a <pre> section if none is open;
	  # the first line that does not start with white space closes it again.

	  if (/^\s/)
	    {
	    if (!$inpre)
	      {
	      print TEMP "<pre>\n";
	      $inpre = 1;
	      }
	    }
	  elsif ($inpre)
	    {
	    print TEMP "</pre>\n";
	    $inpre = 0;
	    }

	  # Add <br> to the end of a non-literal line if we are within .nf/.fi

	  $_ .= "<br>\n" if (!$inpre && $innf);

	  print TEMP;
	  $wrotetext = 1;
	  }

	# The TOC, if present, will have been written - terminate it

	print "</ul>\n" if ($toc);

	# Copy the remainder to the standard output. The body was collected in the
	# temporary file, so finish writing it, read it back, and follow it with
	# the closing link. The close of the write handle is checked because
	# buffered write errors only show up at that point. The "or" operator is
	# written as a plain ||; the backslash-escaped form is not valid Perl.

	close(TEMP) || die "Can't finish writing /tmp/$$: $!\n";
	open(TEMP, '<', "/tmp/$$") || die "Can't open /tmp/$$ for input: $!\n";

	print while (<TEMP>);

	print <<End ;
	<p>
	Return to the <a href="index.html">PCRE2 index page</a>.
	</p>
	End

	# The temporary file is no longer needed once it has been copied.

	close(TEMP);
	unlink("/tmp/$$");

	# End