# This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # # Wrapped by Ti Lian Hwang <tilh@mars> on Thu Jul 6 15:14:10 1995 # # This archive contains: # dummy.pl h2a_pcl.pl parse-html.pl html-ascii.pl # LANG=""; export LANG PATH=/bin:/usr/bin:$PATH; export PATH echo x - dummy.pl cat >dummy.pl <<'@EOF' # put your drivers in a file # 'h2a_$TERM.pl' where $TERM is the environment variable # sub html_begin_doc {} sub html_end_doc {} sub begin_font {} sub end_font {} 1; @EOF chmod 664 dummy.pl echo x - h2a_pcl.pl cat >h2a_pcl.pl <<'@EOF' # # HP-PCL font functions to be used by html-to-ascii.pl # "required" by parse-html" # # Author : Ti Lian Hwang # email : tilh@sin-ro.sin-ro.dhl.com # # # fixed spacing, 12 pitch, 12 points, Times Roman $normal_font = "\033(s0p12h12v3T"; sub html_begin_doc { # reset, CR=CR,LF=LF+CR,FF=FF+CR print "\033E\033&k2G"; } sub html_end_doc { print "\033E"; } $Begin{"B"} = "begin_font"; $End{"B"} = "end_font"; $Begin{"I"} = "begin_font"; $End{"I"} = "end_font"; $Begin{"U"} = "begin_font"; $End{"U"} = "end_font"; $Begin{"EM"} = "begin_font"; $End{"EM"} = "end_font"; %font_begin = ( "PRE","\033(s0p12h3T", "H1","\033(s1p20v5t3b", "H2","\033(s1p18v5t3b", "H3","\033(s1p16v5t3B", "H4","\033(s1p14v5t3B", "H5","\033(s1p12v5t3B", "H6","\033(s1p12v5t3B", "B","\033(s3B", "U","\033&dD", "I","\033(s1S", "EM","\033(s3B" ); %font_end = ( "PRE",$normal_font, "H1",$normal_font . "\033(s0B", "H2",$normal_font . "\033(s0B", "H3",$normal_font . "\033(s0B", "H4",$normal_font . "\033(s0B", "H5",$normal_font . "\033(s0B", "H6",$normal_font . "\033(s0B", "B","\033(s0B", "U","\033&d@", "I","\033(s0S", "EM","\033(s0B" ); sub begin_font { local ($element, $tag) = @_; print $font_begin{$element}; } sub end_font { local ($element, $tag) = @_; print $font_end{$element}; } 1; @EOF chmod 664 h2a_pcl.pl echo x - parse-html.pl sed 's/^@//' >parse-html.pl <<'@EOF' # HTML parser # Jim Davis, July 15 1994 # This is an HTML parser not an SGML parser. It does not parse a DTD, # The DTD is implicit in the code, and specific to HTML. # The processing of the HTML can be customized by the user by # 1) Defining routines to be called for various tags (see Begin and End arrays) # 2) Defining routines html_content and html_whitespace # This is not a validating parser. It does not check the content model # eg you can use DT outside a DL and it won't know. It is too liberal in # what tags are allowed to minimize what other tags. # Bugs - can't parse the prolog or whatever you call it # <!DOCTYPE HTML [ # <!entity % HTML.Minimal "INCLUDE"> # <!-- Include standard HTML DTD --> # <!ENTITY % html PUBLIC "-//connolly hal.com//DTD WWW HTML 1.8//EN"> # %html; # ]> # modified 3 Aug to add a bunch of HTML 2.0 tags # modified 3 Sept to print HTML stack to STDERR not STDOUT, to add new # routines html_begin_doc and html_end_doc for application specific cleanup # and to break parse_html into two pieces. # modified 30 Sept 94. parse_attributes now handles tag attributes that # don't have values. thanks to Bill Simpson-Young <bill@syd.dit.csiro.au> # for the code. # modified 17 Apr 95 to support FORMS tags. # # modified 6 July 1995 by Ti Lian Hwang <tilh@sin-co.sin-ro.dhl.com> # to handle 'printer drivers' - files to be 'require' depending on $TERM # files are to have name of 'h2a_$TERM.pl' # $filename = "h2a_" . $ENV{"TERM"} . ".pl"; foreach $prefix (@INC) { $realfilename = "$prefix/$filename"; if (-f $realfilename) { require "$filename" ; $filename = ""; last; } } if ($filename) {require "dummy.pl" ; } $debug = 0; $whitespace_significant = 0; # global variables: # $line_buffer is line buffer # $line_count is input line number. $line_buffer = ""; $line_count = 0; sub parse_html { local ($file) = @_; open (HTML, $file) || die "Could not open $file: $!\nStopped"; &parse_html_stream (); close (HTML);} # Global input HTML is the handle to the stream of HTML sub parse_html_stream { local ($token, $new); ## initialization @stack=(); $line_count = 0; $line_buffer = ""; ## application specific initialization &html_begin_doc(); main: while (1) { # if whitespace does not matter, trim any leading space. if (! $whitespace_significant) { $line_buffer =~ s/^\s+//;} # now dispatch on the type of token if ($line_buffer =~ /^(\s+)/) { $token = $1; $line_buffer = $'; &html_whitespace ($token);} # This will lose if there is more than one comment on the line! elsif ($line_buffer =~ /^(\<!--.*-->)/) { $token = $1; $line_buffer = $'; &html_comment ($token);} elsif ($line_buffer =~ /^(\<![^-][^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_comment ($token);} elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_etag ($token);} elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_tag ($token);} elsif ($line_buffer =~ /^([^\s<]+)/) { $token = $1; $line_buffer = $'; $token = &substitute_entities($token); &html_content ($token); } else { # No valid token in buffer. Maybe it's empty, or maybe there's an # incomplete tag. So get some more data. $new = <HTML>; if (! defined ($new)) {last main;} # if we're trying to find a match for a tag, then get rid of embedded newline # this is, I think, a kludge if ($line_buffer =~ /^\</ && $line_buffer =~ /\n$/) { chop $line_buffer; $line_buffer .= " ";} $line_buffer .= $new; $line_count++;} } ## cleanup &html_end_doc(); if ($#stack > -1) { print STDERR "Stack not empty at end of document\n"; &print_html_stack();} } sub html_tag { local ($tag) = @_; local ($element) = &tag_element ($tag); local (%attributes) = &tag_attributes ($tag); # the tag might minimize (be an implicit end) for the previous tag local ($prev_element); while (&Minimizes(&stack_top_element(), $element)) { $prev_element = &stack_pop_element (); if ($debug) { print STDERR "MINIMIZING $prev_element with $element on $line_count\n";} &html_end ($prev_element, 0);} push (@stack, $tag); &html_begin ($element, $tag, *attributes); if (&Empty($element)) { pop(@stack); &html_end ($element, 0);} } sub html_etag { local ($tag) = @_; local ($element) = &tag_element ($tag); # pop stack until find matching tag. This is probably a bad idea, # or at least too general. local ( $prev_element) = &stack_pop_element(); until ($prev_element eq $element) { if ($debug) { print STDERR "MINIMIZING $prev_element with /$element on $line_count \n";} &html_end ($prev_element, 0); if ($#stack == -1) { print STDERR "No match found for /$element. You will lose\n"; last;} $prev_element = &stack_pop_element();} &html_end ($element, 1); } # For each element, the names of elements which minimize it. # This is of course totally HTML dependent and probably I have it wrong too $Minimize{"DT"} = "DT:DD"; $Minimize{"DD"} = "DT"; $Minimize{"LI"} = "LI"; $Minimize{"P"} = "P:DT:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL:DL"; # Does element E2 minimize E1? sub Minimizes { local ($e1, $e2) = @_; local ($value) = 0; foreach $elt (split (":", $Minimize{$e1})) { if ($elt eq $e2) {$value = 1;}} $value;} $Empty{"BASE"} = 1; $Empty{"BR"} = 1; $Empty{"HR"} = 1; $Empty{"IMG"} = 1; $Empty{"ISINDEX"} = 1; $Empty{"LINK"} = 1; $Empty{"META"} = 1; $Empty{"NEXTID"} = 1; $Empty{"INPUT"} = 1; # Empty tags have no content and hence no end tags sub Empty { local ($element) = @_; $Empty{$element};} sub print_html_stack { print STDERR "\n ==\n"; foreach $elt (reverse @stack) {print STDERR " $elt\n";} print STDERR " ==========\n";} # The element on top of stack, if any. sub stack_top_element { if ($#stack >= 0) { &tag_element ($stack[$#stack]);}} sub stack_pop_element { &tag_element (pop (@stack));} # The element from the tag, normalized. sub tag_element { local ($tag) = @_; $tag =~ /<\/?([^\s>]+)/; local ($element) = $1; $element =~ tr/a-z/A-Z/; $element;} # associative array of the attributes of a tag. sub tag_attributes { local ($tag) = @_; $tag =~ /^<[A-Za-z]+ +(.*)>$/; &parse_attributes($1);} # string should be something like # KEY="value" KEY2="longer value" KEY3="tags o doom" # output is an associative array (like a lisp property list) # attributes names are not case sensitive, do I downcase them # Maybe (probably) I should substitute for entities when parsing attributes. sub parse_attributes { local ($string) = @_; local (%attributes); local ($name, $val); get: while (1) { if ($string =~ /^ *([A-Za-z]+)=\"([^\"]*)\"/) { $name = $1; $val = $2; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val; } elsif ($string =~ /^ *([A-Za-z]+)=(\S*)/) { $name = $1; $val = $2; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val;} elsif ($string =~ /^ *([A-Za-z]+)/) { $name = $1; $val = ""; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val;} else {last;}} %attributes;} sub substitute_entities { local ($string) = @_; $string =~ s/&/&/g; $string =~ s/</</g; $string =~ s/>/>/g; $string =~ s/"/\"/g; $string;} @@HTML_elements = ( "A", "ADDRESS", "B", "BASE", "BLINK", # Netscape addition :-( "BLOCKQUOTE", "BODY", "BR", "CITE", "CENTER", # Netscape addition :-( "CODE", "DD", "DIR", "DFN", "DL", "DT", "EM", "FORM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR", "HTML", "I", "ISINDEX", "IMG", "INPUT", "KBD", "LI", "LINK", "MENU", "META", "NEXTID", "OL", "OPTION", "P", "PRE", "SAMP", "SELECT", "STRIKE", "STRONG", "TITLE", "TEXTAREA", "TT", "U", "UL", "VAR", ); sub define_element { local ($element) = @_; $Begin{$element} = "Noop"; $End{$element} = "Noop";} foreach $element (@HTML_elements) {&define_element($element);} # do nothing sub Noop { local ($element, $xxx) = @_;} # called when a tag begins. Dispatches using Begin sub html_begin { local ($element, $tag, *attributes) = @_; local ($routine) = $Begin{$element}; if ($routine eq "") { print STDERR "Unknown HTML element $element ($tag) on line $line_count\n";} else { &begin_font ($element, $explicit); eval "&$routine;"} } # called when a tag ends. Explicit is 0 if tag end is because of minimization # not that you should care. sub html_end { local ($element, $explicit) = @_; local ($routine) = $End{$element}; if ($routine eq "") { print STDERR "Unknown HTML element \"$element\" (END $explicit) on line $line_count\n";} else { eval "&$routine(\"$element\", $explicit)"; &end_font ($element, $explicit); } } sub html_content { local ($word) = @_; } sub html_whitespace { local ($whitespace) = @_;} sub html_comment { local ($tag) = @_;} # redefine these for application-specific initialization and cleanup # sub html_begin_doc {} # sub html_end_doc {} # return a "true value" when loaded by perl. 1; @EOF chmod 644 parse-html.pl echo x - html-ascii.pl sed 's/^@//' >html-ascii.pl <<'@EOF' # Routines for HTML to ASCII. # (fixed width font, no font changes for size, bold, etc) with a little # BUGS AND MISSING FEATURES # font tags (e.g. CODE, EM) cause an extra whitespace # e.g. <TT>foo</TT>, -> foo , # Jim Davis July 15 1994 # modified 3 Aug 94 to support MENU and DIR require "tformat.pl" || die "Could not load tformat.pl: $@\nStopped"; # Can be set by command line arg if (! defined($columns_per_line)) { $columns_per_line = 72;} if (! defined($flush_last_page)) { $flush_last_page = 1;} # amount to add to indentation $indent_left = 5; $indent_right = 5; # ignore contents inside HEAD. $ignore_text = 0; # Set variables in tformat $left_margin = 1; $right_margin = $columns_per_line; $bottom_margin = 0; ## Routines called by html.pl $Begin{"HEAD"} = "begin_head"; $End{"HEAD"} = "end_head"; sub begin_head { local ($element, $tag) = @_; $ignore_text = 1;} sub end_head { local ($element) = @_; $ignore_text = 0;} $Begin{"BODY"} = "begin_document"; sub begin_document { local ($element, $tag) = @_; &start_page();} $End{"BODY"} = "end_document"; sub end_document { local ($element) = @_; &fresh_line();} ## Headers $Begin{"H1"} = "begin_header"; $End{"H1"} = "end_header"; $Begin{"H2"} = "begin_header"; $End{"H2"} = "end_header"; $Begin{"H3"} = "begin_header"; $End{"H3"} = "end_header"; $Begin{"H4"} = "begin_header"; $End{"H4"} = "end_header"; $Skip_Before{"H1"} = 1; $Skip_After{"H1"} = 1; $Skip_Before{"H2"} = 1; $Skip_After{"H2"} = 1; $Skip_Before{"H3"} = 1; $Skip_After{"H3"} = 0; sub begin_header { local ($element, $tag) = @_; &skip_n_lines ($Skip_Before{$element}, 5);} sub end_header { local ($element) = @_; &skip_n_lines ($Skip_After{$element});} $Begin{"BR"} = "line_break"; sub line_break { local ($element, $tag) = @_; &fresh_line();} $Begin{"P"} = "begin_paragraph"; # if fewer than this many lines left on page, start new page $widow_cutoff = 5; sub begin_paragraph { local ($element, $tag) = @_; &skip_n_lines (1, $widow_cutoff);} $Begin{"BLOCKQUOTE"} = "begin_blockquote"; $End{"BLOCKQUOTE"} = "end_blockquote"; sub begin_blockquote { local ($element, $tag) = @_; $left_margin += $indent_left; $right_margin = $columns_per_line - $indent_right; &skip_n_lines (1);} sub end_blockquote { local ($element) = @_; $left_margin -= $indent_left; $right_margin = $columns_per_line; &skip_n_lines (1);} $Begin{"PRE"} = "begin_pre"; $End{"PRE"} = "end_pre"; sub begin_pre { local ($element, $tag) = @_; $whitespace_significant = 1;} sub end_pre { local ($element) = @_; $whitespace_significant = 0;} $Begin{"INPUT"} = "form_input"; sub form_input { local ($element, $tag, *attributes) = @_; if ($attributes{"value"} ne "") { &print_word_wrap($attributes{"value"});}} $Begin{"HR"} = "horizontal_rule"; sub horizontal_rule { local ($element, $tag) = @_; &fresh_line (); &print_n_chars ($right_margin - $left_margin, "-");} # Add code for IMG (use ALT attribute) # Ignore I, B, EM, TT, CODE (no font changes) ## List environments $Begin{"UL"} = "begin_itemize"; $End{"UL"} = "end_list_env"; $Begin{"OL"} = "begin_enumerated"; $End{"OL"} = "end_list_env"; $Begin{"MENU"} = "begin_menu"; $End{"MENU"} = "end_list_env"; $Begin{"DIR"} = "begin_dir"; $End{"DIR"} = "end_list_env"; $Begin{"LI"} = "begin_list_item"; @@list_stack = (); $list_type = "bullet"; $list_counter = 0; sub push_list_env { push (@list_stack, join (":", $list_type, $list_counter));} sub pop_list_env { ($list_type, $list_counter) = split (":", pop (@list_stack)); $left_margin -= $indent_left;} sub begin_itemize { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_menu { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_dir { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_enumerated { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "enumerated"; $list_counter = 1;} sub end_list_env { local ($element) = @_; &pop_list_env(); # &fresh_line(); } sub begin_list_item { local ($element, $tag) = @_; $left_margin -= 2; &fresh_line(); &print_word_wrap("$list_counter "); if ($list_type eq "enumerated") {$list_counter++;} $left_margin += 2;} $Begin{"DL"} = "begin_dl"; sub begin_dl { local ($element, $tag) = @_; &skip_n_lines(1,5);} $Begin{"DT"} = "begin_defined_term"; $Begin{"DD"} = "begin_defined_definition"; $End{"DD"} = "end_defined_definition"; sub begin_defined_term { local ($element, $tag) = @_; &fresh_line();} sub begin_defined_definition { local ($element, $tag) = @_; $left_margin += $indent_left; &fresh_line();} sub end_defined_definition { local ($element) = @_; $left_margin -= $indent_left; &fresh_line();} $Begin{"META"} = "begin_meta"; # a META tag sets a value in the assoc array %Variable # i.e. <META name="author" content="Rushdie"> sers $Variable{author} to "Rushdie" sub begin_meta { local ($element, $tag, *attributes) = @_; local ($variable, $value); $variable = $attributes{name}; $value = $attributes{content}; $Variable{$variable} = $value;} $Begin{"IMG"} = "begin_img"; sub begin_img { local ($element, $tag, *attributes) = @_; &print_word_wrap (($attributes{"alt"} ne "") ? $attributes{"alt"} : "[IMAGE]");} # Content and whitespace. sub html_content { local ($string) = @_; unless ($ignore_text) { &print_word_wrap ($string);}} sub html_whitespace { local ($string) = @_; if (! $whitespace_significant) { die "Internal error, called html_whitespace when whitespace was not significant";} local ($i); for ($i = 0; $i < length ($string); $i++) { &print_whitespace (substr($string,$i,1));}} # called by tformat. Do nothing. sub do_footer { } sub do_header { } 1; @EOF chmod 644 html-ascii.pl exit 0
file: /Techref/language/pcl/html2pcl.pl.htm, 18KB, , updated: 2001/3/30 08:53, local time: 2025/1/20 09:51,
3.145.9.174:LOG IN
|
©2025 These pages are served without commercial sponsorship. (No popup ads, etc...).Bandwidth abuse increases hosting cost forcing sponsorship or shutdown. This server aggressively defends against automated copying for any reason including offline viewing, duplication, etc... Please respect this requirement and DO NOT RIP THIS SITE. Questions? <A HREF="http://sxlist.com/techref/language/pcl/html2pcl.pl.htm"> language pcl html2pcl</A> |
Did you find what you needed? |
Welcome to sxlist.com!sales, advertizing, & kind contributors just like you! Please don't rip/copy (here's why Copies of the site on CD are available at minimal cost. |
Welcome to sxlist.com! |
.