#!/usr/bin/perl -CDSAL

use strict;
use warnings;
use utf8;

# Everything that may follow a tag name inside a tag: any mix of double-quoted
# strings, single-quoted strings and runs of characters that are neither a
# quote nor ">".  Quoted strings may therefore contain ">".
my $tagpartre= qr!(?:"[^"]*"|'[^']*'|[^"'>]+)*!;
# The start of a tag without its closing ">": "<", an optional "/" or "!",
# optional whitespace, the tag name and the tag part.  Anchored to the end of
# a line, it detects a tag that is still open there.
my $anystartre= qr!<[\/\!]?\s*\w+\b$tagpartre!;
# One complete tag, including the closing ">".
my $anytagre= qr!$anystartre>!;
# Names of the tags that are retained (without their attributes); matched
# case-insensitively.  "h\d" accepts any single digit after the "h".  em and
# strong are listed here but are rewritten to i and b before this is consulted.
my $keeptagre= qr!\b(?:html|head|title|body|b|i|u|s|em|strong|tt|pre|h\d|p|br|hr|ul|ol|li|dl|dt|dd|table|tr|th|td)\b!i;
# Names of inline formatting tags whose open/close pair is dropped when it
# encloses nothing at all.
my $deletenocontre= qr!\b(?:b|i|u|s|em|strong|tt)\b!i;
# Names of the tags that are removed together with everything they enclose.
my $deletetagcontre= qr!\b(?:script|style)\b!i;

# Print the usage text and exit when the first argument is -h, --h, -help or
# --help (case-insensitive).
if( @ARGV && $ARGV[0] =~ /^--?h(?:elp)?$/i ) {
    # Build a readable, line-wrapped list of the retained tag names from the
    # alternation inside $keeptagre, rather than printing the compiled regex
    # itself (which would show up as "(?^i:\b(?:html|...)\b)").
    my( $keptlist )= "$keeptagre" =~ m!\(\?:([^()]*)\)!;
    # Fall back to the raw regex text should its layout ever change.
    $keptlist= "$keeptagre" unless defined $keptlist;
    $keptlist =~ s!\|! !g;
    $keptlist =~ s!h\\d!h1-h6!;
    # Wrap at word boundaries so that no line exceeds 76 characters.
    $keptlist =~ s!(.{1,76})(?:\s+|\z)!$1\n!g;
    chomp $keptlist;
    print <<EOF;
usage: htmlsimplify.pl [ -a ] [ -i ] [ <file.html> ]
Removes tags unnecessary in textual HTML pages, and all tag attributes.  With
-a, <a ..> hyperlinks are kept; with -i, <img..> tags.  Retained tags without
content (currently <br> and <hr>) are transformed to XHTML.  The following
tags are retained:
$keptlist
<meta> tags giving the charset
<em> is transformed to <i>, <strong> to <b>
EOF
    exit;
}

# Command-line switches: -a retains <a> hyperlinks, -i retains <img> tags.
my $keephyperrefs= 0;
my $keepimgs= 0;

# Maps each recognised switch to the flag it turns on.
my %optionflag= (
    "-a" => \$keephyperrefs,
    "-i" => \$keepimgs,
);

# Consume leading switches only; the first argument that is not a known switch
# (normally the file name) ends option processing and stays in @ARGV.
while( @ARGV && exists $optionflag{ $ARGV[0] } ) {
    my $switch= shift @ARGV;
    ${ $optionflag{$switch} }= 1;
}

# Name of a <script>/<style> element whose closing tag has not been seen yet;
# empty when no such element is open.
my $pendingdelete= "";
# Charset announced by a <meta> tag, if any.  Recorded but not used now.
my $charset;

# Returns the replacement text for one complete tag (as matched by $anytagre):
#   - <meta> tags mentioning a charset are returned unchanged,
#   - <em>/<strong> become <i>/<b>,
#   - tags named in $keeptagre lose their attributes; br and hr are written in
#     the self-closing XHTML form,
#   - <a>/<img> are returned unchanged when -a / -i was given,
#   - every other tag is replaced by the empty string.
# Tag names are compared case-insensitively throughout.
sub simplifytag {
    # Copy first: the argument is $1 of the caller's match, which the
    # matches below would overwrite.
    my( $tag )= @_;

    if( $tag =~ m!^<\s*meta\b.*\bcharset\b!is ) {
        $charset= $1 if $tag =~ m!\bcharset="?([-\w.+:()]+)!i;
        # not used now; char class from iconv -l | chhist.pl
        return $tag;
    }
    if( $tag =~ m!^<(/?)\s*(em|strong)\b!i ) {
        return lc($2) eq "em" ? "<${1}i>" : "<${1}b>";
    }
    if( $tag =~ m!^<(/?)\s*($keeptagre)! ) {
        my( $slash, $name )= ( $1, $2 );
        return "<" . lc($name) . "/>" if $name =~ m!^(?:br|hr)$!i;
        return "<$slash$name>";
    }
    # Anchored at the tag start so that "<a" or "<img" inside an attribute
    # value of some other tag does not count.
    return $tag if $keephyperrefs && $tag =~ m!^</?\s*a\b!i;
    return $tag if $keepimgs && $tag =~ m!^<\s*img\b!i;
    return "";
}

# Set once <> has reported the end of all input.  Calling <> again after that
# would start a new pass over @ARGV and, @ARGV being empty by then, read STDIN.
my $inputdone= 0;

while( !$inputdone && defined(my $line= <>) ) {
    my $content= "";
    # A tag still open at the end of the line continues on the next one: glue
    # lines together until the tag is closed or the input ends.
    while( $line =~ m!$anystartre$! ) {
        $content .= $line;
        $line= <>;
        unless( defined $line ) {
            $inputdone= 1;
            $line= "";
        }
    }
    $content .= $line;

    # Inside an unfinished <script>/<style>: drop everything up to and
    # including its closing tag, or the whole chunk if it is not closed here.
    if( $pendingdelete ) {
        next unless $content =~ s!^.*?</\s*\Q$pendingdelete\E\s*>!!is;
        $pendingdelete= "";
    }

    $content =~ s!\&nbsp;! !gi;
    # Comments and script/style elements that are complete within this chunk.
    # /s lets them span the line breaks of a glued-together chunk; comments
    # that extend beyond the chunk are not recognised.
    $content =~ s!<\!--.*?-->!!gs;
    $content =~ s!<($deletetagcontre)$tagpartre>.*?</\s*\1\s*>!!gis;
    # A script/style element opened here but not closed: cut the rest of the
    # chunk (keeping its final newline) and remember the name to look for.
    if( $content =~ s!<($deletetagcontre)$tagpartre>.*?(\n?)\z!$2!s ) {
        $pendingdelete= $1;
    }

    # Rewrite every remaining tag in place.
    $content =~ s!($anytagre)!simplifytag($1)!ge;
    # Remove formatting pairs left without content; repeat because removing an
    # inner pair can empty the pair around it.
    1 while $content =~ s!<($deletenocontre)></\1>!!ig;

    print $content unless $content =~ /^\s*$/;
}

# note: possible to decode with Encode::decode(), encode as ASCII entities with
# HTML::Entities::encode_entities(), but need to set input stream to binary and
# can't just use <>

