#!/usr/bin/perl -w

#################################################################################
# 
# Web Secretary Ver 1.7.0
#
# Retrieves a list of web pages and sends the pages via email to
# a designated recipient. It can optionally compare the page with a
# previously retrieved page, highlight the differences and send the
# modified page to the recipient instead.
#
# Copyright (C) 1998  Chew Wei Yih
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#################################################################################

my $feature_compress = 1;

use LWP::UserAgent;
# Detect optional gzip support.  The previous form, eval { "load ..." },
# only evaluated a string literal (always true), so $feature_compress was
# never cleared when the module was absent and a later call to
# Compress::Zlib::memGunzip() would die.  require() really loads it.
eval { require Compress::Zlib; 1 } or $feature_compress = 0;
use POSIX qw(strftime);
use File::Spec;
use Getopt::Long;
use Pod::Usage;

# Print introduction
print "Web Secretary Ver 1.7.0\n";
print "By Chew Wei Yih Copyleft (c) 1998\n\n";

# Initialize parameters: prefer a url.list in the current directory,
# otherwise fall back to the per-user configuration directory.
$base = ( -e "url.list" ) ? "." : $ENV{HOME} . "/.websec";
$help = 0;
$man  = 0;

# Parse command line options
GetOptions(
    "help|?" => \$help,
    "man"    => \$man,
    "base=s" => \$base
);
pod2usage(1) if $help;
pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;

# Normalize the base directory: drop a trailing slash, it is re-added
# wherever a path is built from it.
$base =~ s{/$}{};

# Prepare pathnames, creating the data directories on first run.
$archive = "$base/archive";
for my $dir ( $base, $archive ) {
    mkdir $dir, 0750 unless -d $dir;
}
$outgoing     = "$base/index.html";
$page_current = "$base/retrieve.html";

# Prefer a webdiff binary living next to this script, which lets the
# archive be unpacked and used in place; otherwise rely on $PATH.
use FindBin;
$webdiffbin = "$FindBin::Bin/webdiff";
$webdiffbin = "webdiff" unless -e $webdiffbin;

# Digest mode: URLs whose pages changed are queued here and mailed as a
# single summary message at the end of the run.
@digest = ();

# Default values for every per-site keyword.  A block in url.list that
# carries no URL= line replaces these defaults (see HandleSite).
# NOTE(review): `local` on a file-scoped package variable acts like a plain
# assignment here; left as-is.
local %defaults = (
    URL        => "",
    Auth       => "none",
    Name       => "",
    Prefix     => "",
    Diff       => "webdiff",
    Hicolor    => "blue",
    AsciiMarker => 0,
    Ignore     => "none",
    IgnoreURL  => "none",
    Email      => "",
    EmailLink  => "",
    EmailError => 1,
    Program    => "",
    Proxy      => "",
    ProxyAuth  => "none",
    Randomwait => 0,
    Retry      => 3,
    Retrywait  => 0,
    Timeout    => 20,
    Tmin       => 0,
    Tmax       => 99999,
    AddSubject => "",
    Digest     => "false",
    UserAgent  => "WebSec/1.7",
    DateFMT    => " - %d %B %Y (%a)",
    MailFrom   => "",
);
%siteinfo = %defaults;

# Default return code
$rc = 0;

# With no file arguments, feed $base/url.list to the <> loop below by
# opening the magic ARGV handle directly (2-arg open; if it fails, <>
# silently falls back to standard input).
open ARGV, $base . "/url.list" unless exists $ARGV[0];

# Loop through input file and process all sites listed
while (<>) {
    # NOTE(review): chop removes the last character unconditionally, so a
    # final line lacking a newline loses a character; chomp would be safer.
    chop $_;
    s/^\s*//;
    s/\s*$//;

    # Ignore comments
    next if (m/^#/);
    # Stop with a finish marker
    last if (m/^__END__/);

    # Non-empty lines are "Keyword = value" settings for the current block
    if ( length != 0 ) {
        $rc = &HandleInput();
        if ( $rc != 0 ) { last; }
        next;
    }

    # A blank line ends the current block: process the site it describes,
    # then start the next block from the defaults.
    $rc = &HandleSite();
    if ( $rc != 0 ) { last; }
    %siteinfo = %defaults;
}

# Process last site if available (url.list may not end with a blank line)
if ( $rc == 0 && $siteinfo{URL} ne "" ) { $rc = &HandleSite(); }

# Delete temp files
unlink($outgoing);
unlink($page_current);

# Mail the digest summary if any site used digest mode.
# $addsubject, $today and $digestEmail are globals set inside HandleSite.
if (@digest) {
    $linkmsg =
      "The contents of the following URLs have changed:\n\n"
      . join ( "\n", @digest ) . "\n";
    $subj = "$addsubject$today";
    &MailMessage( $linkmsg, $subj, $digestEmail, $siteinfo{MailFrom} );
}

# End of main program
exit $rc;

# Handle setting of parameters.
# Splits the current input line ($_) on the first '=' into a keyword and a
# value, validates the keyword against the %defaults table, expands ${NAME}
# environment references in the value, and stores the pair in %siteinfo.
# Params: none (reads $_ and $.)
# Returns: 0 => OK, -1 => unrecognized keyword
sub HandleInput() {

    # Split into keyword and value, trimming whitespace and an optional
    # pair of surrounding double quotes on the value.
    my ( $key, $val ) = split /=/, $_, 2;
    $key =~ s/^\s*(.*?)\s*$/$1/;
    $val =~ s/^\s*\"?(.*?)\"?\s*$/$1/;

    # Reject keywords that have no entry in the defaults table.
    unless ( defined $defaults{$key} ) {
        print qq(Unrecognized keyword in line $.: "$_". Keyword="$key".\n);
        return -1;
    }

    # Expand ${NAME} references from the environment, one per iteration;
    # an undefined variable is a fatal configuration error.
    while ( $val =~ m/\${([^}]+)}/ ) {
        my $envname = $1;
        unless ( exists $ENV{$envname} ) {
            print STDERR "Used environment variable '$envname' but it is not defined, aborting.\n";
            exit 1;
        }
        $val =~ s/\${([^}]+)}/$ENV{$1}/;
    }

    $siteinfo{$key} = $val;
    return 0;
}

# Handle downloading, highlighting and mailing of each site.
# Uses the %siteinfo block assembled by HandleInput(): downloads the URL
# with retries (optionally through a proxy, with basic auth), diffs the
# page against the archived copy via webdiff when requested, then mails
# the highlighted page, a change link, or an error report.
# Params: none (reads the %siteinfo global)
# Returns: 0 => OK, -1 => Error
sub HandleSite() {

    # Get parameter values for this page
    $url        = $siteinfo{URL};
    $auth       = $siteinfo{Auth};
    $name       = $siteinfo{Name};
    $prefix     = $siteinfo{Prefix};
    $diff       = $siteinfo{Diff};
    $hicolor    = $siteinfo{Hicolor};
    $ignore     = $siteinfo{Ignore};
    $ignoreurl  = $siteinfo{IgnoreURL};
    $email      = $siteinfo{Email};
    $emailLink  = $siteinfo{EmailLink};
    $program    = $siteinfo{Program};
    $proxy      = $siteinfo{Proxy};
    $proxyAuth  = $siteinfo{ProxyAuth};
    $randomwait = $siteinfo{Randomwait};
    $retry      = $siteinfo{Retry};
    $retrywait  = $siteinfo{Retrywait};
    $timeout    = $siteinfo{Timeout};
    $tmin       = $siteinfo{Tmin};
    $tmax       = $siteinfo{Tmax};
    $addsubject = $siteinfo{AddSubject};
    $digest     = $siteinfo{Digest};
    $useragent  = $siteinfo{UserAgent};
    $datefmt    = $siteinfo{DateFMT};

    # Get today's date in the format we want.
    $today = strftime $datefmt, localtime;

    # If block without URL, assume parameter setting block and update default
    # values
    if ( $url eq "" ) {
        %defaults = %siteinfo;
        return 0;
    }

    # If essential parameters are not present, abort with error
    if ( $name eq ""
        || $prefix eq ""
        || ( $email eq "" && $emailLink eq "" && $program eq "" ) )
    {
        print "Name, prefix, program or email info missing from URL: $url.\n";
        return -1;
    }

    # Prepare for downloading this page
    print "Processing => $url ($name) ...\n";
    $pagebase            = "$archive/$prefix";
    $page_previous       = "$pagebase.html";
    $page_archive        = "$pagebase.old.html";
    # Cheap readability probe for the archived copy of this page.
    $page_previousExists = 1;
    open( FILE, $page_previous ) or $page_previousExists = 0;
    close(FILE);
    $subj    = "$addsubject $name$today - $url";
    $webdiff =
"$webdiffbin --basedir=$base --archive=$page_previous --current=$page_current --out=$outgoing "
      . "--hicolor=$hicolor --ignore=$ignore --ignoreurl=$ignoreurl --tmin=$tmin --tmax=$tmax";

    if ($siteinfo{AsciiMarker}) {
        $webdiff .= " --asciimarker";
    }

    # Download URL using LWP
    $ua = new LWP::UserAgent;
    $ua->agent($useragent);
    $ua->timeout($timeout);
    $ua->env_proxy;
    if ( $proxy ne "" ) { $ua->proxy( http => $proxy ); }
    $req = new HTTP::Request( 'GET', $url );
    if ( $auth ne "none" ) {
        $req->authorization_basic( split ( /:/, $auth, 2 ) );
    }

    if ( $proxyAuth ne "none" ) {
        $req->proxy_authorization_basic( split ( /:/, $proxyAuth, 2 ) );
    }
    #$req->push_header("Accept" => "text/html, text/plain, text/*, */*");
    # Only advertise gzip when Compress::Zlib is available to decompress it.
    my $compress_options = "identity";
    if ($feature_compress) {
        $compress_options = "gzip, $compress_options";
    }
    $req->push_header("Accept-Encoding" => $compress_options);

    # Try up to '$retry' times to download URL
    $counter = 0;
    srand;
    while ( $counter < $retry ) {
        $resp = $ua->request($req);
        if ( $resp->is_success ) { last; }
        else {
            $counter++;
            # Randomwait defeats servers that block on regular request
            # timing; otherwise wait the fixed Retrywait interval.
            if ( $randomwait > 0 ) {
                $random = int( rand $randomwait ) + 1;
                sleep $random;
            }
            else { sleep $retrywait; }
        }
    }

    # If URL is successfully downloaded
    if ( $resp->is_success ) {
        # Check if the data is gzip compressed, decompress if it is.
        if (($resp->content_encoding || "") =~ /gzip/) {
            my $new_content;

            if ($feature_compress) {
                $new_content = Compress::Zlib::memGunzip($resp->content);
            } else {
                # FIX: report the actual module name (was "Compress::Gzip").
                $new_content = "Server sent gzip compressed data, and we are missing Compress::Zlib";
            }
            if (defined $new_content) {
                $resp->content($new_content);
                $resp->content_length(length $new_content);
                $resp->content_encoding("");
            }
        }

        # Store the retrieved page, tagged with its base URL so relative
        # links keep working when the saved page is viewed from disk.
        # FIX: the open was previously unchecked; a failure would silently
        # print to a closed handle and corrupt the archive rotation.
        unless ( open( HTML_FILE, ">$page_current" ) ) {
            print "Unable to write $page_current: $!\n";
            return -1;
        }
        print HTML_FILE "<!-- X-URL: ", $resp->base, " -->\n";
        print HTML_FILE "<BASE HREF= \"", $resp->base . "\">\n";
        print HTML_FILE $resp->content;
        close HTML_FILE;

        if ( $diff eq "webdiff" ) {
            if ( $page_previousExists == 1 ) {
                print
"Highlighting differences from previous version of webpage ...\n";
                # webdiff exits non-zero when differences were found.
                $rc = system($webdiff);
                if ( $rc != 0 ) {
                    if ( $email ne "" ) {
                        print "Sending highlighted page to $email ...\n";
                        MailDocument( $outgoing, $subj, $email,
                            $siteinfo{MailFrom} );
                    }
                    if ( $emailLink ne "" ) {
                        print "Sending link to $emailLink ...\n";
                        if ( ( $digest ne "no" ) && ( $digest ne "false" ) ) {
                            # Digest mode: queue the URL for one summary
                            # mail at end of run; first EmailLink wins.
                            push @digest, $url;
                            ($digestEmail) or ( $digestEmail = $emailLink );
                        }
                        else {
                            my $filepath = File::Spec->rel2abs($page_previous);
                            $linkmsg =
"The contents of the following URL has changed:\n$url\n\nIt can also be found at:\nfile://$filepath\n";
                            MailMessage(
                                $linkmsg,   $subj,
                                $emailLink, $siteinfo{MailFrom}
                            );
                        }
                    }
                    if ( $program ne "" ) {
                        ShowDocument( $program, $outgoing );
                    }
                }
                else {
                    print "No changes were detected.\n";
                }
                # Rotate: previous -> .old, freshly retrieved -> previous.
                rename $page_previous, $page_archive;
                rename $page_current,  $page_previous;
            }
            else {
                print
                  "No previous version for this page. Storing in archive ...\n";
                rename $page_current, $page_previous;
            }
        }
        else {
            # No diffing requested: mail the raw page and rotate archives.
            if ( $email ne "" ) {
                MailDocument( $page_current, $subj, $email,
                    $siteinfo{MailFrom} );
            }
            if ($page_previousExists) { rename $page_previous, $page_archive; }
            rename $page_current, $page_previous;
        }
    }

    # If unable to download URL
    else {
        print "Unable to retrieve page.\n";
        $errmsg =
          "Unable to retrieve $name ($url).\n\n"
          . "Detailed error as follows:\n"
          . $resp->error_as_HTML;

        if ( $email ne "" && $siteinfo{EmailError} ) {
            MailMessage( $errmsg, $subj, $email, $siteinfo{MailFrom} );
        }
        if ( $emailLink ne "" && $siteinfo{EmailError} ) {
            if ( ( $digest ne "no" ) && ( $digest ne "false" ) ) {
                push @digest, "Unable to retrieve: $url";
                ($digestEmail) or ( $digestEmail = $emailLink );
            }
            else {
                MailMessage( $errmsg, $subj, $emailLink, $siteinfo{MailFrom} );
            }
        }
    }

    return 0;
}

# Mail message.
# Sends a plain-text message to each comma-separated recipient by POSTing
# to a mailto: URL through LWP.
# Params: message, subject, recipients (comma separated), from address
# Returns: none
sub MailMessage() {
    my ( $message, $subject, $rcptlist, $from ) = @_;

    for my $addr ( split /,/, $rcptlist ) {
        my $mail = HTTP::Request->new( POST => "mailto:" . $addr );
        if ( $from ne "" ) {
            $mail->header( "From",   $from );
            $mail->header( "Sender", $from );
        }
        $mail->header( "Subject",                   $subject );
        $mail->header( "Content-type",              "text/plain; charset=us-ascii" );
        $mail->header( "Content-Transfer-Encoding", "7bit" );
        $mail->header( "MIME-Version",              "1.0" );
        $mail->content($message);

        LWP::UserAgent->new->request($mail);
    }
}

# Mail HTML document.
# Slurps the named file and mails its contents as text/html to each
# comma-separated recipient by POSTing to a mailto: URL through LWP.
# Params: filename, subject, recipients (comma separated), from address
# Returns: none
sub MailDocument() {
    my ( $filename, $subject, $rcptlist, $from ) = @_;

    # Slurp the whole document; $/ is restored when the sub exits.
    local $/;
    open( FILE, "$filename" ) or die "Cannot open $filename: $!\n";
    my $content = <FILE>;
    close(FILE);

    for my $addr ( split /,/, $rcptlist ) {
        my $mail = HTTP::Request->new( POST => "mailto:" . $addr );
        if ( $from ne "" ) {
            $mail->header( "From",   $from );
            $mail->header( "Sender", $from );
        }
        $mail->header( "Subject",                   $subject );
        $mail->header( "Content-type",              "text/html" );
        $mail->header( "Content-Transfer-Encoding", "7bit" );
        $mail->header( "MIME-Version",              "1.0" );
        $mail->content($content);

        LWP::UserAgent->new->request($mail);
    }
}

# Display a document by launching an external viewer program.
# Params: program (viewer command name), outgoing (path of the document)
# Returns: none
# FIX: system() returns a numeric wait status; the original compared it
# with the string operator `ne`, which works only by accident of number
# stringification.  Numeric `!=` states the intent correctly.
sub ShowDocument() {
    my ( $program, $outgoing ) = @_;
    my $status;

    # Special handling for mozilla: try to reuse a running instance.
    if ( $program eq "mozilla" ) {
        # ping() exits 0 only when a mozilla instance is already running.
        $status = system("mozilla -remote \"ping()\"");

        if ( $status != 0 ) {
            # Not running, so openurl() is unavailable: start a fresh
            # instance on the document instead.
            $status = system( "mozilla", $outgoing );
            if ( $status != 0 ) {
                print "Running mozilla returned status: " . $status . "\n";
            }
        }
        else {
            # Running: open the document in a new tab of that instance.
            $status =
              system(
                "mozilla -remote \"openurl(" . $outgoing . ",new-tab)\"" );
            if ( $status != 0 ) {
                print "Running mozilla returned status: " . $status . "\n";
            }
        }
    }
    else {

        # Other applications are currently just started with the document
        # as their sole argument (list form, so no shell is involved).
        $status = system( $program, $outgoing );
        if ( $status != 0 ) {
            print "Application " . $program
              . " returned status: " . $status . "\n";
        }
    }
}

__END__

=head1 NAME

websec - Web Secretary

=head1 SYNOPSIS

websec [options]


=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--base>

Base directory for configuration (~/.websec by default)

=back

=head1 DESCRIPTION

B<websec> is a web page monitoring software.  It will send you a changed web
page with the contents highlighted.

The base directory is the place from which B<websec> will read the config files
and in which it will store its data.

When called without an argument, B<websec> will look for a base directory.
If the current directory has url.list it will use it, otherwise it will try to
use I<$HOME/.websec/>. You can also override this process with the I<--base>
option.

You can add a line like I<AddSubject = [websec]> to url.list, websec will add
I<[websec]> to every subject as a first word when mail is sent. You can then
easily detect this line by a mail filter.

The keywords I<Retry>, I<Retrywait>, and I<Timeout> in url.list lets you specify
the number of times to retry, time to wait between retries, and a timeout
setting.

B<Websec> waits for a random number of seconds between retries up to the value
specified by the I<Randomwait> keyword. This is to prevent websec from being
blocked by websites that perform log analysis to find time similarities between
requests.


=head1 SEE ALSO

/usr/share/doc/websec/README.gz, L<url.list(5)>, L<ignore.list(5)>, L<webdiff(1)>.


=head1 AUTHOR

Victor Chew is the original author of this software,
Baruch Even is continuing the maintenance, and
Joop Stakenborg <pa3aba@debian.org> provided this man page.

=cut

vim:set et ts=4:
