#!/usr/bin/perl -w
#
# sitescooper - download news from web sites and convert it automatically
#	 into one of several formats suitable for viewing on a Palm
#	 handheld.
#
# Skip down to read the POD documentation, or search for "=head1".
#
# To set up, search for "CUSTOMISE" -- note UK/Irish spelling. ;)
# Change the setting appropriately and uncomment it, if required.
# Then move the required sites from the "sites_off" directory into the
# "sites" directory, and those will be downloaded automatically next
# time you run the script.
#
# Sitescooper is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.  See the COPYRIGHT section in the POD
# documentation below, or the "doc/gpl.html" file in the distribution,
# for further details.

$main::VERSION = "2.1.1";

# NOTE: on Windows, you will need to use 2 backslashes in any paths in
# this built-in configuration file, to avoid Perl interpreting them,
# like so: C:\\TMP
#
$CONFIG = '
# UNIX users: you can use $HOME in this file to mean your home directory.
#

#######################################################################

# OPTIONAL SITE-DEPENDENT CONFIGURATION

# NOTE: If you will be converting sites into Pilot-readable format, you
# may need to specify this! The directory under your PalmPilot Desktop dir
# where installable PRC files need to go, or the application that should
# be run to install a PRC file.
#
# On UNIX platforms using pilot-xfer, you should set up a directory where
# installed files are to go before you run "pilot-xfer -i". Alternatively,
# several UNIX versions of pilot desktop are supported; place the name, as
# printed below, in the PilotInstallApp field to use them. These are
# PilotManager, gnome-pilot and JPilot.
#
# On a Win32 machine with only one Pilot user, this is determined
# automatically from the registry, so you will not need to set it.
# Otherwise, on Win32 platforms this is generally of the format
# {pilotdir}/{username}/Install, where {pilotdir} is the PalmPilot Desktop
# directory, and {username} is the abbreviation of the Pilot user name.
#
# MacOS users: you do not need to customise this; a default will be used
# (although nothing will be written there). You need to run the conversion
# command yourself afterwards :(

# PilotInstallDir: $HOME/pilot/install		# CUSTOMISE
# PilotInstallApp: InstallApp			# CUSTOMISE

#######################################################################

# Sites directory, where the site description files are stored.
# By default, a directory called "sites" under your current directory,
# or under your $HOME on UNIX, is used if it exists.

# SitesDir: $HOME/lib/sites			# CUSTOMISE

#######################################################################

# Temporary directory to use for sitescooper. A subdirectory will be
# created called sitescooper{uid} where {uid} is your user id (or
# 0 on Win32 platforms). On UNIX platforms, this defaults to a hidden
# directory under your home dir, for privacy.

# TmpDir: /tmp					# CUSTOMISE

#######################################################################

# Specify the HTTP proxy server in use at your site, if applicable.

# ProxyHost: proxy.clubi.ie			# CUSTOMISE
# ProxyPort: 80					# CUSTOMISE

#######################################################################

# Diff will be searched for on the path if this is not specified here.
# If the word "MODULE" is specified, the Perl module Algorithm::Diff
# will be used instead of an external helper application (note that
# Algorithm::Diff seems significantly slower, though).

# Diff: C:\\path\\to\\diff.exe			# CUSTOMISE
# Diff: MODULE					# CUSTOMISE

#######################################################################

# The MakeDoc tool will be searched for on the path if it is
# not specified here. Default values are "makedoc" on UNIX platforms,
# or "makedocw.exe" on Windows.

# MakeDoc: makedocw.exe				# CUSTOMISE

# The iSilo conversion tool will be searched for on the path if it is
# not specified here. Default values are "iSilo386" for UNIX platforms,
# or "iSiloC32.exe" on Windows.

# iSilo: iSiloC32.exe				# CUSTOMISE

#######################################################################

# Where you want the text-format output to be saved. If commented out,
# the output will be saved under the sitescooper temporary directory.
# Note: MakeDocW on Windows does not like reading text from a
# directory with spaces in it, such as a directory under Program Files.
# In this case changing this parameter may help.

# TextSaveDir: C:\\where\\I\\want\\News		# CUSTOMISE

#######################################################################

# Where the "site_samples" directory can be found.
# This is the directory sitescooper was installed into. Generally
# sitescooper can work this out itself.

# SitescooperDir: /where/sitescooper/is/installed     # CUSTOMISE

#######################################################################

# If you want to share a cache between multiple users or multiple
# configurations of sitescooper, uncomment this and point it at the
# directory to be used as the shared cache.

# SharedCacheDir: /home/jm/lib/scoop_cache	# CUSTOMISE

#######################################################################

# How long should a cached front page be considered valid?
# Specified in minutes.

# CachedPageLifetime: 60			# CUSTOMISE

#######################################################################

# How long should cached files be kept before expiring? Specified in
# days, and fractions are OK.

# ExpireCacheAfter: 7.0				# CUSTOMISE
';

#---------------------------------------------------------------------------

=head1 NAME

sitescooper - download news from web sites and convert it automatically
into one of several formats suitable for viewing on a Palm handheld.

=head1 SYNOPSIS

sitescooper [options] [ [-site sitename] ...]

sitescooper [options] [-sites sitename ...]

sitescooper [options] [-levels n] [-storyurl regexp] url [...]

Options: [-debug] [-refresh] [-config file] [-limit numkbytes]
[-install dir] [-instapp app] [-dump] [-dumpprc] [-nowrite] [-nodates]
[-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file]
[-keep-tmps] [-noheaders] [-nofooters] [-fromcache]
[-filename template] [-prctitle template]
[-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader
| -pipe fmt command]

=head1 DESCRIPTION

This script, in conjunction with its configuration file and its set of
B<site> files, will download news stories from several top news sites into
text format and/or onto your Palm handheld (with the aid of the
B<makedoc>/B<MakeDocW> or B<iSilo> utilities).

Alternatively URLs can be supplied on the command line, in which case
those URLs will be downloaded and converted using a reasonable set of
default settings.

Both HTTP URLs and local files (using the C<file:///> protocol) are
supported.

Multiple types of sites are supported:

=over 4

1-level sites, where the text to be converted is all present on one page
(such as Slashdot, Linux Weekly News, BluesNews, NTKnow, Ars Technica);

2-level sites, where the text to be converted is linked to from a Table
of Contents page (such as Wired News, BBC News, and I, Cringely);

3-level sites, where the text to be converted is linked to from a Table
of Contents page, which in turn is linked to from a list of issues
page (such as PalmPower).

=back

In addition sites that post news as items on one big page, such as
Slashdot, Ars Technica, and BluesNews, are supported using diff.

Note that at present, the URLs-on-the-command-line invocation format does
not support 2- or 3-level sites.

The script is portable to most UNIX variants that support perl, as well
as the Win32 platform (tested with ActivePerl 5.00502 build 509).

Currently the configuration is stored as a string inside the script
itself, but an alternative configuration file can be specified with the
B<-config> switch.

The sites downloaded will be the ones listed in the site files you keep in
your F<sites> directory.
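
For illustration, here is a minimal sketch of a 2-level site file (the
URL, patterns and markers are hypothetical; real sites usually need a few
more directives):

	URL: http://www.example.com/news/
	Name: Example News
	Levels: 2
	StoryURL: http://www.example.com/news/\d+\.html
	StoryStart: <!-- begin story -->
	StoryEnd: <!-- end story -->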

sitescooper maintains a cache in its temporary directory; files are kept
in this cache for a week at most. Ditto for the text output directory
(set with B<TextSaveDir> in the built-in configuration).

If a password is required for the site, and the current sitescooper session
is interactive, the user will be prompted for the username and password.
This authentication token will be saved for later use.  This way a site
that requires login can be set up as a .site -- just log in once, and your
password is saved for future non-interactive runs.

Note however that the encryption used to hide the password in the
sitescooper configuration is pretty transparent; I recommend that, rather
than using your own username and password to log in to passworded sites,
you use a dedicated sitescooper account instead.

=head1 OPTIONS

=over 4

=item -refresh

Refresh all links -- ignore the F<already_seen> file, do not diff pages,
and always fetch links, even if they are available in the cache.

=item -config file

Read the configuration from B<file> instead of using the built-in one.

=item -limit numkbytes

Set the limit for output file size to B<numkbytes> kilobytes, instead of
the default 200K.

=item -install dir

The directory to save PRC files to once they've been converted, in order
to have them installed to your Palm handheld.

=item -instapp app

The application to run to install PRC files onto your Palm, once they've
been converted.

=item -site sitename

Limit the run to the site named in the B<sitename> argument.  Normally all
available sites will be downloaded. To limit the run to 2 or more sites,
provide multiple B<-site> arguments like so:

	-site ntk.site -site tbtf.site

=item -sites sitename [...]

Limit the run to multiple sites; an easier way to specify multiple sites
than using the -site argument for each file.
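
For example, the equivalent of the B<-site> example above:

	-sites ntk.site tbtf.site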

=item -levels n

When specifying a URL on the command-line, this indicates how many levels
a site has. Not needed when using .site files.

=item -storyurl regexp

When specifying a URL on the command-line, this indicates the regular
expression which links to stories should conform to. Not needed when using
.site files.
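
For example, an illustrative invocation (the URL and pattern here are
hypothetical):

	sitescooper.pl -levels 2 \
		-storyurl "http://www.example.com/news/.*" http://www.example.com/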

=item -doc

Convert the page(s) downloaded into DOC format, with all the articles
listed in full, one after the other.

=item -text

Convert the page(s) downloaded into plain text format, with all the
articles listed in full, one after the other.

=item -html

Convert the page(s) downloaded into HTML format, on one big page, with
a table of contents (taken from the site if possible), followed by all
the articles one after another.

=item -mhtml

Convert the page(s) downloaded into HTML format, but retain the
multiple-page format. This will create the output in a directory
called B<site_name.pages>; in conjunction with the B<-dump> argument,
it will output the path of this directory on standard output before
exiting.

=item -isilo

Convert the page(s) downloaded into iSilo format (see
http://www.isilo.com/ ), on one big page.  This is the default.  The
page(s) will be displayed with a table of contents (taken from the site if
possible), followed by all the articles one after another.

=item -misilo

Convert the page(s) downloaded into iSilo format (see
http://www.isilo.com/ ), with one iSilo document per site.  Each document
will have a table-of-contents page, taken from the site if possible, with
each article on a separate page.

=item -richreader

Convert the page(s) downloaded into RichReader format using HTML2Doc.exe
(see http://users.erols.com/arenakm/palm/RichReader.html ).  The page(s)
will be displayed with a table of contents (taken from the site if
possible), followed by all the articles one after another.

=item -pipe fmt command

Convert the page(s) downloaded into an arbitrary format, using the command
provided. Sitescooper will still rewrite the page(s) according to the
B<fmt> argument, which should be one of:

=over 4

=item text

Plain text format.

=item html

HTML in one big page.

=item mhtml

HTML in multiple pages.

=back

The command argument can contain C<__SCOOPFILE__>, which will be replaced
with the filename of the file containing the rewritten pages in the above
format, C<__SYNCFILE__>, which will be replaced with a suitable filename
in the Palm synchronization folder, and C<__TITLE__>, which will be
replaced by the title of the file (generally a string containing the date
and site name).

Note that for the B<-mhtml> switch, C<__SCOOPFILE__> will be replaced
with the name of the file containing the table-of-contents page. It's up
to the conversion utility to follow the href links to the other files
in that directory.
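
For example, using a hypothetical converter command (C<mypalmconv> and its
options are placeholders; substitute your own tool):

	sitescooper.pl -pipe html "mypalmconv __SCOOPFILE__ -o __SYNCFILE__"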

=item -dump

Output the page(s) downloaded directly to stdout in text or HTML format,
instead of writing them to files and converting each one. This option
implies B<-text>; to dump HTML, use B<-dump -html>.

=item -dumpprc

Output the page(s) downloaded directly to stdout, in converted format as a
PRC file, suitable for installation to a Palm handheld.

=item -nowrite

Test mode -- do not write to the cache or F<already_seen> file; instead,
write what would normally be written to a directory called F<new_cache>
and a F<new_already_seen> file. This is very handy when writing a new
site file.
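
For example, to try out a hypothetical new site file in test mode:

	sitescooper.pl -nowrite -debug -site mynewsite.site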

=item -debug

Enable debugging output. This output is in addition to the usual progress
messages.

=item -quiet

Process sites quietly, without printing the usual progress messages to
STDERR. Warnings about incorrect site files and system errors will still
be output, however.

=item -admin cmd

Perform an administrative command. This is intended to ease the task of
writing scripts which use sitescooper output.  The following admin
commands are available:

=over 4

=item dump-sites

List the sites which would be scooped on a scooping run, and their URLs.
Instead of scooping any sites, sitescooper will exit after performing this
task.  The format is one site per line, with the site file name first, a
tab, the site's URL, a tab, the site name, a tab, and the output filename
that would be generated without path or extension. For example:

S<foobar.site	http://www.foobar.com/	Foo Bar	1999_01_01_Foo_Bar>

=item journal

Write a journal with dumps of the documents as they pass through the
formatting and stripping steps of the scooping process. This is
written to a file called B<journal> in the sitescooper temporary
directory.

=item import-cookies file

Import a Netscape B<cookies> file into sitescooper, so that sites which
require cookies can use them. For example, the site
B<economist_full.site> requires this. Here's how to import cookies on a
UNIX machine:

S<sitescooper.pl -admin import-cookies ~/.netscape/cookies>

and on Windows:

S<perl sitescooper.pl -admin import-cookies
  "C:\Program Files\Netscape\Users\Default\cookies.txt">

Unfortunately, MS Internet Explorer cookies are currently unsupported.
If you wish to write a patch to support them, that'd be great.

=back

=item -nolinkrewrite

Do not rewrite links on scooped documents -- leave them exactly as they
are.

=item -noheaders

Do not attach the sitescooper header (URL, site name, and navigation
links) to each page.

=item -nofooters

Do not attach the sitescooper footer ("copyright retained by original
authors" blurb) to each page.

=item -fromcache

Do not perform any network access; retrieve everything from the cache or
the shared cache.

=item -filename template

Change the format of output filenames. B<template> may contain the
following keyword strings, which are substituted as shown:

=over 4

=item YYYY

The current year, in 4-digit format.

=item MM

The current month number (from 01 to 12), in 2-digit format.

=item Mon

The current month name (from Jan to Dec), in 3-letter format.

=item DD

The current day of the month (from 01 to 31), in 2-digit format.

=item Day

The current day of the week (from Sun to Sat), in 3-letter format.

=item hh

The current hour (from 00 to 23), in 2-digit format.

=item mm

The current minute (from 00 to 59), in 2-digit format.

=item Site

The current site's name.

=item Section

The section of the current site (where used). (Most sites do not use
sections anymore, but it doesn't hurt to support them!)

=back

The default filename template is B<YYYY_MM_DD_Site_Section>.
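
For example, a scoop of a hypothetical "Foo Bar" site on 1 January 1999
using

	-filename YYYY_MM_DD_Site

would name its output F<1999_01_01_Foo_Bar> (plus the appropriate
extension).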

=item -prctitle template

Change the format of the titles of the resulting PRC files. B<template>
may contain the same keyword strings as B<-filename>.

The default PRC title template is B<YYYY-Mon-DD: Site Section>.

=item -nodates

Do not put the date in the installable file's filename. This allows you to
automatically overwrite old files with new ones when you HotSync. It's
a compatibility shortcut for B<-filename Site_Section -prctitle "Site
Section">.

=item -stdout-to file

Redirect the output of sitescooper into the named file. This is needed on
Windows NT and 95, where certain combinations of perl and Windows do not
seem to support the E<gt> operator.

=item -keep-tmps

Keep temporary files after conversion. Normally the .txt or .html
rendition of a site is deleted after conversion; this option keeps
it around.

=back

=head1 INSTALLATION

To install, edit the script and change the #! line. You may also need to
(a) change the Pilot install dir if you plan to use the pilot installation
functionality, and (b) edit the other parameters marked with CUSTOMISE in
case they need to be customised for your site. They should be set to
acceptable defaults (unless I forgot to comment out the proxy server lines
I use ;).

=head1 EXAMPLES

	sitescooper.pl http://www.ntk.net/

To snarf the ever-cutting NTKnow newsletter.
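
	sitescooper.pl -isilo -site ntk.site

To scoop the same site from its site file into iSilo format (assuming an
F<ntk.site> file is in your F<sites> directory).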

=head1 ENVIRONMENT

B<sitescooper> makes use of the C<$http_proxy> environment variable, if it
is set.

=head1 AUTHOR

Justin Mason E<lt>jm /at/ jmason.orgE<gt>

=head1 COPYRIGHT

Copyright (C) 1999 Justin Mason

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc., 59
Temple Place - Suite 330, Boston, MA  02111-1307, USA, or read it on
the web at http://www.gnu.org/copyleft/gpl.html .

=head1 SCRIPT CATEGORIES

The CPAN script category for this script is C<Web>. See
http://www.cpan.org/scripts/ .

=head1 PREREQUISITES

C<File::Find>
C<File::Path>
C<URI::URL>
C<LWP::UserAgent>
C<HTTP::Request::Common>
C<HTTP::Date>
C<HTML::Entities>

All these can be picked up from CPAN at http://www.cpan.org/ .  Note that
C<HTML::Entities> is part of the C<HTML::Parser> distribution, which
C<LWP::UserAgent> pulls in, so you do not need to install it separately.
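
For example, one way to install them is via the CPAN module:

	perl -MCPAN -e 'install LWP::UserAgent'
	perl -MCPAN -e 'install URI::URL'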

=head1 COREQUISITES

C<Win32::TieRegistry>, if running on a Win32 platform, to find the Pilot
Desktop software's installation directory. C<Algorithm::Diff> to support
diffing sites without running an external B<diff> application (this is required
on Mac systems).

=head1 README

Sitescooper downloads news stories from the web and converts them to Palm
handheld iSilo, DOC or text format for later reading on-the-move.  Site
files and full documentation can be found at
http://sitescooper.tsx.org/ .

=cut

#---------------------------------------------------------------------------

sub usage { die <<__ENDOFUSAGE;

Sitescooper - download news from web sites and convert it automatically
	into one of several formats suitable for viewing on a Palm
	handheld.

sitescooper [options] [ [-site sitename] ...]

sitescooper [options] [-sites sitename ...]

sitescooper [options] [-levels n] [-storyurl regexp] url [...]

Options: [-debug] [-refresh] [-config file] [-limit numkbytes]
	[-install dir] [-instapp app] [-dump] [-dumpprc] [-nowrite]
	[-nodates] [-quiet] [-admin cmd] [-nolinkrewrite] [-stdout-to file]
	[-keep-tmps] [-noheaders] [-nofooters] [-fromcache]
	[-filename template] [-prctitle template]
	[-text | -html | -mhtml | -doc | -isilo | -misilo | -richreader
	| -pipe fmt command]

Both file:// and http:// URLs are supported.

Version: $main::VERSION
__ENDOFUSAGE
}

#---------------------------------------------------------------------------

# use Carp;
# use strict;	# ah shaggit, life's too short for strict
use File::Find;
use File::Path;
use File::Basename;
use Cwd;

use LWP::UserAgent;
use URI::URL;
use HTTP::Date;
use HTTP::Cookies;
use HTTP::Request::Common;
use HTML::Entities;
use HTML::Parser;
use HTML::Filter;

#CGI use CGI;

if (&Portability::MyOS eq 'Win32') {
  eval 'use Win32::TieRegistry( Delimiter=>"/", ArrayValues=>0 );';
}

$SIG{__WARN__} = 'warn_log';
$SIG{__DIE__} = 'die_log';

$OUT_TEXT = 0;
$OUT_DOC = 1;
$OUT_HTML = 2;

$main::home_url = "http://sitescooper.tsx.org";
$main::refresh = 0;
$main::debug = 0;
$main::debugdiffs = 0;			# set to 1 to break after diffing

#$main::just_caching = 0;
$main::cached_front_page_lifetime = 60;	# in minutes
$main::dump = 0;
$main::dumpprc = 0;

$main::verbose = 1;
$main::nowrite = 0;
$main::bookmark_char = "\x8D";		# yes, same as Chris' one, cheers!
undef $main::pilotinstdir;
undef $main::pilotinstapp;
$main::cgimode = 0;

# Text is the default on MacOS.
if (&Portability::MyOS eq 'Mac') {
  $main::outstyle = $OUT_TEXT;
  $main::outputfilter = '__cat__';
  $main::use_convert_tool = 0;
} else {
  $main::outstyle = $OUT_HTML;
  $main::outputfilter = 'isilo';
  $main::use_convert_tool = 1;
}

$main::fileperpage = 0;
$main::nolinkrewrite = 0;
$main::filesizelimit = 200;		# limit of Kb per file (uncompressed)
$main::writeheader = 1;
$main::writefooter = 1;
$main::allowimgs = 1;

$main::use_only_cache = 0;
#CGI $main::cgi = undef;
$main::admincmd = undef;
@main::sites_restrict_to = ();
@main::sites_additional = ();
@main::layout_site_files = ();
@main::cmdline_urls = ();
%main::caches_to_rename = ();
$main::keep_tmps = 0;
$main::use_hashes_for_cache_filenames = 0;

$main::argv_levels = undef;
$main::argv_storyurl = undef;
$main::expiry_days = 7.0;

$main::add_closing_tags = 1;
$main::strip_empty_tag_sets = 0; 

$filename_template = "YYYY_MM_DD_Site_Section";
$prc_title = "YYYY-Mon-DD: Site Section";

$main::useragent = new ScoopHTTP::UserAgent;
$main::useragent->env_proxy;
$main::useragent->agent ("sitescooper/$main::VERSION ($main::home_url) ".
		$main::useragent->agent);
$main::useragent->max_size (1024*1024*2);	# 2-meg file limit

$main::cookie_jar = HTTP::Cookies::Netscape->new;

&clear_page_tmpfiles;

# --------------------------------------------------------------------------

if (defined $ENV{'REQUEST_METHOD'}) {
  # we're running from a CGI script, use CGI mode
  #$main::cgimode = 1;
  #$main::cgi = new CGI;
}

# This is the placeholder for development debug flags.
# Add debugging stuff here, tagged with J M D (without the spaces ;).


# --------------------------------------------------------------------------

if ($main::cgimode == 0) {
  while ($#ARGV >= 0) {
    $_ = shift;

    if (/^-debug$/) {
      $main::debug = 1;

    } elsif (/^-quiet$/) {
      $main::verbose = 0;

    } elsif (/^-refresh/) {
      $main::cached_front_page_lifetime = 0;
      $main::refresh = 1;

    #} elsif (/^-cache/) {
      #$main::just_caching = 1;	# used for future parallelism

    } elsif (/^-dump/) {
      $main::dump = 1;
      $main::outstyle = $OUT_TEXT;
      $main::use_convert_tool = 0;

    } elsif (/^-dumpprc/) {
      $main::dumpprc = 1;
      $main::use_convert_tool = 1;

    } elsif (/^-doc/) {
      $main::outstyle = $OUT_DOC;
      $main::fileperpage = 0;
      $main::outputfilter = 'makedoc';
      $main::use_convert_tool = 1;

    } elsif (/^-isilo/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = 'isilo';
      $main::use_convert_tool = 1;

    } elsif (/^-misilo/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 1;
      $main::outputfilter = 'isilo';
      $main::use_convert_tool = 1;

    } elsif (/^-richreader/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = 'richreader';
      $main::use_convert_tool = 1;

    } elsif (/^-text/) {
      $main::outstyle = $OUT_TEXT;
      $main::fileperpage = 0;
      $main::outputfilter = '__cat__';
      $main::use_convert_tool = 0;

    } elsif (/^-html/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 0;
      $main::outputfilter = '__cat__';
      $main::use_convert_tool = 0;

    } elsif (/^-mhtml/) {
      $main::outstyle = $OUT_HTML;
      $main::fileperpage = 1;
      $main::outputfilter = '__cat__';
      $main::use_convert_tool = 0;

    } elsif (/^-pipe/) {
      my $fmt = shift;
      my $cmd = shift;

      if ($fmt eq 'text') {
	$main::outstyle = $OUT_TEXT;
	$main::fileperpage = 0;

      } elsif ($fmt eq 'html') {
	$main::outstyle = $OUT_HTML;
	$main::fileperpage = 0;

      } elsif ($fmt eq 'mhtml') {
	$main::outstyle = $OUT_HTML;
	$main::fileperpage = 1;
      } else {
	&usage;
      }
      $main::outputfilter = 'cmd: '.$cmd;
      $main::use_convert_tool = 1;

    } elsif (/^-admin$/) {
      $main::admincmd = shift;
      if ($main::admincmd eq 'import-cookies') { $importcookies = shift; }

    } elsif (/^-nolinkrewrite/) {
      $main::nolinkrewrite = 1;
    } elsif (/^-fromcache/) {
      $main::use_only_cache = 1;
    } elsif (/^-limit/) {
      $main::filesizelimit = shift(@ARGV)+0;
    } elsif (/^-nodates/) {
      $main::filename_template = 'Site_Section';
      $main::prc_title = 'Site Section';
    } elsif (/^-nowrite/) {
      $main::nowrite = 1;
    } elsif (/^-config/) {
      $config = shift;
    } elsif (/^-install/) {
      $pilotinstdir = shift;
    } elsif (/^-instapp/) {
      $pilotinstapp = shift;
    } elsif (/^-site$/) {
      push (@sites_restrict_to, shift);
    } elsif (/^-sites/) {
      push (@sites_restrict_to, @ARGV); @ARGV = (); last;
    } elsif (/^-levels/) {
      $argv_levels = shift()+0;
    } elsif (/^-storyurl/) {
      $argv_storyurl = shift;
    } elsif (/^-keep-tmps/) {
      $main::keep_tmps = 1;

    } elsif (/^-noheaders/) {
      $main::writeheader = 0;
    } elsif (/^-nofooters/) {
      $main::writefooter = 0;

    } elsif (/^-filename/) {
      $main::filename_template = shift;
    } elsif (/^-prctitle/) {
      $main::prc_title = shift;

    } elsif (/^-stdout-to/) {
      $_ = shift; close (STDOUT);
      open (STDOUT, ">> ".$_) or die "failed to redirect STDOUT to $_\n";

    } elsif (/^-/) {
      &usage;
    } else {
      unshift @ARGV, $_; last;
    }
  }
  @main::cmdline_urls = @ARGV;
  $main::userid = $<;

} else {
  # load some things from CGI parameters
  #CGI@main::cmdline_urls = ($main::cgi->param ('url'));
  #CGI$main::argv_levels = $main::cgi->param ('levels');
  #CGI$main::argv_storyurl = $main::cgi->param ('storyurl');

  #CGI@main::sites_restrict_to = $main::cgi->param ('sites');

  #CGI$main::debug = $main::cgi->param ('debug');
  #CGI$main::outstyle = $main::cgi->param ('outstyle');
  #CGI$main::nowrite = $main::cgi->param ('nowrite');
  #CGI$main::refresh = $main::cgi->param ('refresh');
  #CGI$main::userid = $main::cgi->param ('userid');
  #CGI&ScoopCGI::get_cookie;
  #CGI $main::password = $main::cgi->param ('password');
  # REVISIT -- use a cookie to store userid and password

  #CGI$main::pilotinstdir = undef;
}

&main::verbose ("Sitescooper version ".$main::VERSION
				.", Copyright (c) 1999 Justin Mason\n"
	."Sitescooper comes with ABSOLUTELY NO WARRANTY; for details\n"
	."see http://jmason.org/software/sitescooper/doc/gpl.html .\n");

@conf = ();
@conflines = ();

# UNIX platforms: create a default configuration file. This makes
# it easier to support rpm installation. Also read from /etc/sitescooper.cf
# if no -config argument is supplied.
#
if (!defined $config) {
  if (&Portability::MyOS eq 'UNIX') {
    $tmpdir = $ENV{'HOME'}."/.sitescooper";
    $config = "$tmpdir/sitescooper.cf";

    if (!-r $config) {
      (-d $tmpdir) or
	  mkdir ($tmpdir, 0777) or die "failed to mkdir '$tmpdir'\n";

      warn "Copying default config to \"$config\".\n".
      	"Edit this if you need to change any configuration settings.\n\n";

      open (OUT, ">$config") or die "cannot write to $config\n";
      print OUT $CONFIG; close OUT;
    }

    if (-r "/etc/sitescooper.cf") {
      $globalcf = "/etc/sitescooper.cf";
      open (IN, "< $globalcf") || die "cannot read $globalcf\n";
      @conf = (<IN>); close IN;
      for ($i=0; $i<$#conf+1; $i++) { push (@conflines, "$globalcf:".($i+1)); }
    }
  }
}

if (defined $config) {
  &verbose ("Reading configuration from \"$config\".");
  open (IN, "< $config") || die "cannot read $config\n";
  @conf = (<IN>); close IN;
  for ($i=0; $i<$#conf+1; $i++) { push (@conflines, "$config:".($i+1)); }

} else {
  &verbose ("Using built-in configuration.");
  @conf = split(/\n/, $CONFIG);
  for ($i=0; $i<$#conf+1; $i++) { push (@conflines, "(built-in):".($i+1)); }
}

if ($main::debugdiffs) {
  &main::dbg ("debugging, will exit after diff");
}

# --------------------------------------------------------------------------

# Andrew Fletcher <fletch@computer.org>:
# A relative path on Mac seems to need a ":" before it. I've called
# this $colon.
$colon = '';
$slash = '/';

if (&Portability::MyOS eq 'Win32') { $slash = '\\'; }

if (&Portability::MyOS eq 'Mac') {
  $slash = ':'; $colon = ':';

  # because of the Mac's 32-char filename limitation, we need to include
  # a hash of the URL in cache filenames to avoid clashes. This may be
  # handy for other OSes too, but leave it Mac-only for now.
  $main::use_hashes_for_cache_filenames = 1;
}

$outdir = '';
%site_format = ();
%links_start = %links_end = ();
%links_limit_to = %story_limit_to = ();
%links_print = ();
%links_trim = ();
%story_skip = %links_skip = ();
%story_diff = %links_diff = ();
%links_follow_links = %story_follow_links = ();
%story_lifetime = ();
%story_postproc = ();
%story_preproc = ();
%url_postproc = ();
%cacheable = ();	# 0 = static, 1 = dynamic, undef = use heuristics
%printable_sub = ();
%use_alt_tags = ();
%head_pat = ();
%levels = ();
%use_table_smarts = ();
%extra_urls = ();
@sites = ();
@layouts = ();
@exceptions = ();
@unsorted_layouts = ();
@unsorted_exceptions = ();

$url = '';
$sect = '';
$curkey = '';
$main::cached_front_page_lifetime /= (24*60);	# convert to days
%url_title = ();
$sharedcache = undef;

# this should handle all OSes OK.
$sitescooperdir = dirname($0);

undef $tmpdir;
if (&Portability::MyOS eq 'UNIX') {
  $tmpdir = $ENV{'HOME'}."/.sitescooper";
}
$tmpdir ||= $ENV{'TMPDIR'};
$tmpdir ||= $ENV{'TEMP'};
if (!defined $tmpdir && &Portability::MyOS eq 'Win32' && defined $ENV{'WINDIR'})
{
  $tmpdir = $ENV{'WINDIR'}."\\Temp";
}

$diff = 'diff';
if (&Portability::MyOS eq 'Win32') { $diff = "diff.exe"; }
if (&Portability::MyOS eq 'Mac') { $diff = ""; }	# use Algorithm::Diff

$makedoc = 'makedoc';
if (&Portability::MyOS eq 'Win32') { $makedoc = "makedocw.exe"; }

$isilo = 'iSilo386'; $isiloargs = '-y'; $isilomultipageargs = '-d9';
if ($main::allowimgs) { $isiloargs .= ' -Is300 -Ic -Id'; }

if (&Portability::MyOS eq 'Win32') { $isilo = "iSiloC32.exe"; }

$richreader = 'HTML2Doc'; $richargs = '';
if (&Portability::MyOS eq 'Win32')
	{ $richreader = "HTML2Doc.exe"; $richargs = '-i'; }
# Note that currently there is no HTML2Doc for UNIX platforms; it's
# supported here anyway for future-proofing.

# ---------------------------------------------------------------------------

sub set_got_intr_behaviour {
  $got_intr_behaviour = shift;
  $got_intr_flag = 0;
}

sub got_intr {
  my $signame = shift;
  (&Portability::MyOS eq 'UNIX') and system ("stty echo");

  if ($got_intr_behaviour eq 'exit') {
    die "got signal SIG$signame, exiting.\n";
  } else {
    # set the flag first, so it is visible to any code that traps this die
    $got_intr_flag = 1;
    die "got signal SIG$signame, skipping site...\n";
  }
}

&set_got_intr_behaviour ('exit');
$SIG{'INT'} = \&got_intr;
$SIG{'TERM'} = \&got_intr;

# ---------------------------------------------------------------------------

$proxyhost = undef;
$proxyport = 80;
$confline = undef;

&ParseConfig;

# ---------------------------------------------------------------------------

if (!defined $pilotinstdir && !defined $pilotinstapp && !$main::cgimode) {
  @main::possible_inst_dirs = ();
  my $dir;

  if (&Portability::MyOS eq 'Win32') {
    eval '
      sub get_instdir_wanted {
	return unless (/^install$/i && -d $File::Find::name);
	push (@main::possible_inst_dirs, $File::Find::name);
      }

      my $key = "HKEY_CURRENT_USER/Software/U.S. Robotics".
		    "/Pilot Desktop/Core//Path";
      if ($dir = $Registry->{$key}) {
	@main::possible_inst_dirs = ();
	find(\&get_instdir_wanted, $dir);
      }
    1;' or die "eval failed: $@";

  } elsif (&Portability::MyOS eq 'UNIX') {
    $pilot_mgr_dir = $ENV{'HOME'}."/.pilotmgr/Installer";
    $jpilot_file = $ENV{'HOME'}."/.jpilot/jpilot_to_install";

    @main::possible_inst_dirs = ();

    if (defined $pilotinstapp) {
      # see if one of the built-in support for UNIX pilot desktops is
      # being used.
      #
      if ($pilotinstapp =~ /pilot.*manager/i)
	{ @main::possible_inst_dirs = ($pilot_mgr_dir); undef $pilotinstapp; }
      elsif ($pilotinstapp =~ /gnome.*pilot/i)
	{ $pilotinstapp = "gpilot-install-file --later"; }
      elsif ($pilotinstapp =~ /jpilot/i)
	{ $pilotinstapp = "***ADD_TO_MANIFEST*** ".$jpilot_file; }

    } elsif ($#main::possible_inst_dirs <= 0 && defined $ENV{'HOME'}) {
      # see if the pilot-xfer directories are used.
      #
      $dir = $ENV{'HOME'}."/pilot";
      if (-d "$dir/install")
	{ @main::possible_inst_dirs = ("$dir/install"); }
      elsif (-d $dir)
	{ @main::possible_inst_dirs = ($dir); }
      else
	{
	# that didn't work, the dirs don't exist. Try to autodetect if one
	# of the pilot-desktop tools has been used.
	#
	if (-d $pilot_mgr_dir) {
	  @main::possible_inst_dirs = ($pilot_mgr_dir);
	  &verbose ("Using PilotManager install directory, \"$pilot_mgr_dir\".\n".
	    "Please edit your configuration if this is incorrect.");

	} elsif (`gnome-pilot-config --version 2>&1` =~ /^gnome-pilot \d/) {
	  $pilotinstapp = "gpilot-install-file --later";
	  &verbose ("Using gnome-pilot install tool, \"$pilotinstapp\".\n".
	    "Please edit your configuration if this is incorrect.");

	} elsif (-f $jpilot_file) {
	  $pilotinstapp = "***ADD_TO_MANIFEST*** ".$jpilot_file;
	  &verbose ("Using JPilot install list, \"$jpilot_file\".\n".
	    "Please edit your configuration if this is incorrect.");
	}
      }
    }
  }

  if ($#main::possible_inst_dirs == 0) {
    $pilotinstdir = $main::possible_inst_dirs[0];

  } elsif (!defined $pilotinstapp && $#main::possible_inst_dirs > 0 && !&writing_text) {
    warn "Fatal: too many potential pilot PRC install directories, ".
	"please use '-install' argument.\n";
    foreach $dir (@main::possible_inst_dirs) {
      warn "Possible choice: $dir\n";
    }
    &cleanexit(1);
  }
}

# ---------------------------------------------------------------------------

if (!defined $tmpdir) {
  warn "Warning: cannot work out TmpDir, please set it manually\n".
  	"in the configuration section of the script.\n";
}

if (!defined $sitescooperdir) {
  warn "Warning: cannot work out SitescooperDir, please set it manually\n".
  	"in the configuration section of the script.\n";
}

# we may not have read the sites directory by this stage.
if (!defined $sitesdir) {
  &main::dbg ("SitesDir was not specified, trying to guess it...");

  my $possible = "sites";
  if (!defined $sitesdir && -d $possible) { $sitesdir = $possible; }

  if (&Portability::MyOS eq 'UNIX') {
    $possible = $ENV{'HOME'}."/sites";
    if (!defined $sitesdir && -d $possible) { $sitesdir = $possible; }

    $possible = $tmpdir."/sites";
    if (!defined $sitesdir && -d $possible) { $sitesdir = $possible; }
  }

  $possible = $sitescooperdir."${slash}sites";
  if (!defined $sitesdir && -d $possible) { $sitesdir = $possible; }
}

&ReadSiteChoices;

if (!defined $sitesdir) {
  warn "Warning: can't find the 'sites' directory, please set SitesDir!\n";
} else {
  &ReadSitesDir;
  &ParseConfig;
}

# ---------------------------------------------------------------------------

if (defined $proxyhost) {
  $main::useragent->proxy
  	(['http', 'ftp'], "http://$proxyhost:$proxyport/");
}

if ($diff eq 'MODULE') {
  eval 'use Algorithm::Diff qw(diff); 1;'
  		or die "Cannot use built-in diff support, perl module\n".
			"Algorithm::Diff not found: $@\n";
}

@layouts = ();
foreach $pat (sort { length($a) <=> length($b) } @unsorted_layouts)
{
  $active{$pat} = 0;		# ensure they aren't treated like sites
  push (@layouts, $pat);
}
undef @unsorted_layouts;

&main::dbg ("site layouts defined: ", join(' ', @layouts));

@exceptions = ();
foreach $pat (sort { length($a) <=> length($b) } @unsorted_exceptions)
{
  $active{$pat} = 0;		# ensure they aren't treated like sites
  push (@exceptions, $pat);
}
undef @unsorted_exceptions;

# ---------------------------------------------------------------------------

# if ($just_caching) {
#   # just put the pages into the cache and forget about it
#   foreach $url (@main::cmdline_urls) {
#     &log ("bg: getting $url ...");
#     &get_page ($url, $url, 0);
#   }
#   &log ("bg: done.");
#   &cleanexit;
# }

if ($#main::cmdline_urls > -1) {
  @sites = ();
  foreach $url (@main::cmdline_urls) {
    # if it's a local file URL, switch around the slashes (for windows)
    if (&Portability::MyOS eq 'Win32' && $url =~ m,file:///,i) {
      $url =~ s/\\/\//g;
    }
    # REVISIT -- I don't know what to do in the same case for MacOS ;)

    if (-r $url) {
      if ($url =~ m,^/,) {
	$url = 'file://'.$url;
      } else {
	$url = "file://".getcwd."/".$url;
      }
    }

    if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

    if (!defined $name{$url}) {
      $name{$url} = $url;
      if ($url =~ m,/([^/]+)$,) {
	$_ = $1;
	if (length ($_) > 40) {
	  # trim out spare stuff to keep it short.
	  s,^([^:]+://[^/]+)/.*/([^/]+$),$1/.../$2,i;
	  $name{$url} = $_;
	} else {
	  $name{$url} = $_;
	}
      }
    }
    $confline = "$url:0";

    push (@sites, $url);
    &SetDefaultConfigForURL ($url);
    $site_defined_at{$url} = $confline;
    $story_lifetime{$url} = 0;	# any age of story for command-line URLs

    if (defined $argv_levels) {
      $levels{$url} = $argv_levels-2;
    }
    if (defined $argv_storyurl) {
      $story_limit_to{$url} = $argv_storyurl;
    }
  }
}

# ---------------------------------------------------------------------------

($mday, $mon, $year, $monstr) = &get_date;
($min, $hr, $wdaystr) = &get_extra_date;

$filename_template =~ s/YYYY/ sprintf ("%04d", $year); /eg;
$filename_template =~ s/MM/ sprintf ("%02d", $mon); /eg;
$filename_template =~ s/Mon/ sprintf ("%3s", $monstr); /eg;
$filename_template =~ s/DD/ sprintf ("%02d", $mday); /eg;
$filename_template =~ s/Day/ sprintf ("%3s", $wdaystr); /eg;
$filename_template =~ s/hh/ sprintf ("%02d", $hr); /eg;
$filename_template =~ s/mm/ sprintf ("%02d", $min); /eg;

$prc_title =~ s/YYYY/ sprintf ("%04d", $year); /eg;
$prc_title =~ s/MM/ sprintf ("%02d", $mon); /eg;
$prc_title =~ s/Mon/ sprintf ("%3s", $monstr); /eg;
$prc_title =~ s/DD/ sprintf ("%02d", $mday); /eg;
$prc_title =~ s/Day/ sprintf ("%3s", $wdaystr); /eg;
$prc_title =~ s/hh/ sprintf ("%02d", $hr); /eg;
$prc_title =~ s/mm/ sprintf ("%02d", $min); /eg;

%already_seen = ();
%last_modtime = ();
%main::oldest_already_seen = ();
@seen_this_time = ();
$main::failed_to_cvt = 0;
$main::warn_about_external_links = 0;	# turned on where necessary

&make_dirs;
&generate_output_filenames (@sites);

if (defined $main::admincmd) {
  if ($main::admincmd eq 'dump-sites') {
    while (($key,$outfile) = each %key2outfile) {
      my $url = $key2url{$key};
      my $title = $key2title{$key};
      $title =~ s,\t, ,g; $title =~ s,^\d+-\S+-\d+: ,,g;
      my $base = $key2tmp{$key}; $base =~ s,^.*${slash}(\S+?)\.tmp$,$1,o;
      my $site = $site_defined_at{$url};
      $site =~ s/:\d+$//; $site =~ s/^.*${slash}(\S+?)$/$1/o;

      # foobar.site	http://www.foobar.com/	Foo Bar	1999_01_01_Foo_Bar
      print "$site\t$url\t$title\t$base\n";
    }
    exit;

  } elsif ($main::admincmd eq 'journal') {
    open (JOURNAL, "> $tmpdir${slash}journal")
    	or die "cannot write to $tmpdir${slash}journal!\n";

  } elsif ($main::admincmd eq 'import-cookies') {
    warn "Importing Netscape-format cookie jar from \"$importcookies\"...\n";
    $main::cookie_jar->load ($importcookies);
    warn "Cookie jar now looks like:\n".$main::cookie_jar->as_string;
    @sites = ();
    @main::cmdline_urls = ();
    @main::filekeys = ();
    # and carry on to exit.

  } else { &usage; }
}

if ($main::use_convert_tool) {
  if (defined $pilotinstdir && !-d $pilotinstdir) {
    mkdir ($pilotinstdir, 0755) || die "failed to mkdir '$pilotinstdir'\n";
  }
}

if (defined $sharedcache) { &upgrade_cache_directory_if_needed ($sharedcache); }
&upgrade_cache_directory_if_needed ($cachedir);

&expire_old_cache_files;
&ScoopHTTP::UserAgent::load_logins;
&read_state;

# to do all the conversions at the end:
#&get_all_sites;
#foreach $filekey (@main::filekeys) { &convert_output($filekey); }

# to do them as each site is scooped:
&get_all_sites (1);

&write_state;
&ScoopHTTP::UserAgent::save_logins;
$main::cookie_jar->save ("$user_tmpdir/cookies");

&verbose ("Finished!");
&cleanexit;

# ---------------------------------------------------------------------------

sub ParseConfig {
  my $postproc = undef;
  my $postproctype = undef;

  foreach $_ (@conf) {
    $confline = shift @conflines;
    s/#.*$//; s/^\s+//; s/\s+$//g; next if (/^$/);
    if (!defined $confline) { &main::dbg ("oops! confline not set for $_"); }

    # process environment variable references: ${ENVVARNAME}
    # &main::dbg ("variable ref in site file: $1");
    s/\$\{(\S+?)\}/
	  defined($ENV{$1}) ? $ENV{$1} : "";
    /ge;
    s/\$HOME/$ENV{'HOME'}/ge;		# always supported

    if (defined $postproctype) {
      $postproc .= $_;
      # see if it's the end of the postproc statement scope
      $x = $postproc; 1 while ($x =~ s/\{[^\{\}]*\}//gs);	#{
      if ($x =~ /\}\s*$/) {
	if ($postproctype eq 'Story') {				#{
	  $postproc =~ /^(.*)\}\s*$/; $story_postproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'StoryPre') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $story_preproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype =~ /LinksPre (\d+)/) {		#{
	  my $lev = $1;
	  $postproc =~ /^(.*)\}\s*$/; $links_preproc{"$lev $curkey"} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'URL') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $url_postproc{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
	elsif ($postproctype eq 'Eval') {			#{
	  $postproc =~ /^(.*)\}\s*$/; $eval_code{$curkey} = $1;
	  $postproc = undef;
	  $postproctype = undef;
	}
      }
      next;
    }

    s/^(\S+:)\s+/$1 /;		# easier to read this way ;)
    /^ProxyHost: (.*)$/ and ($proxyhost = $1), next;
    /^ProxyPort: (.*)$/ and ($proxyport = $1+0), next;
    /^TmpDir: (.*)$/ and ($tmpdir = $1), next;
    /^SitescooperDir: (.*)$/ and ($sitescooperdir = $1), next;
    if (/^SitesDir: (.*)$/) { $sitesdir = $1; next; }

    /^MakeDoc: (.*)$/ and ($makedoc = $1), next;
    /^iSilo: (.*)$/ and ($isilo = $1), next;
    /^HTML2Doc: (.*)$/ and ($richreader = $1), next;
    /^Diff: (.*)$/ and ($diff = $1), next;
    /^TextSaveDir: (.*)$/ and ($outdir = $1), next;
    /^PilotInstallDir: (.*)$/ and ($pilotinstdir = $1), next;
    /^PilotInstallApp: (.*)$/ and ($pilotinstapp = $1), next;
    /^SharedCacheDir: (.*)$/ and ($sharedcache = $1), next;
    /^CachedPageLifetime: (.*)$/ and (!$main::refresh) and
    		($main::cached_front_page_lifetime = ($1+0) / (24*60)), next;
    /^ExpireCacheAfter: (.*)$/ and ($expiry_days = $1+0.0), next;

    if (/^URL: (.*)$/) {
      &FinishConfigSection ($sect, $url);
      $url = &expand_url_magic ($1); $sect = '';

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }
      push (@sites, $url);
      &SetDefaultConfigForURL ($url);
      $site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    # LayoutURL is similar to URL, but defines a layout for a specific
    # pattern. If an URL falls within this pattern, and parameters are
    # defined for this layout but not defined by the site file, the
    # layout parameters will be used.
    #
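    # For example, a hypothetical layout entry in a site file (the URL and
    # markers here are illustrative only):
    #
    #   LayoutURL: http://www.example.com/
    #   StoryStart: <!-- begin body -->
    #   StoryEnd: <!-- end body -->
    #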
    if (/^LayoutURL: (.*)$/) {
      &FinishConfigSection ($sect, $url);
      $url = &expand_url_magic ($1); $sect = '';

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      if (!defined $site_defined_at{$url}) {
	# allow extra parameters to be added to an existing layout
	&SetDefaultConfigForURL ($url);
	push (@unsorted_layouts, $url);
      }

      $site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    # ExceptionURL is like LayoutURL, but it takes priority over
    # both LayoutURL and the normal site file rules. This way you
    # can define bits of a site that uses different layouts, caching
    # rules etc. by matching pages' URLs against the ExceptionURL
    # regular expression.
    #
    if (/^ExceptionURL: (.*)$/) {
      &FinishConfigSection ($sect, $url);
      $url = &expand_url_magic ($1); $sect = '';

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      if (!defined $site_defined_at{$url}) {
	# allow extra parameters to be added to an existing exception
	&SetDefaultConfigForURL ($url);
	push (@unsorted_exceptions, $url);
      }

      $site_defined_at{$url} = $confline;
      $curkey = $url;
      next;
    }

    if (!defined $curkey || $curkey eq '') {
      my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
      die "Configuration line invalid (outside URL scope?) in $line:\n  $_\n";
    }

    /^Name: (.*)$/ and ($name{$curkey} = $1), next;
    /^Description: (.*)$/ and ($desc{$curkey} = $1), next;
    /^Active: (.*)$/ and ($active{$curkey} = $1+0), next;
    /^Levels: (.*)$/ and ($levels{$curkey} = $1-2), next;
    /^AddURL: (.*)$/ and ($extra_urls{$curkey} .= ' '.&expand_url_magic($1)), next;
    /^RequireCookie: (.*)$/ and ($req_cookie{$curkey} = $1), next;

    /^Level(\d+)LinksStart: (.*)$/ and ($links_start{($1-2)." $curkey"} = $2), next;
    /^Level(\d+)LinksEnd: (.*)$/     and ($links_end{($1-2)." $curkey"} = $2), next;
    /^Level(\d+)Print: (.*)$/      and ($links_print{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)TrimAfterLinks: (.*)$/   and ($links_trim{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)Cache?able: (.*)$/     and ($cacheable{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)Diff: (.*)$/        and ($links_diff{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)UseTableSmarts: (.*)$/ and ($use_table_smarts{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)FollowLinks: (.*)$/ and ($links_follow_links{($1-2)." $curkey"} = $2+0), next;
    /^Level(\d+)AddURL: (.*)$/ and
    		($extra_urls{($1-2)." $curkey"} .= ' '.&expand_url_magic($2)), next;

    if (/^Level(\d+)URL: (.*)$/) {
      my $lev = $1;
      my $pat = $2;
      if (!defined ($links_limit_to{($lev-2)." $curkey"})) {
	$links_limit_to{($lev-2)." $curkey"} = "($pat)";
      } else {
	$links_limit_to{($lev-2)." $curkey"} =~ s/\)$/|$pat)/g;
      }
      next;
    }

    /^IssueLinksStart: (.*)$/ and ($links_start{"1 $curkey"} = $1), next;
    /^IssueLinksEnd: (.*)$/     and ($links_end{"1 $curkey"} = $1), next;
    /^IssuePrint: (.*)$/      and ($links_print{"1 $curkey"} = $1+0), next;
    /^IssueTrimAfterLinks: (.*)$/   and ($links_trim{"1 $curkey"} = $1+0), next;
    /^IssueCache?able: (.*)$/     and ($cacheable{"1 $curkey"} = $1+0), next;
    /^IssueDiff: (.*)$/        and ($links_diff{"1 $curkey"} = $1+0), next;
    /^IssueUseTableSmarts: (.*)$/ and ($use_table_smarts{"1 $curkey"} = $1+0), next;
    /^IssueFollowLinks: (.*)$/ and ($links_follow_links{"1 $curkey"} = $1+0), next;
    /^IssueAddURL: (.*)$/ and
    		($extra_urls{"1 $curkey"} .= ' '.&expand_url_magic($1)), next;

    # Normally Issue-level stuff is the highest level, so this would seem to
    # be irrelevant as we never would have to decide whether a URL is the
    # issues page since it's provided in the site file. However the
    # IssueFollowLinks parameter provides a need for this.
    if (/^IssueURL: (.*)$/) {
      my $pat = $1;
      if (!defined ($links_limit_to{"1 $curkey"})) {
	$links_limit_to{"1 $curkey"} = "($pat)";
      } else {
	$links_limit_to{"1 $curkey"} =~ s/\)$/|$pat)/g;
      }
      next;
    }

    if (/^ContentsFormat: (.*)$/) {
      my $fmt = $1;
      if ($fmt eq 'rss') {
	# set up defaults for a Rich Site Summary site.
	# cf. http://my.netscape.com/publish/
	$site_format{$url} = 'rss';
	$links_start{"0 $curkey"} = '(<rdf:RDF|<rss version=|<scriptingNews)';
	$links_end{"0 $curkey"} = '(</rdf:RDF>|</rss>|</scriptingNews>)';
	$links_diff{"0 $curkey"} = 1;
	$levels{$url} = 0;

      } elsif ($fmt eq 'html') {
	# the default -- do nothing.

      } else {
	&sitewarn_file_line ($confline, "Unrecognised ContentsFormat: $_\n");
      }
      next;
    }

    /^ContentsStart: (.*)$/   and ($links_start{"0 $curkey"} = $1), next;
    /^ContentsEnd: (.*)$/       and ($links_end{"0 $curkey"} = $1), next;
    /^ContentsPrint: (.*)$/   and ($links_print{"0 $curkey"} = $1+0), next;
    /^ContentsTrimAfterLinks: (.*)$/   and ($links_trim{"0 $curkey"} = $1+0), next;
    /^ContentsCache?able: (.*)$/  and ($cacheable{"0 $curkey"} = $1+0), next;
    /^ContentsSkipURL: (.*)$/  and ($links_skip{"0 $curkey"} = $1), next;
    /^ContentsDiff: (.*)$/     and ($links_diff{"0 $curkey"} = $1+0), next;
    /^ContentsUseTableSmarts: (.*)$/ and ($use_table_smarts{"0 $curkey"} = $1+0), next;
    /^ContentsFollowLinks: (.*)$/	and ($links_follow_links{"0 $curkey"} = $1+0), next;
    /^ContentsAddURL: (.*)$/ and
    		($extra_urls{"0 $curkey"} .= ' '.&expand_url_magic($1)), next;

    if (/^ContentsURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);

      if (!defined ($links_limit_to{"0 $curkey"})) {
	$links_limit_to{"0 $curkey"} = "($pat)";
      } else {
	$links_limit_to{"0 $curkey"} =~ s/\)$/|$pat)/g;
      }
      next;
    }

    /^StoryStart: (.*)$/	and ($story_start{$curkey} = $1), next;
    /^StoryEnd: (.*)$/		and ($story_end{$curkey} = $1), next;
    /^StoryCache?able: (.*)$/	and ($cacheable{"s $curkey"} = $1+0), next;
    /^StoryDiff: (.*)$/		and ($story_diff{$curkey} = $1+0), next;
    /^StorySkipURL: (.*)$/	and ($story_skip{$curkey} = $1), next;
    /^StoryHeadline: (.*)$/	and ($head_pat{$curkey} = $1), next;
    /^StoryToPrintableSub: (.*)$/	and ($printable_sub{$curkey} = $1), next;
    /^(Story|)UseTableSmarts: (.*)$/ and ($use_table_smarts{$curkey} = $2+0), next;
    /^StoryFollowLinks: (.*)$/	and ($story_follow_links{$curkey} = $1+0), next;
    /^StoryLifetime: (.*)$/	and ($story_lifetime{$curkey} = $1+0), next;
    /^StoryHTMLHeader: (.*)$/ and ($story_html_header{$curkey} = $1), next;
    /^StoryHTMLFooter: (.*)$/ and ($story_html_footer{$curkey} = $1), next;
    /^StoryAddURL: (.*)$/ and
    		($extra_urls{"-1 $curkey"} .= ' '.&expand_url_magic($1)), next;

    if (/^UseAltTagForURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $use_alt_tags{$curkey} = &AddRegexpToSet ($use_alt_tags{$curkey}, $pat);
      next;
    }

    if (/^NeedLoginURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $need_login_url{$curkey} = &AddRegexpToSet ($need_login_url{$curkey}, $pat);
      next;
    }

    if (/^StoryURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $story_limit_to{$curkey} = &AddRegexpToSet ($story_limit_to{$curkey}, $pat);
      next;
    }

    if (/^ImageURL: (.*)$/) {
      my $pat = &AddHostToURL ($curkey, $1);
      $imageurl{$curkey} = &AddRegexpToSet ($imageurl{$curkey}, $pat);
      next;
    }

    if (/^(URL)Process: (.*)$/) {
      my $type = $1;
      my $val = $2;
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'URL') { $url_postproc{$curkey} = $val; }
      }
      next;
    }

    if (/^(Story)PostProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'Story') { $story_postproc{$curkey} = $val; }
      }
      next;
    }

    if (/^EvaluatePerl: (.*)$/) {
      my $val = $1;

      if ($val =~ s/^\{//) #}
      {
	$postproctype = "Eval";
	$postproc = $val;
      } else {
	$eval_code{$curkey} = $val;
      }
      next;
    }

    if (/^(Contents|Issue|Level\d+)HTMLPreProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;

      my $lev;
      ($type eq 'Contents') &&	($lev = 0);
      ($type eq 'Issue') &&	($lev = 1);
      ($type =~ /Level(\d+)/) && ($lev = $1-2);

      if ($val =~ s/^\{//) #}
      {
	$postproctype = "LinksPre $lev";
	$postproc = $val;
      } else {
	$links_preproc{"$lev $curkey"} = $val;
      }
      next;
    }

    if (/^(Story)HTMLPreProcess: (.*)$/) {
      my $type = $1;
      my $val = $2;
      ($type eq 'Story') && ($type = 'StoryPre');
      if ($val =~ s/^\{//) #}
      {
	$postproctype = $type;
	$postproc = $val;
      } else {
	if ($type eq 'StoryPre') { $story_preproc{$curkey} = $val; }
      }
      next;
    }

    if (/^Section: (.*)$/) {
      &FinishConfigSection ($sect, $url);
      $sect = $1;

      if ($sect !~ m,^(http|file)://,i) {
	if ($sect !~ m,^/,i) {
	  $sect = 'http://'.$sect;
	} else {
	  $url =~ m,((http|file)://[^/]+)/,; $sect = $1.$sect;
	}
      }
      if ($sect =~ m,(http|file)://[^/]+$,) { $sect .= '/'; }
      $sect = &expand_url_magic ($sect);
      $sections{$url} .= "|||$sect";
      $levels{$sect} = $levels{$url};
      $active{$sect} = 1;
      $extra_urls{$sect} = '';
      $site_defined_at{$sect} = $confline;
      $curkey = $sect;
      next;
    }

    my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1" line $2/g;
    &sitewarn_file_line ($confline, "Unrecognised in $line:\n  $_\n");
  }

  if (defined $postproctype) {
    &sitewarn_file_line ($confline,
	  "Fell off end of ${postproctype}PostProcess statement!\n");
  }

  &FinishConfigSection ($sect, $url);
  undef @conf;
  undef @conflines;
}

# ---------------------------------------------------------------------------

sub find_sites { push(@found,$File::Find::name) if (-f $_ && /\.site?$/i); }

sub ReadSitesDir {
  my ($file, $key);
  my %sites_restrict_to = ();
  %read_sites = ();

  @site_files_to_read = ();
  if ($#sites_restrict_to >= 0) {
    &verbose ("Restricting to sites: ".join (' ', @sites_restrict_to));
    foreach $key (@sites_restrict_to) {
      if (-r $key) {
	# it's a site file, not a name, add it to the list
	push (@site_files_to_read, $key);
      }

      $sites_restrict_to{$key} = 1;
    }

  } elsif ($#main::sites_additional >= 0) {
    # only scoop sites from the site_choices list if the -site argument
    # was not used.
    #
    &verbose ("Adding site: ".join (' ', @main::sites_additional));
    foreach $key (@main::sites_additional) {
      if (-r $key) { push (@site_files_to_read, $key); }
    }
  }

  foreach $key (@main::layout_site_files) {
    &verbose ("Adding layout: $key");
    if (-r $key) { push (@site_files_to_read, $key); }
  }

  if ($#main::cmdline_urls >= 0) {
    # we're only snarfing the command-line URLs, skip the predefined sites
    return;
  }

  if (defined $sitesdir) {
    @found = (); find(\&find_sites, $sitesdir);
    foreach $file (@found) {
      next if ($file =~ /(\.swp$|core|\.bak$|\~$|^#)/);	# skip backups, etc.
      next if (-d $file);		# skip directories

      if ($#sites_restrict_to >= 0) {
	my $base = $file; $base =~ s,^.*[\/\\:]([^\/\\:]+)$,$1,g;
	&main::dbg ("checking if site file is wanted: $file");
	next unless (defined $sites_restrict_to{$base}
		|| defined $sites_restrict_to{$file}
		|| $file =~ /layouts\.site/i);
      }
      push (@site_files_to_read, $file);
    }
  }

  if ($#site_files_to_read < 0) {
    warn "\n".
      "No sites were read. Perhaps the -site parameter named a non-existent\n".
      "file, or the \"sites\" directory could not be found?\n";
  }

  foreach $file (@site_files_to_read) {
    next if (defined $read_sites{$file});
    $read_sites{$file} = 1;		# don't read the same file twice

    if (open (IN, "< $file")) {
      my $line = 0;
      while (<IN>) {
	push (@conf, $_);
	push (@conflines, "$file:$line"); $line++;
      }
      close IN;
      &verbose ("Scooping site from file \"$file\".");

    } else {
      &sitewarn_file_line ("$file:0", "Cannot read $file\n");
    }
  }
}

# ---------------------------------------------------------------------------
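# Read the ticked entries from site_choices.txt (creating it first if it
# does not exist) and add their filenames to the list of sites to scoop.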

sub ReadSiteChoices {
  my $choicefile = "${tmpdir}${slash}site_choices.txt";

  if (!-r $choicefile) {
    &EditSiteChoices ($choicefile);	# or create it in this case
  }

  &main::verbose ("Using site choices from $choicefile ...");
  open (IN, "<$choicefile") or return;

  my $samplesdir = "${sitescooperdir}${slash}site_samples";
  while (<IN>) {
    if (/^\s*\[\s*x\s*\]/i) {
      while (<IN>) {
	if (/^\s*Filename:\s*(\S+)\s*$/) {
	  $_ = $1; s/\[samples\]/${samplesdir}/g;
	  if (/layouts\.site/) { push (@main::layout_site_files, $_); }
	  else { push (@main::sites_additional, $_); }
	  last;
	}
      }
    }
  }
  close IN;
}

# ---------------------------------------------------------------------------
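# Write a fresh site_choices.txt listing every site file found under
# site_samples, then run the user's editor on it so they can tick the
# sites they want.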

sub EditSiteChoices {
  my $choicefile = shift;
  my $samplesdir = "${sitescooperdir}${slash}site_samples";

  warn "Creating/editing 'site_choices.txt' file...\n";
  if (!-d $samplesdir) {
    warn "Cannot find 'site_samples' directory, not creating site_choices.\n".
    	"(looked for '$samplesdir')\n";
    return;
  }

  my %chosen_sites = ();
  foreach $_ (@main::sites_additional) { $chosen_sites{$_} = 1; }

  if (!open (CHOICE, ">$choicefile")) {
    warn "Cannot create 'site_choices.txt' file $choicefile\n";
    return;
  }

  print CHOICE <<EOHDR;
Please pick the site files you wish to use here.  Put an X in the box
beside the sites you wish to scoop.

If you want to use the traditional 'sites' directory, or you have your own
site files not in this list, then do not put an X in any of the boxes.
Sitescooper will supplement what you have ticked here with the contents
of your 'sites' directory, if it exists.

EOHDR

  @found = (); find(\&find_sites, $samplesdir);

  $samplespat = $samplesdir;
  $samplespat =~ s/([^-_:A-Za-z0-9])/\\$1/g;
  
  foreach $file (@found) {
    $pretty = $file; $pretty =~ s,^${samplespat},\[samples\],g;

    if ($file =~ /layouts\.site/) {
      print CHOICE "    [x] (Site layouts for common sites)\n".
	      "\tFilename: $pretty\n\n";
      next;
    }

    ($url, $name, $desc) = &ReadSiteForChoices($file);
    if (!defined $url) { next; }
    if (!defined $name) { $name = $url; }

    if (defined $desc) { $desc = "\t($desc)\n"; }
    else { $desc = ''; }

    $chosen = ' ';
    if (defined $chosen_sites{$pretty}) { $chosen = 'x'; }

    print CHOICE "    [$chosen] $name\n\tURL: $url\n".
    		"\tFilename: $pretty\n$desc\n";
  }

  close CHOICE or die "failed to write to site_choices file";

  my $edit;
  if (defined $ENV{'EDITOR'}) {
    $edit = $ENV{'EDITOR'};
  } elsif (&Portability::MyOS eq 'UNIX') {
    $edit = 'vi';
  } elsif (&Portability::MyOS eq 'Win32') {
    $edit = 'notepad.exe';
  } elsif (&Portability::MyOS eq 'Mac') {
    # REVISIT -- don't know what to do here ;)
    warn "\nIf you wish to choose which sites to scoop from a list\n".
    	"of pre-defined sites, stop this script now, edit the file\n".
	"$choicefile\n".
	"and re-run it.\n\n";
    return;
  }

  warn "Running editor for site_choices.txt file using command $edit...\n";
  system ($edit, $choicefile);

  if (($? >> 8) != 0) {
    die "The command failed. Please edit $choicefile\n".
	"by hand and re-run sitescooper.\n\n";
  }
}

# ---------------------------------------------------------------------------
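# Pull the Name:, Description: and URL: fields out of a site file, for use
# when building the site_choices.txt listing.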

sub ReadSiteForChoices {
  my $file = shift;
  return if ($file =~ /(\.swp$|core|\.bak$|\~$|^#)/);
  return if (-d $file);

  open (IN, "<$file") || next;
  my ($url, $sitename, $desc);
  $url = $sitename = $desc = undef;
  while (<IN>) {
    s/*$//g; s/#.*$//g;
    /^\s*Name:\s*(.*)$/ and ($sitename = $1), next;
    /^\s*Description:\s*(.*)$/ and ($desc = $1), next;
    /^\s*URL:\s*(.*)$/ and ($url = $1), next;
  }
  close IN;

  ($url, $sitename, $desc);
}

# ---------------------------------------------------------------------------
# Default configuration for a newly-specified URL.

sub SetDefaultConfigForURL {
  my $url = shift;

  $sections{$url} = "";		# none yet
  $active{$url} = 1;		# active by default
  $name{$url} = $url;		# default name - the URL
  $use_table_smarts{$url} = 1;	# use smarts
  $levels{$url} = -1;		# 1-level site
  $extra_urls{$url} = '';	# no extra URLs
  $story_lifetime{$url} = 90;	# don't scoop stories older than 3 months
  $links_trim{"0 $url"} = 1024;	# trim after last href + 1024 chars (contents)
  $links_trim{"1 $url"} = 1024;	# trim after last href + 1024 chars (issue)

  # default limit to articles at the same site
  if ($url =~ m,^((http|file)://[^/]*/),i) {
    $story_limit_to{$url} = $1.'.*';
  } else {
    &sitewarn_file_line ($confline,
    	"Unsupported URL protocol for URL '".$url."'.\n");
  }
}

# ---------------------------------------------------------------------------
# Incorporate defaults from the main URL into each Section.
#
sub FinishConfigSection {
  my $sect = shift;
  my $url = shift;

  if ($sect ne '') {
    if (!defined $name{$sect}) { $name{$sect} = $url; }
    if (!defined $desc{$sect}) { $desc{$sect} = $url; }
    if (!defined $story_start{$sect}) { $story_start{$sect} = $story_start{$url}; }
    if (!defined $story_end{$sect}) { $story_end{$sect} = $story_end{$url}; }
    if (!defined $head_pat{$sect}) { $head_pat{$sect} = $head_pat{$url}; }
    if (!defined $printable_sub{$sect})
		{ $printable_sub{$sect} = $printable_sub{$url}; }
    if (!defined $use_alt_tags{$sect})
		{ $use_alt_tags{$sect} = $use_alt_tags{$url}; }
    if (!defined $imageurl{$sect})
		{ $imageurl{$sect} = $imageurl{$url}; }
    if (!defined $story_limit_to{$sect})
		{ $story_limit_to{$sect} = $story_limit_to{$url}; }
    if (!defined $story_skip{$sect}) { $story_skip{$sect} = $story_skip{$url}; }
    if (!defined $story_diff{$sect}) { $story_diff{$sect} = $story_diff{$url}; }
    if (!defined $story_follow_links{$sect})
    		{ $story_follow_links{$sect} = $story_follow_links{$url}; }
    if (!defined $story_lifetime{$sect})
    		{ $story_lifetime{$sect} = $story_lifetime{$url}; }
    if (!defined $active{$sect}) { $active{$sect} = $active{$url}; }

    # If the main site is disabled, so are the sub-sites.
    if ($active{$url} == 0) {
      $active{$sect} = 0;
    }

    $levels{$sect} = $levels{$url};
    for ($lev = $levels{$url}; $lev >= 0; $lev--)
    {
      if (!defined $links_start{"$lev $sect"}) {
	$links_start{"$lev $sect"} = $links_start{"$lev $url"};
      }
      if (!defined $links_end{"$lev $sect"}) {
	$links_end{"$lev $sect"} = $links_end{"$lev $url"};
      }
      if (!defined $links_skip{"$lev $sect"}) {
	$links_skip{"$lev $sect"} = $links_skip{"$lev $url"};
      }
      if (!defined $links_diff{"$lev $sect"}) {
	$links_diff{"$lev $sect"} = $links_diff{"$lev $url"};
      }
      if (!defined $links_trim{"$lev $sect"}) {
	$links_trim{"$lev $sect"} = $links_trim{"$lev $url"};
      }
      if (!defined $links_print{"$lev $sect"}) {
	$links_print{"$lev $sect"} = $links_print{"$lev $url"};
      }
      if (!defined $links_follow_links{"$lev $sect"}) {
	$links_follow_links{"$lev $sect"} = $links_follow_links{"$lev $url"};
      }
      if (!defined $links_limit_to{"$lev $sect"}) {
	$links_limit_to{"$lev $sect"} = $links_limit_to{"$lev $url"};
      }
    }
  }
}

# ---------------------------------------------------------------------------
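# Add an alternative to a regexp alternation, building "(a)" up into
# "(a|b|c)" as more patterns are added.  Illustrative use (hypothetical
# pattern): $pat = &AddRegexpToSet ($pat, 'foo\.html');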

sub AddRegexpToSet {
  my $regexp = shift;
  my $pat = shift;
  if (!defined ($regexp)
	      || $regexp !~ /\)$/)
  {
    $regexp = "($pat)";
  } else {
    $regexp =~ s/\)$/|${pat})/g;
  }
  $regexp;
}

# ---------------------------------------------------------------------------
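# Create the temporary, cache, PRC-install and output directories for this
# user, load the cookie jar if one was saved, and open the debug logfile.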

sub make_dirs {
  if (!-d $tmpdir) {
    mkdir ($tmpdir, 0777) || die "failed to mkdir '$tmpdir'\n";
  }
  chdir ($tmpdir) or die "cannot cd to $tmpdir\n";

  $user_tmpdir = "$tmpdir${slash}sitescooper_$userid";

  # passwords for sitescooper caches are not fully implemented right now!
  #
  # if ($main::cgimode) {
  #   open (PWD, "< $user_tmpdir${slash}passwd");
  #   my $pwd = <PWD>; close PWD;
  #   my $salt = substr($pwd, 0, 2);
  #   if (crypt ($main::password, $salt) ne $pwd) {
  #     &ScoopCGI::passwd_failed; exit;
  #   }
  # }

  if (!-d $user_tmpdir) {
    mkdir ($user_tmpdir, 0777) || die "failed to mkdir '$user_tmpdir'\n";
  }

  if (-f "$user_tmpdir/cookies") {
    $main::cookie_jar->load ("$user_tmpdir/cookies");
  }

  if (!defined $pilotinstdir) {
    $pilotinstdir = "$user_tmpdir${slash}prc";
    if (!-d $pilotinstdir) {
      mkdir ($pilotinstdir, 0777) || die "failed to mkdir '$pilotinstdir'\n";
    }

    if (!$cgimode && !defined $pilotinstapp) {
      &verbose ("Warning: since no PilotInstallDir was specified".
      	" in the configuration,\nI\'ll use $pilotinstdir .\n");
    }
  }

  if ($main::debug) {
    open (LOGFILE, "> $user_tmpdir${slash}log.txt");
    select LOGFILE; $| = 1; select STDOUT;
  }

  if ($outdir eq '') { $outdir = "$user_tmpdir${slash}txt"; }
  if (!-d $outdir) {
    mkdir ($outdir, 0777) || die "failed to mkdir '$outdir'\n";
  }

  $cachedir = "$user_tmpdir${slash}cache"; $newcachedir = $cachedir;
  if (!-d $cachedir) {
    mkdir ($cachedir, 0777) || die "failed to mkdir '$cachedir'\n";
  }

  if (defined $sharedcache) {
    if (!-d $sharedcache) {
      mkdir ($sharedcache, 0777) || die "failed to mkdir '$sharedcache'\n";
    }
  }

  $alreadyseen = "$user_tmpdir${slash}already_seen.txt"; $newalreadyseen = $alreadyseen;

  if ($nowrite) {
    $newcachedir = "$user_tmpdir${slash}new_cache";
    if (!-d $newcachedir) {
      mkdir ($newcachedir, 0777) || die "failed to mkdir '$newcachedir'\n";
    }
    $newalreadyseen = "$user_tmpdir${slash}new_already_seen.txt";
  }

  # check for spaces on Win32 -- MakeDocW can't handle them!
  # Thx to wgoosey /at/ servtech.com for spotting this one.
  if ($main::outputfilter eq 'makedoc') {
    if (&Portability::MyOS eq 'Win32') {
      if ($outdir =~ / /) {
	warn "

Warning: the output directory's path contains spaces. The MakeDocW
conversion tool does not support this, so you may need to move Sitescooper
(and its temporary directory) somewhere without spaces, e.g. C:\\Sitescooper,
for this conversion to work!  (This is a bug in MakeDocW.exe.)

";
      }
    }
  }
}

# ---------------------------------------------------------------------------
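# Expire files in the cache (and shared cache) that are older than
# $main::expiry_days.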

sub expire_old_cache_files {

  # Don't expire pilotinstdir and textsavedir -- it is too risky if
  # the user has made a typo in selecting the directories for this.
  #
  #if (defined $pilotinstdir) {
    #sub expire_prcdir { unlink if (-f $_ && -M $_ > $main::expiry_days); }
    #find(\&expire_prcdir, $pilotinstdir);
  #}

  #if (defined $outdir) {
    #if (!$main::use_convert_tool) {
      #sub expire_outdir { unlink if (-f $_ && -M $_ > $main::expiry_days); }
      #find(\&expire_outdir, $outdir);
    #}
  #}

  sub expire_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); }
  find(\&expire_cache, $cachedir);

  if (defined $sharedcache) {
    sub expire_shared_cache { unlink if (-f $_ && -M $_ > $main::expiry_days); }
    find(\&expire_shared_cache, $sharedcache);
  }
}

# ---------------------------------------------------------------------------
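# Load the already_seen URL cache, recording each URL's last-modified time
# and the oldest entry seen per host (used later when pruning the cache).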

sub read_state {
  if ($main::refresh == 0) {
    if (!open (IN, "< $alreadyseen")) {
      &verbose ("Cannot read $alreadyseen, creating a new one");

    } else {
      my $url;
      my $mod;
      my $urlhost;
      while (<IN>) {
	next unless (/ lastmod=(\d+)$/);
	$url = $`; $mod = $1;
	$already_seen{$url} = 1;
	$last_modtime{$url} = $mod+0;

	if ($url =~ m,http://(\S+?)/,) {
	  $urlhost = $1;
	  if (defined($main::oldest_already_seen_this_site{$urlhost})
	      ? $main::oldest_already_seen_this_site{$urlhost} > $mod : 1)
	  {
	    $main::oldest_already_seen_this_site{$urlhost} = $mod;
	  }
	}
      }
      close IN;
    }
  }
}

# ---------------------------------------------------------------------------
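# For each active site/section, expand the filename and title templates
# ("Site" and "Section" are replaced with the respective names) and record
# the temporary, output, sync-file and title mappings under a file key.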

sub generate_output_filenames {
  my @sites = @_;
  my %already_done = ();

  foreach $site (@sites) {
    my @urls = ($site);
    if ($sections{$site} ne "") {
      @urls = split (/\|\|\|/, $sections{$site});
    }

    foreach $url (@urls) {
      next if ($url eq '');

      if (defined $req_cookie{$url}) {
	($cookie_host, $cookie_key) = split (' ', $req_cookie{$url});

	$gotit = 0;
	sub chk_for_reqd_cookie {
	  #&main::dbg ("checking cookies: $_[4], $_[1]");
	  if ($_[4] eq $cookie_host && $_[1] eq $cookie_key) { $gotit = 1; }
	}
	$main::cookie_jar->scan (\&chk_for_reqd_cookie);

	if (!$gotit) {
	  my $line = $confline; $line =~ s/^(.*):(.*?)$/"$1"/g;
	  &verbose ("Cookie from $cookie_host is not imported, not scooping $line.");
	  $active{$url} = 0;
	}
      }

      next unless ($active{$url} == 1);

      my $filekey = $site.$url;

      $sitename = $name{$site};
      if (!defined $sitename) { $sitename = $url; }

      $sectname = '';
      if ($site ne $url) { $sectname = "_".$name{$url}; }

      my $filedesc = $filename_template;
      $filedesc =~ s/Site/${sitename}/g;
      $filedesc =~ s/Section/${sectname}/g;
      $filedesc =~ s/[^-_A-Za-z0-9]+/_/g;
      $filedesc =~ s/^[ _]+//g; $filedesc =~ s/[ _]+$//g;

      if (&Portability::MyOS eq 'Mac') {
	# try to limit the full filename to 32 characters ('.pages' suffix included)
	$filedesc =~ s/^(.{26}).*$/$1/g;
      }

      $outfile = $outdir.$slash.$filedesc.'.pages';

      if (&writing_html) {
	$outidxfile = $filedesc.'.html';
      } else {
	$outidxfile = $filedesc.'.txt';
      }

      next if (defined $already_done{$outfile});
      $already_done{$outfile} = 1;

      $outtmp = $outdir.$slash.$filedesc.'.tmp';

      $sectname =~ s/_+$//; $sectname =~ s/^_+//;
      my $secttitle = $prc_title;
      $secttitle =~ s/Site/${sitename}/g;
      $secttitle =~ s/Section/${sectname}/g;
      $secttitle =~ s/^[ _]+//g; $secttitle =~ s/[ _]+$//g;

      $main::key2tmp{$filekey} = $outtmp;
      $main::key2outfile{$filekey} = $outfile;
      $main::key2outidxfile{$filekey} = $outidxfile;
      $main::key2title{$filekey} = $secttitle;
      $main::key2sitename{$filekey} = $sitename;
      $main::key2site{$filekey} = $site;
      $main::key2url{$filekey} = $url;
      push (@main::filekeys, $filekey);

      if ($main::dumpprc) {
	$main::key2syncfile{$filekey} = $outtmp;	# reuse it!
      } else {
	$main::key2syncfile{$filekey} =
		    $pilotinstdir.$slash.$filedesc.'.prc';
      }
    }
  }
}

# ---------------------------------------------------------------------------
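# The main scooping loop: for each file key, download the front page(s) and
# stories, assemble the output file(s), and optionally convert them to the
# target format straight away.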

sub get_all_sites {
  my $convert_now = shift;
  if (!defined $convert_now) { $convert_now = 0; }

  foreach $filekey (@main::filekeys) {
    $outfile = $key2outfile{$filekey};
    $outtmp = $key2tmp{$filekey};
    my $outidxfile = $key2outidxfile{$filekey};
    my $secttitle = $key2title{$filekey};
    my $sitename = $key2sitename{$filekey};
    my $site = $key2site{$filekey};
    my $url = $key2url{$filekey};

    if (!defined $sitename) { $sitename = $url; }

    # This apparently is needed on MacOS. Sounds unlikely, but there it
    # is...
    if (&Portability::MyOS eq 'Mac') {
      my $parentdir = dirname($outfile);
      if (!-d $parentdir) { mkdir ($parentdir, 0755); }
    }

    $_ = $site_defined_at{$url}; /^(.*):/; my $site_file_name = $1;
    &verbose ("SITE START: now scooping site \"$site_file_name\".");

    &main::dbg ("tmp dir: $outtmp, output dir: $outfile");
    (-d $outtmp) && &rmtree ($outtmp);

    mkdir ($outtmp, 0755) || die "cannot mkdir $outtmp\n";
    &clear_page_tmpfiles;
    $main::output_filename = $outtmp.$slash.$outidxfile;

    # evaluate perl code for this site.
    my $proc = get_layout_param ('eval_code', $url, $url);
    if (defined $proc) {
      my $skip_site = 0;
      if (!eval $proc."; 1;") {
	&sitewarn("EvaluatePerl failed: $@");
      } else {
	if ($skip_site) {
	  &main::dbg ("skip_site set, skipping this site.");
	  next;
	}
      }
    }

    %main::output_file = ();
    $main::output_file{'MAIN'} = '';
    %main::output_links_snarfed = ();
    %main::oldest_already_seen_this_site = ();
    @main::output_story_urls = ();
    my $upindex = $main::current_story_index = 0;

    my $hdr;
    if ($main::writeheader) {
      if ($main::outstyle == $OUT_HTML) {
	$hdr = "<html><head><title>$secttitle</title></head>".
		"<body><h1>$secttitle</h1>\n";
      } else {
	$hdr = "$secttitle\n\n\n";
      }

    } else {
      if ($main::outstyle == $OUT_HTML) {
	$hdr = "<html><head></head><body>\n";
      } else {
	$hdr = "";
      }
    }
    $main::output_file{'MAIN'} .= $hdr;

    $stories_found = 0;
    $file_size = 0;
    $hit_file_size_limit = 0;

    &set_got_intr_behaviour ('setflag');
    my $u;
    foreach $u ($url, split (' ', $extra_urls{$url})) {
      # if we were interrupted, clear the flag and go on
      if ($got_intr_flag) { &set_got_intr_behaviour ('setflag'); }
      &add_page_tmpfile ($outtmp.$slash.$outidxfile, $u);

      if ($levels{$url} >= 0) {
	&download_front_page ($u, $url, $levels{$url}, $upindex);
      } else {
	&download_story_page ($u, $url, 1, $upindex);
      }
    }

    # Now go through any additional URLs at the different levels, starting
    # at the highest level and working down.
    #
    my $lev;
    for ($lev = $levels{$url}; $lev >= -1; $lev--)	# -1 = story level
    {
      next unless (defined $extra_urls{"$lev $url"});

      foreach $u (split (' ', $extra_urls{"$lev $url"}))
      {
	# if we were interrupted, clear the flag and go on
	if ($got_intr_flag) { &set_got_intr_behaviour ('setflag'); }
	&add_page_tmpfile ($outtmp.$slash.$outidxfile, $u);

	if ($lev >= 0) {
	  &download_front_page ($u, $url, $lev, $upindex);
	} else {
	  &download_story_page ($u, $url, 1, $upindex);
	}
      }
    }

    &set_got_intr_behaviour ('exit');

    if ($stories_found > 0) {
      &verbose ("$secttitle: $stories_found stories downloaded (".
	      sprintf ("%3.1f", $file_size/1024)." K uncompressed).");

      my $ind = $main::current_story_index;
      foreach $ofkey (keys %main::output_file) {
	# convert sitescooper navigation links: [<<][^][>>]
	my $story = $main::output_file{$ofkey};

	# trim off the first and last ones anyway
	$story =~ s/\[<a href=\"__SITESCOOPER_STORY_(-1|${ind})\">.*?<\/a>\]//g;

	# and run through the rest
	for ($i = 0; $i < $ind; $i++) {
	  next unless (defined $main::output_story_urls[$i]);
	  $story =~ s/\"__SITESCOOPER_STORY_${i}\"/\"${main::output_story_urls[$i]}\"/g;
	}

	# remove stray links
	$main::output_file{$ofkey} = &remove_external_links ($story);
      }

      my $blurb1 = "(End of snarf - copyright retained by original ".
	"providers.";
      my $blurb2 = "Downloaded and converted by sitescooper; see ".
	"$main::home_url )";

      if ($main::writefooter) {
	if ($main::outstyle == $OUT_HTML) {
	  $main::output_file{'MAIN'} .= "\n\n<p><hr><i>$blurb1 $blurb2</i>\n";
	} elsif ($main::outstyle == $OUT_DOC) {
	  $main::output_file{'MAIN'} .= "$blurb1 $blurb2\n";
	} else {
	  $main::output_file{'MAIN'} .= "$blurb1\n$blurb2\n";
	}
      }

      if ($main::outstyle == $OUT_HTML) {
	$main::output_file{'MAIN'} .= "</body></html>\n";
      } elsif ($main::outstyle == $OUT_DOC) {
	$main::output_file{'MAIN'} .= "<$main::bookmark_char>\n";
      }

      open (OUTFILE, "> $main::output_filename")
		      or die "Failed to create $main::output_filename\n";
      print OUTFILE $main::output_file{'MAIN'};
      close OUTFILE or warn "Failed to write to $main::output_filename";

      if (!$main::fileperpage) {
	if ($main::dump) {
	  open (IN, "<".$outtmp.$slash.$outidxfile);
	  while (<IN>) { print STDOUT; }
	  close IN; &rmtree ($outtmp);

	  # ensure we don't try to convert it later
	  undef $main::key2syncfile{$filekey};

	} else {
	  &rmtree ($outfile); rename ($outtmp, $outfile);
	  if ($convert_now) { &convert_output($filekey); }
	}

      } else {
	foreach $_ (keys %main::output_file) {
	  next if ($_ eq 'MAIN');
	  open (OUTFILE, "> $_")
			  or die "Failed to create $_\n";
	  print OUTFILE $main::output_file{$_};
	  close OUTFILE or warn "Failed to write to $_";
	}

	&rmtree ($outfile); rename ($outtmp, $outfile);

	if ($main::dump) {
	  # print the path to the created directory containing the pages
	  print $outfile."\n";
	  # ensure we don't try to convert it later
	  undef $main::key2syncfile{$filekey};

	} else {
	  if ($convert_now) { &convert_output($filekey); }
	}
      }

      &main::dbg ("output dir: $outfile");
      &main::dbg ("output index: ".$outfile.$slash.$outidxfile);

      my ($from, $to);
      while (($from,$to) = each %main::oldest_already_seen_this_site) {
	&main::dbg ("Saving new already_seen age cache entry: $from => $to ".
		"(".&time2datestr($to).")");
	$main::oldest_already_seen{$from} = $to;
      }

    } else {
      close OUTFILE;
      &verbose ("$secttitle: no new stories, ignoring.");
      &main::dbg ("(Not setting already_seen age cache since no links were followed)");

      undef $main::key2syncfile{$filekey};
      &rmtree ($outtmp);
    }

    &verbose ("SITE END: done scooping site \"$site_file_name\".");
    $main::output_file{'MAIN'} = '';
  }
}

# ---------------------------------------------------------------------------
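# Run the configured output filter (makedoc, iSilo, RichReader or a custom
# command) on a scooped site's output, then optionally hand the resulting
# PRC file to the Pilot install application.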

sub convert_output {
  my $filekey = shift;

  return unless ($main::use_convert_tool);

  my $syncfile = $key2syncfile{$filekey};
  return unless defined $syncfile;

  $outfile = $key2outfile{$filekey};
  my $outidxfile = $key2outidxfile{$filekey};
  my $secttitle = $key2title{$filekey};
  unlink $syncfile;

  if ($main::outputfilter eq '__cat__') {
    open (IN, "< ".$outfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $outfile;
    return;
  }

  if ($main::outputfilter eq 'makedoc') {
    $cmd = "$makedoc $outfile \"".$syncfile."\" '".$secttitle."'";

  } elsif ($main::outputfilter eq 'isilo') {
    if (&Portability::MyOS eq 'Win32' && $isilo =~ /isilow32/i
    		&& $isiloargs !~ / -u\b/) {
      $isiloargs .= ' -u';	# add it only once; this sub runs once per site
    }

    if ($main::fileperpage) {
      $cmd = "$isilo $isiloargs $isilomultipageargs ".
      				$outfile.$slash.$outidxfile;
    } else {
      $cmd = "$isilo $isiloargs ".$outfile.$slash.$outidxfile;
    }

    # UNIX iSilo utils take the output filename as well; Win32
    # doesn't need it as it installs as it goes along.
    if (&Portability::MyOS ne 'Win32') {
      $cmd .= " \"".$syncfile."\"";
    }

    # Win32 iSilo only takes the -u arg for the GUI version, not the
    # command line one. Strip the arg for the command-line converter.
    # Also add the output filename.
    if (&Portability::MyOS eq 'Win32' && $cmd =~ /isiloc32/i) {
      $cmd =~ s/ -u / /g;
      $cmd .= " \"".$syncfile."\"";
    }

  } elsif ($main::outputfilter eq 'richreader') {
    $cmd = "$richreader $richargs ".$outfile;

  } elsif ($main::outputfilter =~ /^cmd: /) {
    $cmd = $';

    my $idx = $outfile.$slash.$outidxfile;
    $cmd =~ s/__SCOOPFILE__/${idx}/g;
    $cmd =~ s/__SYNCFILE__/${syncfile}/g;
    $cmd =~ s/__TITLE__/${secttitle}/g;

  } else {
    die "bad output filter $main::outputfilter\n";
  }

  my $keep_outfile = ($main::debug || $main::keep_tmps);

  if (&Portability::MyOS ne 'Mac') {
    &add_cmd_dir_to_path ($cmd);
    &verbose ("Running: $cmd");

    $realwd = getcwd; chdir $outfile;	# cd to conversion dir for command
    system $cmd;
    chdir $realwd;			# back again

    # output a newline, MakeDoc won't do it itself.
    if (&Portability::MyOS eq 'UNIX') { &verbose ("\n"); }

    if (($? >> 8) != 0) {

      # work around a bug in iSilo converter on Win32 -- it
      # reports failure even when the conversion went fine.
      if (&Portability::MyOS ne 'Win32' ||
	    $main::outputfilter ne 'isilo')
      {
	warn "command failed: $cmd\n";
	$main::failed_to_cvt = 1;
      }
    }

  } else {
    # system() is broken on MacOS, so print the required command
    # so it can be run easily from the MPW shell
    if (!defined $macos_system_warning_written) {
      warn "[Warning: not using the broken MacPerl system() call. ".
	    "You will need to\ncut and paste the command ".
	    "lines yourself!]\n\n";
      $macos_system_warning_written = 1;
    }
    print $cmd, "\n";
    $keep_outfile = 1;
  }

  if (!$keep_outfile) {
    &rmtree ($outfile);		# don't keep .txt files around
  }

  # If we're dumping, read in the generated file and write it to
  # STDOUT.
  if ($main::dumpprc) {
    open (IN, "< ".$syncfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $syncfile;
  }

  # output the name of the finished file. This is handy for scripts
  # which want to collect these files and store them somewhere.
  # REVISIT -- verify that Win32 iSilo uses the same filename.
  if (!$main::dumpprc) {
    if (-r $syncfile) {
      print "Created: ".$syncfile."\n";
    }
  }

  if (defined $pilotinstapp) {
    if ($pilotinstapp =~ /^\*\*\*ADD_TO_MANIFEST\*\*\* (.*)$/) {
      if (!open (OUT, ">> $1")) {
	warn "cannot write to $1\n";
      } else {
	print OUT $syncfile."\n"; close OUT;
      }

    } else {
      $cmd = "$pilotinstapp $syncfile";
      &add_cmd_dir_to_path ($cmd);
      &verbose ("Running: $cmd");
      system $cmd;

      if (($? >> 8) != 0) {
	warn "command failed: $cmd\n";
      }
    }
  }
}

sub add_cmd_dir_to_path {
  local ($_);
  my $cmd = shift;

  # Perl on some Win32 platforms seems to require that the binary be
  # in the PATH.
  #
  if (&Portability::MyOS eq 'Win32') {
    $_ = $cmd;
    if (!/[\\\/]/) { return; }	# foo arg ...
    if (/^\"([^\"]+)\"/) { $cmd = $1; }	# "C:\Program Files\foo.exe" arg ...
    elsif (/^(\S+)\s/) { $cmd = $1; }	# C:\windows\foo.exe arg ...
    else { $cmd = $_; }			# C:\windows\foo.exe

    $cmd =~ s,[\\/][^\\/]+\s*$,,g;		# trim the filename
    $cmdpat = $cmd; $cmdpat =~ s,(\W),\\$1,g;	# escape funny chars

    if ($ENV{'PATH'} !~ /;${cmdpat}(;|$)/) {
      &main::dbg ("Adding directory to command path: $cmd");
      my $path = $ENV{'PATH'} || $ENV{'Path'} || $ENV{'path'};
      $path .= ";$cmd"; $ENV{'PATH'} = $path;
    }
  }
}

# ---------------------------------------------------------------------------
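# Save the already_seen URL cache (pruning entries that are both over two
# months old and older than the oldest link seen in this run), and move the
# newly cached pages into place.  Skipped if any conversion failed.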

sub write_state {
  if (!$main::failed_to_cvt) {
    # only write alreadyseen if the files converted successfully, otherwise
    # the user may lose some recent news due to a makedoc screwup.
    #
    my $towrite = '';
    my $now = time;
    my $twomonthsago = $now - (24*60*60*30*2);
    my $mod;
    my $urlhost;

    # keep the already-seen list small by cutting out old entries.  We
    # define "old entries" as (a) older than 2 months and (b) older than
    # the oldest link we saw in today's scooping run.
    #
    if (!$main::refresh) {
      &main::dbg ("trying to cut old entries from already-seen URL cache");

      foreach $_ (keys %already_seen) {
	next unless (m,http://(\S+?)/,); $urlhost = $1;
	if (defined $last_modtime{$_} && defined $main::oldest_already_seen{$urlhost})
	{
	  $mod = $last_modtime{$_};
	  if ($twomonthsago > $mod && $main::oldest_already_seen{$urlhost} > $mod) {
	    &main::dbg ("stripping old entry: $_ lastmod=$mod (".&time2datestr($mod).")");
	    next;
	  }
	}
	$towrite .= $_." lastmod=".(defined $last_modtime{$_}
		    ? $last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, "> $newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot rewrite $newalreadyseen\n";
      } else {
	warn "Cannot rewrite $newalreadyseen\n";
      }

    } else {
      # -refresh was used, so the already-seen list was never loaded; just
      # append the URLs seen this run rather than rewriting the whole file.
      &main::dbg ("appending already-seen URLs to $newalreadyseen");

      foreach $_ (@seen_this_time) {
	$towrite .= $_." lastmod=".(defined $last_modtime{$_}
		    ? $last_modtime{$_} : $now)."\n";
      }

      if (open (OUT, ">> $newalreadyseen")) {
	print OUT $towrite;	# do it as one big atomic write, for safety
	close OUT || warn "Cannot append to $newalreadyseen\n";
      } else {
	warn "Cannot append to $newalreadyseen\n";
      }
    }


    my ($from, $to);
    while (($from,$to) = each %caches_to_rename) {
      &main::dbg ("Saving new cache file: $to");
      rename ($from, $to) or warn ("rename $from -> $to failed\n");
    }
  }
}

# ---------------------------------------------------------------------------
# Note on levels: a 2-level site has a contents page and stories off that;
# 3-level has issue links page, per-issue contents page and stories.
# 1-level has only the story page, no links.
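#
# download_front_page fetches a contents or issue page, strips it down to
# the links area, optionally diffs it against the cached copy, then follows
# each link either to a deeper front page or to the story pages themselves.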

sub download_front_page {
  my $url = shift;
  my $baseurl = shift;
  my $level = shift;
  my $upindex = shift;
  my ($cachefile, $page);
  my $key = "$level $baseurl";

  my $human_level = $level + 2;

  $sitewarn_current_site_line = $site_defined_at{$baseurl};

  # Use this hash to avoid endless loops when scooping multi-page front pages.
  return if (defined $already_seen_this_session{$url});
  $already_seen_this_session{$url} = 1;

  if ($got_intr_flag) { return; }
  if ($hit_file_size_limit) { return; }

  my $pat = $links_limit_to{$key};
  if (defined $pat) {
    if (!match_url ($url, $pat)) {
      &dbg ("front page URL $url does not match $pat, ignoring.");
      return;
    }
  }

  $pat = get_layout_param ('links_skip', $key, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &verbose ("Skipping: $url"); return;
    }
  }

  my $origurl = $url;
  $url = &apply_url_postproc($url, $baseurl);
  if (!defined $url) {
    &main::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  my $fullurl = $url; $url = &URLWithoutAnchor ($url);

  &verbose ("Reading level-".($human_level)." front page: $fullurl");
  &set_got_intr_behaviour ('setflag');

  my $is_dynamic_html;
  if (defined $cacheable{$key}) {
    $is_dynamic_html = ($cacheable{$key} == 0);
  } elsif (defined $links_diff{$key} && $links_diff{$key} != 0) {
    $is_dynamic_html = 1;	# pages that need diff'ing are dynamic
  } elsif ($level < $levels{$baseurl}) {
    # second-level or deeper front pages are usually not dynamic, more
    # likely to be a static table of contents.
    $is_dynamic_html = 0;
  } else {
    $is_dynamic_html = 1;	# front pages are usually dynamic
  }

  push (@seen_this_time, $url);
  $already_seen {$url} = 1;
  &check_for_oldest ($url);	# we came across the link, so keep it around

  $page = &get_page ($baseurl, $url, $is_dynamic_html);
  if (!defined $page) {
    &verbose ("Skipping (get_page returned nothing): $fullurl");
    return;
  }
  if ($got_intr_flag) { goto interrupted; }

  my $life = get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $last_modtime{$url} &&
    			$last_modtime{$url} < time() - $life * 24 * 60 * 60)
  {
    &verbose ("Skipping (contents are older than ".$life." days): $fullurl");
    return;
  }

  my $origpage = $page;
  &journal ("pre_strip_level".($human_level), $page);
  $page = &strip_front_page ($url, $key, $baseurl, $page);
  &journal ("post_strip_level".($human_level), $page);

  my $cachedpage;
  if (defined $links_diff{$key} && $links_diff{$key} != 0) {
    $cachedpage = &strip_front_page ($url, $key, $baseurl,
				&get_cached_page_for_diff ($url));
    $page = &get_new_bits ($cachedpage, $page);
  }

  &cache_page_later ($url, $origpage);

  my $proc = get_layout_param ('links_preproc', $key, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = $human_level;
    if (!eval $proc."; 1;") {
      &sitewarn("level-".($human_level)." HTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }

  if (defined fileno JOURNAL) {
    # always write a text-mode version for the journal
    &journal ("to_text_level".($human_level),
    	&html_to_text ($url, $baseurl, $page, $OUT_TEXT));
  }

  my $lprint = get_layout_param ('links_print', $key, $url);
  if ((defined $lprint && $lprint != 0) || &writing_html) {
    $main::warn_about_external_links = 1;
    my $txtpage = &html_to_text ($url, $baseurl, $page, $main::outstyle);
    $main::warn_about_external_links = 0;

    my $outme = 1;
    if ($is_dynamic_html && defined $cachedpage && !$main::refresh) {
      # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
      # HTML. Sometimes the ad banners will be the only things that have
      # changed between retrieves, and html_to_text will have stripped those
      # out.
      my $cachedtxt = &html_to_text ($url, $baseurl, $cachedpage,
      			$main::outstyle);
      if (&text_equals ($txtpage, $cachedtxt)) {
	&verbose ("Not printing contents (text has not changed): $fullurl");
	$outme = 0;
      }
    }

    if ($outme) {
      # if we're only printing the links because we're writing HTML,
      # then use links_trim. (off for the time being!)

      #if (!(defined $lprint && $lprint != 0) && &writing_html) {
	#my $ltrim = get_layout_param ('links_trim', $key, $url);
	#if ($ltrim) {
	  ## trim that number of chars after the last link in the page.
	  #my $eval = '
	    #$txtpage =~ s/(href=.*?<\/a>.{'.$ltrim.'}\S*).*?$/$1
	    #<i>[irrelevant links trimmed]<\/i>/i;';
	  #eval "$eval; 1;" or warn ("trim code eval failed: $@\n$eval\n");
	#}
      #}

      &verbose ("Printing: $fullurl");
      &write_as_story (1, $url, $baseurl, $txtpage, undef, $upindex);

      #if (&writing_html && (!defined $lprint || $lprint == 0))
      if (&writing_html) {
	# don't count the front page as a story if we're just outputting it
	# because we're writing HTML.
	$stories_found--;
      }
    }
  }

  # this is a front page. Pages followed from this page should use this as
  # the "up a level" link.
  $upindex = $main::current_story_index - 1;

  # see if there's any links to extra contents pages
  my @turnoverlinks = &get_contents_turnover_links ($url, $key, $page);

  my @links = ();
  my $wrote_sep = 0;

  # This was all getting a bit tricky, so I've redone it a bit.
  # It now does not try to strip closing tags, as it doesn't have to.
  while (1) {
    if ($got_intr_flag) { goto interrupted; }
    if ($hit_file_size_limit) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)//is
      )
    {
      my $link = $1;
      push (@links, $link);
      next;
    }

    # support for frames
    if (
      $page =~ s/<frame\s+[^>]*src=\"([^\">]+)\"//is
       ||
      $page =~ s/<frame\s+[^>]*src=\'([^\'>]+)\'//is
       ||
      $page =~ s/<frame\s+[^>]*src=([^\s+>]+)//is
      )
    {
      my $link = $1;
      if (&writing_html) {
	if ($wrote_sep == 0) {
	  $main::output_file{'MAIN'} .= "<p><hr>\n"; $wrote_sep = 1;
	}
	$main::output_file{'MAIN'} .=
		&translate_link ($fullurl, $baseurl, $link, $link). "<br>\n";
      }
      push (@links, $link);
      next;
    }

    # rudimentary support for My-Netscape-style RDF files
    if ($page =~ s/<item>(.*?)<link\s*[^>]*>(.+?)<\/link>(.*?)<\/item>//is)
    {
      my ($title, $link, $title2) = ($1, $2, $3);

      # <link> tags in RSS can contain other crap. Ditch it; we want the link!
      $link =~ s/^.*<url>(.*?)<\/url>.*$/$1/gis;

      $link = &AbsoluteURL ($url, $link);
      if ($title =~ /<title>(.*?)<\/title>/is
	   || $title =~ /<text>(.*?)<\/text>/is
	   || $title2 =~ /<title>(.*?)<\/title>/is
	   || $title2 =~ /<text>(.*?)<\/text>/is)
      {
	$url_title{$link} = $1;
      }

      push (@links, $link);
      next;
    }

    last;		# no more links available
  }

  if ($#links >= 0) {
    &verbose ("Found ".($#links+1)." links, examining them.");
  }

  # now traverse the links and get the stories
  &journal ("links_level".($human_level), join ("\n", @links));
  foreach $_ (@links) {
    if ($hit_file_size_limit) {
      my $msg = "File size limit of $main::filesizelimit K exceeded,".
			  " skipped some stories from this site.";
      &verbose ($msg);
      if (&writing_html) {
	$main::output_file{'MAIN'} .= "<hr><i>$msg</i><br>\n";
      } else {
	$main::output_file{'MAIN'} .= "\n($msg)\n";
      }
      last;
    }

    &follow_front_link ($baseurl, $url, $level, $_, $upindex);
    if ($got_intr_flag) { goto interrupted; }
  }

  # if there's more contents pages, process them as well.
  &journal ("turnover_links_level".($human_level), join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($got_intr_flag) { goto interrupted; }
      $link = &AbsoluteURL ($url, $link);
      &download_front_page ($link, $baseurl, $level, $upindex);
    }
  }

interrupted:
  &set_got_intr_behaviour ('exit');
}

# ---------------------------------------------------------------------------

sub follow_front_link {
  my ($baseurl, $url, $level, $nextpage, $upindex) = @_;

  $nextpage = &AbsoluteURL ($url, $nextpage);
  return if ($nextpage !~ /^(http|file):/i);	# only supported links

  &main::dbg ("Link found on $baseurl: $nextpage");

  # should we download the next front page?
  if ($level > 0) {
    &download_front_page ($nextpage, $baseurl, $level-1, $upindex);
    return;
  }
  if ($got_intr_flag) { return; }

  # nope, we're onto the stories already
  $nextpage = &make_printable ($baseurl, $nextpage, 1);

  &download_story_page ($nextpage, $baseurl, 0, $upindex);
}

sub make_printable {
  my $baseurl = shift;
  my $nextpage = shift;
  my $warn_if_fail = shift;

  my $sub = get_layout_param ('printable_sub', $baseurl, $nextpage);
  if (defined $sub) {
    my $new = $nextpage;
    $sub =~ s/\\(\d+)/\$$1/g;	# avoid warnings

    eval '$new =~ '.$sub.'; 1;'
      or &sitewarn ("Printable substitution failed! ($!)\n");

    if ($nextpage ne $new) {
      # &verbose ("Using printable version instead: $new");
      my $limitto = $story_limit_to{$baseurl};
      if (defined $limitto && !match_url ($new, $limitto)) {
	if ($warn_if_fail) {
	  &sitewarn ("Printable version does not match StoryURL".
		"pattern, reverting from $new to $nextpage\n");
	}
      } else {
	$nextpage = $new;
      }
    }
  }
  $nextpage;
}

# ---------------------------------------------------------------------------
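# Decide whether a story URL should be fetched at all (StoryURL match,
# skip patterns, URLProcess, already-seen cache) before handing it on to
# get_story_page.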

sub download_story_page {
  my $url = shift;
  my $baseurl = shift;
  my $is_dynamic_html = shift;
  my $upindex = shift;
  my ($cachefile, $page);

  my $fullurl = $url; $url = &URLWithoutAnchor ($url);
  study $url;

  $sitewarn_current_site_line = $site_defined_at{$baseurl};

  my $cacheflag = $cacheable{"s $baseurl"};
  if (defined $cacheflag) {
    # user setting overrides our heuristics
    $is_dynamic_html = ($cacheflag==0);
  }
  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $is_dynamic_html = 1;	# diff pages are always dynamic
  }

  my $limitto = $story_limit_to{$baseurl};
  if (defined $limitto) {
    if (!defined $output_storyurl_dbg{$baseurl}) {
      &main::dbg ("StoryURL for $baseurl: $limitto");
      $output_storyurl_dbg{$baseurl} = 1;
    }

    if (!match_url ($url, $limitto)) {
      &main::dbg ("Non-story URL ignored: $fullurl");
      return;
    }
  }
  if ($url =~ m,^(ftp|mailto|https|gopher|pnm):,) {
    &main::dbg ("Non-story URL ignored: $fullurl");
    return;
  }

  my $origurl = $url;
  $url = &apply_url_postproc($url, $baseurl);
  if (!defined $url) {
    &main::dbg ("URLProcess says URL should be ignored: $origurl"); return;
  }

  $pat = get_layout_param ('story_skip', $baseurl, $url);
  if (defined $pat) {
    if ($url =~ m#^${pat}$#) {
      &verbose ("Skipping: $fullurl"); return;
    }
  }

  &check_for_oldest ($url);

  if (!$is_dynamic_html && $already_seen {$url}) {
    &main::dbg ("skipping, already seen: $fullurl");
    return;
  }

  push (@seen_this_time, $url);
  $already_seen {$url} = 1;

  if ($hit_file_size_limit) { return; }

  &get_story_page ($url, $baseurl, $is_dynamic_html, $upindex);
}

# ---------------------------------------------------------------------------
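# Fetch a single story page, strip and preprocess it, diff against the
# cached copy if requested, convert it to the output format and write it
# out, then follow any "next page" turnover links.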

sub get_story_page {
  my $url = shift;
  my $baseurl = shift;
  my $is_dynamic_html = shift;
  my $upindex = shift;
  my @turnoverlinks;
  my $headline;

  &verbose ("Reading: $url");
  &check_for_oldest ($url);	# we came across the link, so keep it around

  my $cachedpage = undef;
  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $cachedpage = &get_cached_page_for_diff ($url);
  } elsif ($is_dynamic_html) {
    $cachedpage = &get_cached_page ($url);
  }
  if (defined $cachedpage) {
    $cachedpage = &strip_story ($url, $baseurl, $cachedpage, " (cached)");
  }

  my $origpage = &get_page ($baseurl, $url, $is_dynamic_html);
  return unless defined $origpage;
  if ($got_intr_flag) { return; }

  # get headline before stripping StoryStart and StoryEnd
  $headline = &get_headline ($url, $baseurl, $origpage);
  &journal ("pre_strip_story", $origpage);
  my $page = &strip_story ($url, $baseurl, $origpage, "");
  &journal ("post_strip_story", $page);

  # TODO -- use get_layout_param here
  if (defined $story_html_header{$baseurl}) {
    $page = $story_html_header{$baseurl} . $page;
  }
  if (defined $story_html_footer{$baseurl}) {
    $page .= $story_html_footer{$baseurl};
  }

  my $proc = get_layout_param ('story_preproc', $baseurl, $url);
  if (defined $proc) {
    $_ = $page;
    my $site_level = 1;
    if (!eval $proc."; 1;") {
      &sitewarn("StoryHTMLPreProc failed: $@");
      # and keep the original $page
    } else {
      $page = $_;
      &journal ("post_story_preproc", $page);
    }
  }

  if (defined $story_diff{$baseurl} && $story_diff{$baseurl}) {
    $page = &get_new_bits ($cachedpage, $page);
    &cache_page_later ($url, $origpage);
  } else {
    &cache_page ($url, $origpage);
  }

  if ($got_intr_flag) { return; }

  if (defined fileno JOURNAL) {
    # always write a text-mode version for the journal
    &journal ("to_text_story",
    	&html_to_text ($url, $baseurl, $page, $OUT_TEXT));
  }

  # get turn-over links after stripping StoryStart and StoryEnd
  @turnoverlinks = &get_story_turnover_links ($url, $baseurl, $page);
  $main::warn_about_external_links = 1;
  $page = &html_to_text ($url, $baseurl, $page, $main::outstyle);
  $main::warn_about_external_links = 0;

  if ($is_dynamic_html && defined $cachedpage && !$main::refresh) {
    # ensure that the cleaned-up HTML doesn't match the cleaned-up cached
    # HTML. Sometimes the ad banners will be the only things that have
    # changed between retrieves, and html_to_text will have stripped those
    # out.
    $cachedpage = &html_to_text ($url, $baseurl,
    			$cachedpage, $main::outstyle);
    if (&text_equals ($page, $cachedpage)) {
      &verbose ("Skipping (text has not changed): $url");
      return;
    }
  }

  my $life = get_layout_param ('story_lifetime', $baseurl, $url);
  if (defined $last_modtime{$url} &&
    		$last_modtime{$url} < time() - $life * 24 * 60 * 60)
  {
    &verbose ("Skipping (story is older than ".$life." days): $url");
    return;
  }

  # ensure there's some alphanumerics in the output text. No alnums means
  # no output. HTML needs to be checked to ensure we don't just pick
  # up tags, which will not be displayed.
  if ((&writing_html && $page !~ /[A-Za-z0-9]\s*</ &&
    		$page !~ />\s*[A-Za-z0-9]/ && $page !~ /^\s*[A-Za-z0-9]/)
    || (!&writing_html && $page !~ /[A-Za-z0-9]/))
  {
    &verbose ("Skipping (no text to write): $url");
    return;
  }

  if ($levels{$baseurl} < 0) {
    # this is a one-level site: therefore the story should be treated
    # as the "front page". Thx Carsten for this one.
    &write_as_story (1, $url, $baseurl, $page, $headline, $upindex);
  } else {
    &write_as_story (0, $url, $baseurl, $page, $headline, $upindex);
  }

  &journal ("turnover_links_story", join ("\n", @turnoverlinks));
  if ($#turnoverlinks >= 0) {
    my $link;
    for $link (@turnoverlinks) {
      if ($got_intr_flag) { return; }
      $link = &AbsoluteURL ($url, $link);
      &download_story_page ($link, $baseurl, 0, $upindex);	# right now
    }
  }
}

# ---------------------------------------------------------------------------
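# Apply the site's URLProcess code to a URL; returns the (possibly
# rewritten) URL, or undef if the URL should be ignored.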

sub apply_url_postproc {
  local ($_) = shift;
  my $baseurl = shift;

  my $proc = get_layout_param ('url_preproc', $baseurl, $_);
  if (defined $proc) {
    if (!eval $proc."; 1;") {
      &sitewarn("URLProcess failed: $@");
      undef $_;
    }
  }
  $_;
}

# ---------------------------------------------------------------------------
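# Protect newlines inside a <pre> block (and the <pre> tags themselves)
# with placeholder tags, so the diff-oriented reformatting in get_new_bits
# does not disturb preformatted text.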

sub clean_pre_tags_for_diff {
  my $file = shift;
  my $pre_nl_tag = shift;
  my $pre_pre_tag = shift;
  my $pre_slashpre_tag = shift;

  my $start = '';
  my $end = '';

  ($file =~ s/^(.*)<pre>//i) and $start = $1;
  ($file =~ s/<\/pre>(.*)$//i) and $end = $1;
  $file =~ s/\n/${pre_nl_tag}/gs;

  $start.$pre_pre_tag.$file.$pre_slashpre_tag.$end;
}

sub get_new_bits {
  local ($_);
  my ($oldfile, $newfile) = @_;

  if ($main::refresh) {
    &verbose ("-refresh is on, not looking for differences");
    return $newfile;
  }

  if (!defined $oldfile || $oldfile =~ /^\s*$/) {
    if (!$main::debugdiffs) { return $newfile; }
    $oldfile = '';
  }

  &verbose ("Finding differences between current page and cached version");

  # it's important to keep these names 8.3 for Windows-95 compatibility,
  # as some Windoze diffs may not be able to handle them otherwise!
  # This also requires that we are chdir'd into the temporary directory
  # to avoid hassles with long filenames in the args when we run the
  # diff command. What a pain!
  #
  my $oldf = "a$$.tmp";		# we are already chdir'ed
  my $newf = "b$$.tmp";

  if ($main::debugdiffs) {
    $oldf = "diff_old.tmp";
    $newf = "diff_new.tmp";
  }

  # Split the file lines at probable story-header endpoints.
  # This makes them more amenable to diffing, hopefully without
  # losing bits we don't want to lose, or gaining bits we don't
  # want to gain. Also try to keep cross-line-split HTML tags
  # together.

  # preserve newlines in <pre> text
  my $cleaned_pre_nls = 0;
  my $pre_nl_tag = "<!!!n>";
  my $pre_pre_tag = "<!!!pre>";
  my $pre_slashpre_tag = "<!!!/pre>";

  while ($oldfile =~ /<pre>/i) {
    $oldfile = &clean_pre_tags_for_diff ($oldfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  while ($newfile =~ /<pre>/i) {
    $newfile = &clean_pre_tags_for_diff ($newfile,
    			$pre_nl_tag, $pre_pre_tag, $pre_slashpre_tag);
    $cleaned_pre_nls = 1;
  }

  # canonicalise all other newlines (we control the vertical!)
  $oldfile =~ s/\s*[\r\n]+\s*/ /gs;
  $newfile =~ s/\s*[\r\n]+\s*/ /gs;

  # remove extraneous whitespace from inside tags
  $oldfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;
  $newfile =~ s/<\s*([^>]+?)\s*>/ $_=$1; s,\s+, ,gs; "<$_>"; /gies;

  # handle the two types of <p> tags -- <p>...</p>, and just ...<p>
  $oldfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;
  $newfile =~ s/<p( *[^>]*>.*?<\/p *[^>]*>)/\n<!!!p$1\n/gi;

  $oldfile =~ s/(<p *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<p *[^>]*>)/$1\n/gi;

  $oldfile =~ s/<!!!p/<p/gi;
  $newfile =~ s/<!!!p/<p/gi;

  # put newline before these tags (thx Carsten Clasohm, again!)
  $oldfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  $newfile =~ s/(<(?:table|tr|td|div|item) *[^>]*>)/\n$1/gi;
  # after these ones
  $oldfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;
  $newfile =~ s/(<(?:br|hr|table|\/td|\/table|\/tr|\/div) *[^>]*>)/$1\n/gi;

  # remove newlines inside <a href> tags. Thx to Carsten Clasohm.
  1 while $oldfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;
  1 while $newfile =~ s/(<a href=[^>]+>([^\n<]|<(?!\/a>))*)\n+/$1 /gis;

  if ($cleaned_pre_nls) {
    $oldfile =~ s/${pre_nl_tag}/\n/g; $oldfile =~ s/${pre_pre_tag}/<pre>/g;
    $oldfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
    $newfile =~ s/${pre_nl_tag}/\n/g; $newfile =~ s/${pre_pre_tag}/<pre>/g;
    $newfile =~ s/${pre_slashpre_tag}/<\/pre>/g;
  }

  my $page = '';
  my $created_newf = 0;

  if ($diff eq 'MODULE') {
    # use the perl module implementation of diff instead!
    eval '
      use Algorithm::Diff qw(diff);

      my @chunk;
      my ($sign, $lineno, $text);
      my @f1 = split "\n", $oldfile;
      my @f2 = split "\n", $newfile;

      my $diffs = diff(\@f1, \@f2);

      if (@$diffs) {
	foreach $chunk (@$diffs) {
	  foreach $line (@$chunk) {
	    ($sign, $lineno, $text) = @$line;
	    if ($sign =~ /\+/) {
	      $page .= $text . "\n";
	    }
	  }
	}
      }
    1;' or die ("diff code eval failed: $@");

  } else {
    open (F1, "> $oldf") || warn "cannot write to $oldf\n";
    print F1 $oldfile; close F1;
    open (F2, "> $newf") || warn "cannot write to $newf\n";
    print F2 $newfile; close F2;
    $created_newf = 1;

    if ($diff ne '' && open (DIFF, "$diff $oldf $newf |")) {
      while (<DIFF>) {
	/^>/ || next;
	$page .= $';
      }
      close DIFF;		# ignore exit status -- exit 1 just means differences were found.

    } else {
      warn "cannot run Diff command \"$diff\", using entire page instead.\n";
      $page = $newfile;
    }
  }

  if ($main::debugdiffs) {
    open (F1, "> diff_out.tmp"); print F1 $page; close F1;
    warn "$diff $oldf $newf, breaking for debug"; &cleanexit;
  }

  if ($created_newf) {
    unlink $oldf; unlink $newf;
  }

  $page;
}

# ---------------------------------------------------------------------------
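# Compare two chunks of text, ignoring differences in whitespace.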

sub text_equals {
  my $t1 = shift;
  my $t2 = shift;
  $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//;
  $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//;
  ($t1 eq $t2);
}

# ---------------------------------------------------------------------------
# Strip a story page from StoryStart to StoryEnd.
# In addition, strip out non-story sidebar table items
# and carriage returns (they confuse plenty of regexps later).
#
sub strip_story {
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;
  my $comment = shift;

  if (!defined $page) { return undef; }

  # ok, now strip the headers and footers
  my $pat = get_layout_param ('story_start', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("StoryStart pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#^.*?${pat}##gs) {
      $page =~ s#^[^<]*?>##gs;		# strip superfluous ends of tags
      if (defined fileno JOURNAL) { &journal ("pre_stripped", $&); }
    } else {
      &sitewarn("StoryStart pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  $pat = get_layout_param ('story_end', $baseurl, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("StoryEnd pattern \"$pat\" found multiple times in page $url$comment\n");
    }
    if ($page =~ s#${pat}.*?$##gs) {
      $page =~ s#<[^>]*?$##gs;		# strip superfluous starts of tags
      if (defined fileno JOURNAL) { &journal ("post_stripped", $&); }
    } else {
      &sitewarn("StoryEnd pattern \"$pat\" not found in page $url$comment\n");
    }
  }

  # &smart_clean_table only operates on table items with size specifications.
  # TODO -- work out table sizes using images if possible.
  #
  $smart_clean_table_enabled = get_layout_param ('use_table_smarts', $baseurl, $url);

  if (0) {
  # We now use HTML::Parser.
  while ($page =~ /<(tr|td|table)/i) {
    my $didone = 0;

    $page =~ s/<\s*(td)\s*([^>]*)>(.*?)<\s*\/td\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;
    $page =~ s/<\s*(tr)\s*([^>]*)>(.*?)<\s*\/tr\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;
    $page =~ s/<\s*(table)\s*([^>]*)>(.*?)<\s*\/table\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;

    last if ($didone == 0);	# ran out of matching tag sets
  }
  $page =~ s/<!!!/</g;
  }

  if (1) {
    my $filter = new StripTablesFilter;
    $filter->parse ($page);
    $page = $filter->filtered_html();
  }

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

sub strip_front_page {
  my $url = shift;
  my $key = shift;
  my $baseurl = shift;
  my $page = shift;

  if (!defined $page) { return undef; }

  my $pat = get_layout_param ('links_start', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("ContentsStart pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#^.*?${pat}##gs) ||
	&sitewarn("ContentsStart pattern \"$pat\" not found in page $url\n");
    $page =~ s#^[^<]*?>##gs;		# strip cut-in-half tags
  }

  $pat = get_layout_param ('links_end', $key, $url);
  if (defined $pat) {
    if ($page =~ /${pat}.*${pat}/) {
      &sitewarn("ContentsEnd pattern \"$pat\" found multiple times in page $url\n");
    }
    ($page =~ s#${pat}.*?$##gs) ||
	&sitewarn("ContentsEnd pattern \"$pat\" not found in page $url\n");
    $page =~ s#<[^>]*?$##gs;		# strip cut-in-half tags
  }

  $smart_clean_table_enabled = get_layout_param ('use_table_smarts', $key, $url);

  while ($page =~ /<(tr|td|table)/i) {
    my $didone = 0;
    $page =~ s/<\s*(td)\s*([^>]*)>(.*?)<\s*\/td\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;
    $page =~ s/<\s*(tr)\s*([^>]*)>(.*?)<\s*\/tr\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;
    $page =~ s/<\s*(table)\s*([^>]*)>(.*?)<\s*\/table\s*>/
	&smart_clean_table ($baseurl, $1, $2, $3, $baseurl, $url);
    /gies and $didone++;

    last if ($didone == 0);	# ran out of matching tag sets
  }
  $page =~ s/<!!!/</g;

  $page =~ s/\r/ /g;	# strip CRs
  $page;
}

# ---------------------------------------------------------------------------
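# Work out a headline for a story: from the RDF/RSS title if we have one,
# otherwise from the site's StoryHeadline pattern, falling back to a
# PointCast PCTITLE meta tag.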

sub get_headline {
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;

  my $headline;

  if (defined $url_title{$url}) {
    $headline = &html_to_text ($url, $baseurl,
    		$url_title{$url}, $OUT_TEXT);
    &main::dbg ("StoryHeadline: (from RDF): $headline");

  } else {
    my $pat = get_layout_param ('head_pat', $baseurl, $url);
    if (defined $pat) {
      if ($page !~ m#${pat}#m) {
	&sitewarn("StoryHeadline pattern \"$pat\" not found in page $url\n");
      } elsif (defined $1) {
	$headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT);
	# &main::dbg ("StoryHeadline: $headline"); # logged later on anyway
      } else {
	&sitewarn("StoryHeadline pattern \"$pat\" contains no brackets!\n");
      }

    } elsif ($page =~ m#<meta name="PCTITLE" content="(.*)">#mi) {
      # try a fallback: search for PointCast headline tags
      $headline = &html_to_text ($url, $baseurl, $1, $OUT_TEXT);
      &main::dbg ("StoryHeadline (default, PointCast): $headline");
    }
  }

  $headline;
}

# ---------------------------------------------------------------------------
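# Find "turn-over" links on a story page: either every link (if
# StoryFollowLinks is set), or links whose text looks like a "next page"
# style link on the same site.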

sub get_story_turnover_links {
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;

  my @turnoverlinks = ();
  my $followlinks = get_layout_param ('story_follow_links', $baseurl, $url);

  while (1) {
    if ($hit_file_size_limit) { last; }

    if (
      $page =~ s/<a\s+[^>]*href=\s*\"([^\">]+)\"[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*\'([^\'>]+)\'[^>]*>(.+?)<\/a>//is
      ||
      $page =~ s/<a\s+[^>]*href=\s*([^\s>]+)[^>]*>(.+?)<\/a>//is
      )
    {
      my $link = $1;
      my $txt = $2;

      $link =~ s/^(?:\"|\'|%22)*//; $link =~ s/(?:\"|\'|%22)*$//;
      if ($followlinks) {
	push (@turnoverlinks, $link);

      } elsif ($txt =~ m,(more|next|\d+ of \d+|&gt;&gt;),i) {
	my $urlguts = '.';
	($baseurl =~ /^http:\/\/\S+\.([^\.\/]+\.[^\.\/]+\/).*$/) and
	    ($urlguts = $1);

	if (($txt !~ /[a-z0-9] [a-z0-9]+ [a-z0-9]+ [a-z0-9]/i) # 5 or more words
	    && (length ($txt) < 15)
	    && $link =~ m/$urlguts/)
	{
	  push (@turnoverlinks, $link);
	  $txt =~ s/[\n\r]+/ /g;
	  &verbose ("(Following 'next page' link: \"$txt\")");
	}
      }
      next;
    }

    last;		# no more links available
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------
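# Find turn-over links on a contents page; only done when link-following
# is enabled for this level.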

sub get_contents_turnover_links {
  my $url = shift;
  my $key = shift;
  my $page = shift;

  my $followlinks = get_layout_param ('links_follow_links', $key, $url);
  if (!$followlinks) {
    return ();
  }

  my @turnoverlinks = ();

  while ($page =~ s,<a\s+[^>]*href=\s*(?:\"|\'|%22)?([^>]+)(?:\"|\'|%22)?>(.+?)</a>,,is)
  {
    my $link = $1;
    my $txt = $2;

    # the optional quotes in the pattern above are not always stripped by
    # the greedy match, so trim them here
    $link =~ s/^(?:\"|\'|%22)+//; $link =~ s/(?:\"|\'|%22)+$//;
    push (@turnoverlinks, $link);
    # we don't do the automatic "more/next/page x of y" stuff
    # that we do with the story pages
  }

  @turnoverlinks;
}

# ---------------------------------------------------------------------------
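# When writing HTML, rewrite <a href> links: links to pages we actually
# scooped are kept, anything else is reduced to underlined text.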

sub remove_an_ext_link {
  my ($link, $text, $ahref, $posthref) = @_;

  if (!&writing_html) {
    return $text;
  }

  if (defined ($main::output_links_snarfed {$link})
   || ($link =~ /__HASH__/ && defined ($main::output_links_snarfed {$`}))
              || $main::nolinkrewrite)
  {
    $ahref.$link.$posthref.$text."</a>";
  } else {
    &main::dbg ("Removing non-snarfed link: $link (\"$text\")");
    "<u>".$text."</u>";		# without <a href=...> </a>
  }
}

sub remove_external_links {
  local ($_) = $_[0];

  #&dbg (join(' ', sort keys %main::output_links_snarfed));

  s/(<a\s+[^>]*href=\s*[\"\'])([^\"\']+)([\"\'][^>]*?>)(.*?)<\/a>/
	  &remove_an_ext_link ($2, $4, $1, $3);
      /gies;
  $_;
}

# We could do this smarter, but it looks really gross when converted to
# DOC format -- and this tool is primarily for that conversion. Sorry!
# This also works well for iSilo, because iSilo's rendering of <pre> text
# is pretty rotten.
#
sub clean_preformatted_text {
  my $txt = shift;
  $txt =~ s/[ \t]+\n/\n/g;
  $txt =~ s/<(|\/)(pre|code)>//g;	# strip extra <pre> tags!

  # convert blank lines to a paragraph separator.
  $txt =~ s/\n{1,}\n/<p>\n\n/g;

  # The idea with this one is to add a <br> at the end of lines shorter
  # than 50 columns, and conversely to allow lines longer than 50 cols to
  # run into the next line as if they were part of a paragraph.  I'm not
  # sure about it, but a lot of <pre> sites are just copies of emails, so
  # it can make them look a lot better, since the Palm's screen is a
  # lot narrower than 80 columns (which is what most <pre> pages aim for).
  # REVISIT - Make this a .site file parameter?
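  # e.g. a short line like "Hi folks," gets a trailing <br>, while an
  # 80-column line from a quoted email is left to flow into the next one.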
  $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g;
  $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g;

  $txt =~ s/[ \t]+/ /g;
  $txt;
}

# Work out if we should strip table items based on their size -- well,
# their width at least.
#
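# For instance, a <td width="120"> (under 250 pixels) or <td width="30%">
# (under 40%) is treated as a probable sidebar/nav cell and dropped; anything
# wider is kept, with its tag temporarily rewritten to <!!!td ...> or
# <!!!table ...> so the calling loop does not match it again (undone later).
#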
sub smart_clean_table {
  local ($_);
  my $baseurl = shift;
  my $tagname = shift;
  my $tags = shift;
  my $contents = shift;
  my $key = shift;
  my $url = shift;

  if ($smart_clean_table_enabled) {
    $_ = " $tags "; s/\s+/ /g; s/ = /=/g; s/"//g;
    my $omit = 0;

    if (/ width=(\d+) /i) {
      if ($1+0 < 250) { $omit = 1; }
    } elsif (/ width=(\d+)% /i) {
      if ($1+0 < 40) { $omit = 1; }
    }
    if ($omit) { &main::dbg ("table item <$tagname$_> omitted"); return ' '; }
  }
  "<!!!$tagname ".$tags.">".$contents."<!!!/$tagname>";
}

sub translate_link {
  my ($url, $baseurl, $link, $text, $ahref, $posthref) = @_;

  if (!&writing_html) {
    return $text;
  }
  if (!defined $ahref) { $ahref = "<a href="; }
  if (!defined $posthref) { $posthref = ">"; }

  my $nolink;
  if ($text =~ /\S/) {
    $nolink = "<u>".$text."</u>";
  } else {
    $nolink = " ";		# sometimes there's no text!
  }

  $link = &AbsoluteURL ($url, $link);
  return $nolink if ($link !~ /^(http|file):/i);	# only supported links

  if ($main::nolinkrewrite) {
    return $ahref."\"".$link."\"".$posthref.$text."</a>";
  }

  # translate to printable version first, in case the StoryURL pattern
  # only covers the printable style.
  $link = &make_printable ($baseurl, $link, 0);

  # Is the link one that we will be downloading? If not, just de-linkify
  # it. 1-level sites never have active links so we can just assume
  # the links should not be links.
  my $limitto = $story_limit_to{$baseurl};

  if ((!match_url ($link, $limitto)
    	#|| $levels{$baseurl} < 0	# not sure why this is here. TODO
	)
    	&& &URLWithoutAnchor ($link) ne &URLWithoutAnchor ($baseurl))
  {
    # check the contents/issue levels as well.
    my $ok = 0;

    for ($lev = $levels{$baseurl}; $lev >= 0; $lev--) {
      my $key = "$lev $baseurl";
      $limitto = $links_limit_to{$key};
      if (defined $limitto) {
	if (match_url ($link, $limitto)) { $ok = 1; last; }
      }
    }

    if ($ok == 0) {
      if ($main::warn_about_external_links) {
	&main::dbg ("External link not translated: $link");
      }
      return $nolink;

      # REVISIT -- provide links at end of stories
    }
  }
  #&main::dbg ("Translating link: $link");
 
  # Note that we always put in quotes around the URL.
  # &remove_external_links, which is run later, requires this (and anyway
  # it makes for better HTML).
  #
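  # e.g. in file-per-page mode this emits roughly <a href!!!="0f3caaa.html">
  # (an illustrative hashed filename), in single-file mode
  # <a href!!!="#some_mangled_url_anchor">; the !!!= marker is turned back
  # into = by the caller, html_to_text.
  #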
  if ($main::fileperpage) {
    my ($fname, $relative) = &href_to_multipage_anchor ($link);
    $ahref."\"".$relative."\"".$posthref.$text."</a>";
  } else {
    my $anchor = &href_to_singlepage_anchor ($link);
    $ahref."\"#".$anchor."\"".$posthref.$text."</a>";
  }
}

# try to preserve images used as capital letters starting a story. NYTimes
# does this.
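# Three cases: (1) if %use_alt_tags is set for this site and matches the
# image URL, substitute the image's alt text; (2) a single-letter alt
# attribute (a decorative drop-cap) becomes that letter; (3) if
# $main::allowimgs is set and the src matches %imageurl, the image is
# downloaded and the <img> tag rewritten to point at the local copy.
# Anything else is replaced with a space.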
sub clean_inline_images {
  my $url = shift;
  my $baseurl = shift;
  my $tag = shift;

  if (defined ($use_alt_tags{$baseurl}) && 
	$tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is)
  {
    my $src = $1;

    if ($tag =~ /(?:^|\s)alt=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $alt = $1;

      $src = &AbsoluteURL($url,$src);
      if (match_url ($src, $use_alt_tags{$baseurl})) {
	&main::dbg ("using alt tag \"$alt\" for img: $src");
	return $alt;
      } else {
	# &main::dbg ("not using alt tag \"$alt\" for img: $src");
      }
    }
  }

  if ($tag =~ /(?:^|\s)alt=\"?([A-Za-z0-9])\"?(?:$|\s)/is) {
    &main::dbg ("converting one-letter img to letter: $1");
    return $1;
  }

  if ($main::allowimgs && defined ($imageurl{$baseurl})) {
    if ($tag =~ /(?:^|\s)src=\"?([^\"> ]+)\"?(?:$|\s)/is) {
      my $src = $1;

      $src = &AbsoluteURL($url,$src);
      if (match_url ($src, $imageurl{$baseurl})) {
	&main::dbg ("keeping img: $src");
	$tag =~ s/(?:^|\s)src=\"?[^\"> ]+\"?(?:$|\s)/ /gis;
	$relative = &download_img ($baseurl, $src, $tag);

	return "<img src=\"".$relative."\" $tag>";
      }
    }
  }

  " ";
}

sub download_img {
  my $baseurl = shift;
  my $url = shift;
  my $tag = shift;


  my $type = '.gif';
  if ($url =~ /\.jp[eg]+/i) { $type = '.jpg'; }
  my ($fname, $relative) = &href_to_multipage_anchor ($url, $type);

  if (!$main::output_links_snarfed{$url}) {
    &verbose ("Image: $url");

    my $img = &get_img ($baseurl, $url, 0);
    if (!defined $img) {
      &verbose ("Skipping (get_img returned nothing): $url");
      return;
    }
    if ($got_intr_flag) { return ""; }

    &cache_page ($url, $img);
    open (OUT, "> $fname") or
	  (warn ("failed to write to $fname!\n"), return "");
    binmode OUT;
    print OUT $img;
    close OUT or warn ("failed to write to $fname!\n");

    &add_snarfed_link ($url);
    &up_file_size ($url, (-s $fname), "image");
  }

  $relative;
}

sub html_to_text {
  my $url = shift;
  my $baseurl = shift;
  my $page = shift;
  my $format = shift;

  if ((defined $site_format{$baseurl}) && ($site_format{$baseurl} eq 'rss')) {
    # Convert the RSS formatting into a nice display, for the index page.
    $page =~ s,<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>,<h2>$2<\/h2> $1 $3,gis;

    my $link;
    $page =~ s/<link>(.*?)<\/link>/
      $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g;
      $link = &AbsoluteURL($url,$link);
      '(<a href='.$link.'>'.$link.'<\/a>)';
    /gies;

    $page =~ s,<title>(.*?)<\/title>,<b>$1<\/b> ,gis;
    $page =~ s,<item>,<p>,gis; $page =~ s,<\/item>,<\/p>,gis;

    # the description is converted for RSS 0.91 sites -- the "fat" format
    $page =~ s,<description>(.*?)<\/description>,$1 ,gis;
  }


  # strip tags we know we don't want
  $page =~ s/<head(?:\s+[^>]+|)>.*?<\/head>/ /gis;
  $page =~ s/<(?:html|body)(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/(?:html|body)>/ /gis;
  $page =~ s/<iframe(?:\s+[^>]+|)>.*?<\/iframe>/ /gis;
  $page =~ s/<ilayer(?:\s+[^>]+|)>.*?<\/ilayer>/ /gis;
  $page =~ s/<layer(?:\s+[^>]+|)>.*?<\/layer>/ /gis;
  $page =~ s/<\/?frame(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/?frameset(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<script(?:\s+[^>]+|)>.*?<\/script>/ /gis;
  $page =~ s/<style(?:\s+[^>]+|)>.*?<\/style>/ /gis;	# CSS not supported yet
  $page =~ s/<!--.*?-->/ /gis;			# MSIE-style comments
  $page =~ s/<!--[^>]+>/ /gis;			# Netscape-style comments
  $page =~ s/<form(?:\s+[^>]+|)>.*?<\/form>/ /gis;
  $page =~ s/<image(?:\s+[^>]+|)>.*?<\/image>/ /gis;	# RDF tag
  $page =~ s/<channel(?:\s+[^>]+|)>.*?<\/channel>/ /gis;	# RDF tag
  $page =~ s/<map(?:\s+[^>]+|)>.*?<\/map>/ /gis;
  $page =~ s/<applet(?:\s+[^>]+|)>.*?<\/applet>/ /gis;
  $page =~ s/<item(?:\s+[^>]+|)>.*?<\/item>/ /gis;	# some RDF items
  $page =~ s/<link(?:\s+[^>]+|)>.*?<\/link>/ /gis;	# some RDF items
  $page =~ s/<title(?:\s+[^>]+|)>.*?<\/title>/ /gis;	# some RDF items
  #$page =~ s/<(?:table|td|tr)(?:\s+[^>]+|)>/ /gis;	# TO INVESTIGATE
  #$page =~ s/<\/(?:table|td|tr)>/ /gis;	# TO INVESTIGATE
  $page =~ s/<meta\s+[^>]+>/ /gis;

  # Handle inline images.
  $page =~ s/<img\s+([^>]*)>/&clean_inline_images($url, $baseurl, $1)/gies;

  # try to add closing tags, since we may have stripped off the original
  # ones. This allows us to return formatting to the baseline before
  # going on to the next page in a multi-page site.
  #
  if ($main::add_closing_tags) {
    my $tag;
    foreach $tag (qw(blockquote b h1 h2 h3 h4 h5 h6 div em
			  i u code small big strong pre li ul ol font a td
			  tr table))
    {
      if ($page =~ m#^.*<\s*${tag}(?:\s+[^>]+|)\s*>#is && $' !~ m#<\s*/\s*${tag}\s*>#is) {
	&main::dbg ("re-adding stripped closing tag: </$tag>");
	$page .= "</".$tag.">";
      }
    }
  }

  # convert <pre> text to proper HTML, it displays better.
  $page =~ s/<pre>(.*?)<\/pre>/&clean_preformatted_text($1);/gies;
  $page =~ s/<code>(.*?)<\/code>/&clean_preformatted_text($1);/gies;

  # strip all existing line breaks, they will just confuse matters
  # when we convert to text or HTML. It's also easier to do proper diffs
  # when we control the placement of newlines.
  $page =~ s/[\r\n]+/ /gs;

  if ($format == $OUT_DOC) {
    # Create DOC bookmarks at <a name> tags
    # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu>
    # via Christopher Heschong's <chris@screwdriver.net>
    # webpage-to-prc converter. Nice one lads, good trick!
    $page =~ s/<a\s+name.*?>/$main::bookmark_char /gis;
  }

  if ($format == $OUT_HTML) {
    if (!$main::fileperpage) {
      $one_page_anchor = &href_to_singlepage_anchor ($url);
      $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g;
      $page =~ s{<a\s+name=(\"?)(.*?)(\"?)>}
      		{<a name=$1${one_page_anchor}__HASH__$2$3>}gis;
    }

    # note the conversion of href= to href!!!=. This stops the second
    # substitution from screwing up the output of the first one!
    $page =~ s/(<a\s+[^>]*href)=\s*(?:\"|%22)([^\">]+)(?:\"|%22)([^>]*?>)(.*?)<\/a>/
	    &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/(<a\s+[^>]*href)=\s*([^>\s\n]+)([^>]*>)(.*?)<\/a>/
	    &translate_link ($url, $baseurl, $2, $4, $1.'!!!=', $3);
    	/gies;
    $page =~ s/href!!!=/href=/gis;	# back to normal

    # This works around a bug (I think) in iSilo that makes Wired News
    # indexes look rotten. Shouldn't be harmful anyway.
    $page =~ s/<br>\s*\&nbsp;\s*<br>/<br><br>/gis;

    # clean up useless tags and whitespace at the start and end of the text.
    1 while $page =~ s,^\s*<(?:br|hr|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>,,gis;
    1 while $page =~ s,<(?:br|hr|td|table|p|tr|h\d|div)\s*[^>]*>\s*$,,gis;

    # remove now-empty table items, text markup, paragraphs etc.  the
    # ordering of the tags in the foreach loop is important; strip the
    # "smallest" ones first. (actually, don't do td's, they can
    # affect the formatting quite a lot.)
    #
    # TODO - this is currently offline - some HTML will cause an infinite
    # loop in perl's regular expression implementation.
    #
    if ($main::strip_empty_tag_sets) {
      foreach $tag (qw(b i u em font small big strong code div li ul ol
	  blockquote h1 h2 h3 h4 h5 h6 pre table))
      {
	$page =~ s{<\s*${tag}(?:\s+[^>]*|\s*)>(?:\s+|<\s*br\s*>|\&nbsp;)*<\s*\/\s*${tag}\s*>}{
	  &main::dbg ("stripping now-empty tag set: $&");
	}gies;
      }
    }

    # since we're rendering to HTML, line breaks are OK. Put them back in!
    $page =~
      s,(<(?:br|p|hr|table|td|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis;

    # strip colors.
    $page =~ s,(<\S+\s*[^>]*\s)(?:bg|fg|border|)color=[\"']?[-_\#0-9a-z]+[\"']?,$1,gis;
  }

  if ($format == $OUT_DOC || $format == $OUT_TEXT) {
    # We're converting to DOC or text format, so we need to do a lot
    # more work here.

    # a sidebar enclosed by a table? separate it from the rest of the text.
    $page =~ s/<\/tr>/\n\n/gis;
    $page =~ s/<\/table>/\n\n/gis;	# end of <table>
    $page =~ s/<\/pre>/\n\n/gis;	# end of <pre> text
    $page =~ s/<(\/h\d|h\d)(\s+[^>]+|)>/\n\n/gis;	# headings
    $page =~ s/<\/?blockquote(\s+[^>]+|)>/\n\n/gis;	# quotes
    $page =~ s/<hr(\s+[^>]+?|)>/\n\n/gis;	# horiz lines
    $page =~ s/<br(\s+[^>]+?|)>/\n/gis;	# end-of-line markers
    $page =~ s/<li(\s+[^>]+?|)>/\n/gis;	# list items

    $page =~ s/<\/?p(\s+[^>]+?|)>/\n\n/gis;
    # don't worry, multiple blank lines are sorted later

    $page =~ s/<\/td>/\n/gis;		# end-of-table-item

    1 while ($page =~ s/<[^>]+?>//gs);	# trim all other tags

    decode_entities($page);

    # trim multiple (blank) bookmarks
    $page =~ s/($main::bookmark_char\s+){2,}/$main::bookmark_char /gs;
  }

  # Convert some HTML entities that the viewers can't handle.
  $page =~ s/\&apos;/\'/gi;	# confuses iSilo
  $page =~ s/\&\#150;/-/gi;	# bad Industry Standard - no cookie!

  $page =~ s/[ \t]+/ /g;	# canonicalise down to one space
  $page =~ s/\n /\n/gs;		# leading w/s on each line
  $page =~ s/\n{3,}/\n\n/gs;	# too many blank lines
  $page =~ s/^\s+//gs;		# blank space at start of story
  $page =~ s/\s+$//gs;		# blank space at end of story

  $page;
}

# ---------------------------------------------------------------------------

sub check_for_oldest {
  my $url = shift;

  my $fullurl = $url; $url = &URLWithoutAnchor ($url);
  $url =~ m,http://(\S+?)/,i; my $urlhost = $1;
  return unless defined ($urlhost);

  #&main::dbg ("checking to see if $url is oldest at its site: modtime=".
  	#(defined $last_modtime{$url} ? $last_modtime{$url} : "unknown)"));
  if (defined $last_modtime{$url}) {
    if (defined($main::oldest_already_seen_this_site{$urlhost})
    	? $main::oldest_already_seen_this_site{$urlhost} > $last_modtime{$url} : 1)
    {
      &main::dbg ("oldest link seen at $urlhost $url: modtime=".$last_modtime{$url});
      $main::oldest_already_seen_this_site{$urlhost} = $last_modtime{$url};
    }
  }
}

# ---------------------------------------------------------------------------

sub pre20_generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &URLWithoutAnchor ($url);
  $url =~ s/[^-_A-Za-z0-9]/_/g;

  if ($main::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($url =~ /^(.+)(.{16})$/) {
      my $hash = unpack ("%16C*", $1);
      $url = sprintf ("%4x_%s", $hash, $2);
    }
  }

  $url = $dir.$slash.$url;	# put it in the directory/folder
  $url;
}

sub upgrade_cache_directory_if_needed {
  local ($_);
  my $olddir = shift;
  $upg_cache_newdir = $olddir;		# in place
  my $ver = 0;

  if (open (IN, "<".$olddir.$slash."cache.cf")) {
    while (<IN>) { /^version (\d+)/ && ($ver = $1+0); }
    close IN;
  }

  if ($ver < 2) {
    warn "Upgrading cache directory to version 2 format...\n";
    $dompat = "(?:com|org|net|gov|mil|int|edu|ie|uk|hu|hr|fr|".
		 "us|de|il|mx|br|nl|se|pl|no|fi|in|gr|be|za)";

    sub upg_cache_file_to_2 {
      # skip dirs, they're part of the new structure.
      if (/_/ && -d $_) { $File::Find::prune = 1; return; }

      return unless (-f _);
      if ( (/^([a-z0-9]+)___([-_a-z0-9]+?_${dompat}(?:_\d+|))_(.*)$/i)
	|| (/^([a-z0-9]+)___(\d+_\d+_\d+_\d+(?:_\d+|))_(.*)$/i)
	|| (/^(file)___()(.*)$/i))
      {
	my ($proto, $site, $path) = ($1, $2, $3);
	$site =~ s/_$//;
	my $url = $proto."://".$site."/".$path;
	my $newname = &generic_cache_fname ($upg_cache_newdir, $url);

	rename ($_, $newname)
		or warn "rename \"$_\" -> \"$newname\" failed!\n";

      } else {
	warn "Didn't know how to upgrade cache file, ignored: $_\n";
      }
    }

    find(\&upg_cache_file_to_2, $olddir);

    open (OUT, ">".$olddir.$slash."cache.cf");
    print OUT "version 2\n"; close OUT;
  }
}

sub generic_cache_fname {
  my $dir = shift;
  my $url = shift;

  if (!defined $dir) { return undef; }

  $url = &URLWithoutAnchor ($url);		# trim #anchors
  
  my $site;
  my $path;
  if ($url =~ m,^([\w]+://[^/]+)\/(.+)$,) {
    $site = $1; $path = $2;
  } else {
    $site = $url; $path = '_'; $site =~ s/\/$//;
  }
  $site =~ s,^http://,,i; $site =~ s,^www\.,,i;	# trim common stuff
  $site =~ s/[^-_A-Za-z0-9]/_/g;
  $path =~ s/[^-_A-Za-z0-9]/_/g;
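
  # e.g. "http://www.example.com/news/today.html" becomes
  # "$dir/example_com/news_today_html" -- one subdirectory per site.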

  if ($main::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($site =~ /^(.+)(.{16})$/) {
      $site = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
    if ($path =~ /^(.+)(.{16})$/) {
      $path = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
  }

  $site = $dir.$slash.$site;
  if (!-d $site) {
    mkdir ($site, 0777) or die "failed to mkdir '$site'\n";
  }
  $site .= $slash.$path;
  $site;
}

sub cachefilename { &generic_cache_fname ($cachedir, shift); }
sub sharedcachefilename { &generic_cache_fname ($sharedcache, shift); }
sub newcachefilename { &generic_cache_fname ($newcachedir, shift); }

sub get_cached_page {
  my $url = shift;
  my $is_diff_page = shift;
  my $cachefile = &cachefilename ($url);

  if (!defined $cachefile) { return undef; }

  # if -refresh is on, do not return any cached pages.
  if ($main::refresh) { return undef; }

  if (open (IN, "< $cachefile")) {
    binmode IN; my $cachedpage = join ('', <IN>); close IN; $cachedpage;
  } else {
    undef;
  }
}
sub get_cached_page_for_diff { &get_cached_page (@_, 1); }

sub get_page {
  &http_get (@_, 0);	# only text content types
}

sub get_img {
  &http_get (@_, 1);	# allow binary files
}

sub http_get {
  my $baseurl = shift;
  my $url = shift;
  my $is_dynamic_html = shift;
  my $allow_binary = shift;
  my $page = '';

  $url = &URLWithoutAnchor ($url);
  my $cachefile = &cachefilename ($url);
  my $cachedpage = &get_cached_page ($url);
  &check_for_oldest ($url);
  my $lastmod;

  if (defined $cachefile && defined $cachedpage) {
    if ($is_dynamic_html == 0) {
      &main::dbg("cached version exists");
      return $cachedpage;

    } elsif (defined (-M $cachefile)
    	&& -M _ < $main::cached_front_page_lifetime
	&& -M _ > 0)		# just make sure the clock is sane
    {
      &main::dbg("cached version is new enough: ".(-M $cachefile)." days");
      return $cachedpage;

    } elsif ($main::use_only_cache) {
      &main::dbg("-fromcache switch is on, using cached version");
      return $cachedpage;
    }
  }

  # see if we have it in the shared cache
  if (defined $sharedcache) {
    $cachedpage = undef;
    $cachefile = &sharedcachefilename ($url);
    if (defined $cachefile && (open (IN, "< $cachefile"))) {
      binmode IN; $cachedpage = join ("", <IN>); close IN;
      if ($cachedpage =~ s/^<!-- lastmod: (\d+) -->//) {
	$lastmod = $1+0;
      }
    }

    if (defined $cachefile && defined $cachedpage) {
      if ($is_dynamic_html == 0) {
	&main::dbg("shared-cache version exists");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;

      } elsif (defined (-M $cachefile)
	  && -M _ < $main::cached_front_page_lifetime && -M _ > 0)
      {
	&main::dbg("shared-cache version is new enough: ".(-M $cachefile)." days");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;

      } elsif ($main::use_only_cache) {
	&main::dbg("-fromcache switch is on, using shared-cache version");
	if (defined $lastmod)
	      { $last_modtime{$url} = $lastmod; &check_for_oldest ($url); }
	return $cachedpage;
      }
    }

    undef $cachedpage;	# if it didn't pass those tests, don't keep it!
  }

  if ($main::use_only_cache) {
    &main::dbg("-fromcache switch is on, not doing HTTP request");
    return undef;
  }

  if (!$allow_binary && ($url =~ /\.(ra|ram|wav|jpeg|jpg|gif|mov|zip|rar)$/i
    	|| $url =~ /\.(tar|tgz|gz|tbz|bz2|rpm|swf|mpeg|mpg)$/i))
  {
    &main::dbg("not retrieving non-HTML content: $url");
    return undef;
  }

  my $resp;
  my $retries;

  for ($retries = 0; $retries < 4; $retries++) {
    if ($got_intr_flag) { return undef; }

    my $req = new HTTP::Request ('GET', $url);	# REVISIT - support POST
    $req->header ("Accept-Language" => "en",
	  "Accept-Charset" => "iso-8859-1,*,utf-8");

    # cookie_jar will assume that it's a HTTP request. Reasonable enough
    # I suppose...
    if ($url =~ /^http:/i) {
      $main::cookie_jar->add_cookie_header($req);
    }

    $resp = undef;
    $cmd = '$resp = $main::useragent->request ($req);';

    my $timeout = 10;	# minutes
    undef $ScoopHTTP::UserAgent::last_realm;

    # REVISIT -- implement timeout for Win32 perl
    if (&Portability::MyOS eq 'UNIX') {
      eval '
	local $SIG{"ALRM"} = sub { die "alarm\n" };
	alarm $timeout*60; { ' . $cmd. ' } alarm 0;
      ';
    } else {
      eval $cmd;
    }

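    # the eval dies with "alarm\n" when the SIGALRM timeout handler fires;
    # any other death is a real error and gets re-raised here.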
    die if $@ && $@ ne "alarm\n";
    if ($@) {
      &sitewarn ("HTTP GET timed out, $timeout minutes without a response.");
      &got_intr;
    }
    if ($got_intr_flag) { return undef; }

    if (!$resp->is_success) {
      &sitewarn ("HTTP GET failed: ".$resp->status_line." ($url)");
      if ($resp->status_line =~ /^401/ && defined ($ScoopHTTP::UserAgent::last_realm))
      {
	if (defined ($ScoopHTTP::UserAgent::site_logins{$ScoopHTTP::UserAgent::last_realm}))
	{
	  &verbose ("Deleting incorrect username and password for this realm.");
	  undef $ScoopHTTP::UserAgent::site_logins{$ScoopHTTP::UserAgent::last_realm};
	  undef $ScoopHTTP::UserAgent::site_passes{$ScoopHTTP::UserAgent::last_realm};
	  next;		# re-request page
	} else {
	  &verbose ("Cannot read page, it requires a username and password.");
	}
      }
      return undef;
    }

    if (!$allow_binary && (defined($resp->content_type) &&
      	$resp->content_type ne '' && $resp->content_type !~ /^(text\/|multipart\/)/))
    {
      &verbose ("Non-text content: Content-Type: ".$resp->content_type.".");
      return undef;
    }

    if (defined $need_login_url{$baseurl}) {
      if (match_url ($resp->base, $need_login_url{$baseurl})) {
	&verbose ("Page requires a username and password, requesting...");
	&get_basic_credentials ($main::useragent, $baseurl, $url);
	next;		# re-request page
      }
    }

    $page = $resp->content;
    # handle (ugh) Javascript or meta-tag redirects
    if ($page =~ /meta\s+http-equiv=\"refresh\"\s+content=\"[^\"]*url=([^\"\;]+)[\"\;]/is)
    {
      $url = $1;
      &verbose ("Redirected by META tag to: $url");
      next;
    }

    last;		# break out of for loop
  }

  if (defined $resp->last_modified) {
    $lastmod = $resp->last_modified;
    &main::dbg ("last-modified time for $url: $lastmod (".&time2datestr($lastmod).")");

    if (defined $last_modtime{$url} && defined($lastmod)
      && $lastmod <= $last_modtime{$url} && !$main::refresh &&
      !$allow_binary)
    {
      &verbose ("Skipping (no mod since last download): $url");
      $last_modtime{$url} = $lastmod+0; &check_for_oldest ($url);
      return undef;
    }

  } else {
    &main::dbg ("last-modified time for $url: not provided");
    $lastmod = time;
  }
  $last_modtime{$url} = $lastmod; &check_for_oldest ($url);

  if (!$is_dynamic_html && defined $cachedpage && $cachedpage eq $page
    	&& !$main::refresh && !$allow_binary)
  {
    &verbose ("Skipping (HTML has not changed): $url");
    return undef;
  }
  $page;
}

# ---------------------------------------------------------------------------
 
sub cache_page {
  my ($url, $page, $cachelater) = @_;

  $url = &URLWithoutAnchor ($url);
  my $cachefile = &newcachefilename ($url);

  # if this page is the latest version of a diffed page, don't cache it
  # immediately, as it will mean lost stories if we're interrupted.
  # Instead save the filename for renaming when the run finishes.
  if (defined $cachelater && $cachelater == 1) {
    my $tmpname = $cachefile.'.tmp';
    $main::caches_to_rename{$tmpname} = $cachefile;
    $cachefile = $tmpname;
  }

  open (C1OUT, "> $cachefile"); binmode C1OUT; print C1OUT $page; close C1OUT;

  if (defined $sharedcache) {
    $cachefile = &sharedcachefilename ($url);
    open (C2OUT, "> $cachefile"); binmode C2OUT;
    if (defined $last_modtime {$url}) {
      # cache the last-modified time of this page as well.
      print C2OUT "<!-- lastmod: ",$last_modtime{$url}," -->\n";
    }
    print C2OUT $page; close C2OUT;
  }
  $page;
}
 
sub cache_page_later { &cache_page ($_[0], $_[1], 1); }

# ---------------------------------------------------------------------------

sub write_as_story {
  local ($_);
  my ($is_front, $url, $baseurl, $page, $headline, $upindex) = @_;

  my $fullurl = $url;
  $url = &URLWithoutAnchor ($url);

  my $sitename = $name{$baseurl};
  if (!defined $sitename) { $sitename = $url; }

  my $proc = get_layout_param ('story_postproc', $baseurl, $url);
  if (defined $proc) {
    my $bookmark_char = $main::bookmark_char;	# convenience for PostProc
    $_ = $page;
    if (!eval $proc."; 1;") {
      &sitewarn("StoryPostProc failed: $@");
      # and keep the original $page
    } elsif (!defined $_) {
      &sitewarn("StoryPostProc failed: it left \$_ undefined");
      # and keep the original $page
    } else {
      $page = $_;
    }
  }

  my $outtext = '';
  my $one_page_anchor;
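
  # In HTML output each story gets its own <a name> anchor plus [<<] [^] [>>]
  # navigation; the __SITESCOOPER_STORY_<n> markers stand in for links to the
  # previous story, the page one level up and the next story, keyed by story
  # index (cf. @main::output_story_urls below).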

  if (&writing_html) {
    $one_page_anchor = &href_to_singlepage_anchor ($url);
    if ($main::writeheader) {
      $outtext .=
	"\n\n<hr><i>$sitename: $url</i><br><a name=\"$one_page_anchor\">\n"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($main::current_story_index-1)."\">&lt;&lt;</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($upindex)."\">^</a>]"
	. "[<a href=\"__SITESCOOPER_STORY_"
	.      ($main::current_story_index+1)."\">&gt;&gt;</a>]<br>\n\n";

    } else {
      $outtext .= "<a name=\"$one_page_anchor\">\n";
    }
    $outtext .= $page;

  } else {
    $outtext .= "------------\n";
    if ($main::writeheader) {
      $outtext .= "$sitename: $url\n\n";
    }
    if (&writing_doc) {
      if (defined $headline) {
	&verbose ("(Headline: $headline)");
	$outtext .= "$main::bookmark_char $headline\n";
      } else {
	# use the first line in the story instead
	$outtext .= "$main::bookmark_char ";
      }
    }

    foreach $_ (split (/\n/, $page)) {
      if (&writing_text) {
	# wrap each line after 70 columns
	while (s/^(.{70}\S*)\s+//) {
	  $outtext .= $1."\n";
	}
      }
      $outtext .= $_."\n";
    }

    $outtext .= "\n\n\n";
  }

  if ($main::fileperpage) {
    my ($fname, $relative) = &href_to_multipage_anchor ($url);

    if ($is_front && $fullurl eq $baseurl) {
      # this is the front page, just append it to the index file
      $main::output_file{'MAIN'} .= $outtext;

    } else {
      if (&writing_html) {
	$main::output_file{$fname} = "<html><head></head><body>".
			  $outtext."</body></html>";

	# ensure we have links to the stories from the index page!
	# (breaks for 3-level sites, so leave it out.)
	#if ($main::output_file{'MAIN'} !~ /href=\"?$relative/) {
	  #if (!defined $headline) { $headline = '(no headline found)'; }
	  #$main::output_file{'MAIN'} .= "<a href=$relative>$headline</a><br>\n";
	#}
      }
    }

    if (&writing_html) {
      &add_snarfed_link ($relative);
      $main::output_story_urls[$main::current_story_index] = $relative;

      if ($fullurl ne $url) {
	my ($ffname, $frelative) = &href_to_multipage_anchor ($fullurl);
	&add_snarfed_link ($ffname);
	&add_snarfed_link ($frelative);
      }

    } else {
      $main::output_file{$fname} .= $outtext;
    }

  } else {
    $main::output_file{'MAIN'} .= $outtext;

    if (&writing_html) {
      &add_snarfed_link ('#'.$one_page_anchor);
      $main::output_story_urls[$main::current_story_index] = '#'.$one_page_anchor;

      if ($one_page_anchor =~ /__HASH__/) {
	&add_snarfed_link ('#'.$`);
      }
    }
  }

  $main::current_story_index++;
  &add_snarfed_link ($url);
  if ($fullurl ne $url) { &add_snarfed_link ($fullurl); }
  &up_file_size ($url, length($outtext), "story");
  $stories_found++;
}

# ---------------------------------------------------------------------------

sub up_file_size {
  my $url = shift;
  my $siz = shift;
  my $typetxt = shift;

  $file_size += $siz;
  &dbg ("$typetxt written, ". ($file_size/1024)." Kb, limit is ".
  			$main::filesizelimit." Kb");

  if ($file_size/1024 >= $main::filesizelimit) {
    $hit_file_size_limit = 1;
  }
}

# ---------------------------------------------------------------------------

sub add_snarfed_link {
  my $url = shift;
  #&main::dbg ("Tracking snarfed link: $url");		# J M D
  $main::output_links_snarfed{$url} = 1;
}

# ---------------------------------------------------------------------------

sub href_to_multipage_anchor {
  my $url = shift;
  my $type = shift;

  if (!defined $type) { $type = '.html'; }

  my $anchor = &URLAnchor ($url);
  $url = &URLWithoutAnchor ($url);

  if (!defined $page_to_tmpfile{$url}) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
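    # e.g. a story URL might map to something like "$outtmp/0f3caaa.html";
    # the "aaa" suffix is bumped (aab, aac, ...) if two URLs hash alike.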

    my $hash = unpack ("%16C*", $url);
    my $h = sprintf ("%04x", $hash);
    my $acc = 'aaa';
    my $name = $outtmp.$slash.$h.$acc.$type;

    while (defined $tmpfile_assigned{$name})
    		{ $name = $outtmp.$slash.$h.(++$acc).$type; }

    &add_page_tmpfile ($name, $url);
    &main::dbg ("page file for $url: $page_to_tmpfile{$url}");
  }

  $page_to_tmpfile{$url} =~ /[\\\/\:]([-_\.A-Za-z0-9]+)$/;
  ($page_to_tmpfile{$url}, $1.$anchor);
}

sub clear_page_tmpfiles {
  %page_to_tmpfile = ();
}

sub add_page_tmpfile {
  my $fname = shift;
  my $url = shift;
  $page_to_tmpfile{$url} = $fname;
  $tmpfile_assigned{$fname} = 1;
}

sub href_to_singlepage_anchor {
  my $anchor = shift;
  $anchor =~ s/#/__HASH__/g;
  $anchor =~ s/[^-_A-Za-z0-9]/_/g;
  $anchor;
}

# ---------------------------------------------------------------------------

sub warn_log {
  my $msg = join ('', @_); chomp $msg;
  &log ("Warning: ", $msg);
  print STDERR @_;
}

sub die_log {
  my $msg = join ('', @_); chomp $msg;
  &log ("Fatal: ", $msg);
  print STDERR @_; &cleanexit(2);
}

sub log {
  if (defined fileno LOGFILE) { print LOGFILE @_, "\n"; }
}

sub journal {
  if (defined fileno JOURNAL) {
    my $tag = shift;
    my $lines = join("", @_); $lines =~ s/^/$tag:\t/gm;
    print JOURNAL $lines, "\n";
  }
}

sub dbg {
  if ($main::debug != 0) {
    my $msg = "debug: ".join ('', @_); chomp $msg; &log ($msg);
    print STDERR "debug: ",@_,"\n";
  }
}

sub sitewarn {
  &sitewarn_file_line ($sitewarn_current_site_line, @_);
}

sub sitewarn_file_line {
  my $fname = shift;
  $fname =~ s,^.*[\/\\:]([^\/\\:]+?):\d+?$,$1,o;
  warn "Site \"$fname\": ".join('', @_)."\n";
}

sub verbose {
  if ($main::verbose) {
    my $msg = join ('', @_); chomp $msg; &log ($msg);
    print STDERR @_,"\n";
  }
}

sub cleanexit {
  $SIG{__WARN__} = '';
  $SIG{__DIE__} = '';
  exit @_;
}

sub AbsoluteURL {
  local ($baseurl, $_) = @_;
  s/^\"//; s/\"$//;		# trim quotes if necessary
  s/^\'//; s/\'$//;		# dodgy quotes
  s/^%22//; s/%22.*?$//;	# trim escaped quotes (!!)
  s/&amp;/&/g;			# HTML escapes are not supposed to be in URLs

  if (/^[^\/]+:/) {
    if (!/^(http|file):/) {
      # non-HTTP urls get ignored; don't get URI::URL involved, it'll crash
      return $_;
    }
  }

  use URI::URL;
  my $url = new URI::URL ($_, $baseurl);
  $url->abs->as_string;
}

sub AddHostToURL {
  # a simpler form of AbsoluteURL, used for StoryURL lines.
  # this is necessary because the real thing will escape metacharacters
  # which screws up regexp patterns.

  local ($baseurl, $_) = @_;
  s/^"//; s/"$//;	# trim quotes if necessary
  $_ = &expand_url_magic ($_);	# allow [[MM]] etc. keywords in these patterns

  if (m,^[^/]+://,) {
    # do nothing, it's fully-qualified
  } elsif (m,^/,) {
    $baseurl =~ m,^([^/]+://[^/]+)/, and ($_ = $1.$_);
  }
  $_;
}

sub URLWithoutAnchor {
  my $url = shift;
  if ($url =~ /#/) {
    $`;
  } else {
    $url;
  }
}

sub URLAnchor {
  my $url = shift;
  if ($url =~ /#/) {
    "#".$';
  } else {
    "";
  }
}

sub mm_to_monthname {
  my @months = qw(x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
  $months[$_[0]];
}

sub get_extra_date {
  my $time = shift;
  my ($x, $wday, $min, $hr);
  ($x,$min,$hr,$x,$x,$x,$wday,$x,$x) =
  		localtime(defined $time ? $time : time);
  my @days = qw(Sun Mon Tue Wed Thu Fri Sat);
  ($min, $hr, $days[$wday]);
}

sub get_date {
  my $time = shift;
  my ($x, $mday, $mon, $year);
  ($x,$x,$x,$mday,$mon,$year,$x,$x,$x) =
  		localtime(defined $time ? $time : time);
  $mon++; $year += 1900;
  ($mday, $mon, $year, &mm_to_monthname($mon));
}

sub time2datestr {
  my $time = shift;
  my ($dd, $mm, $year, $mon) = &get_date ($time);
  "$mon $dd $year";
}

sub match_url {
  my $url = shift;
  my $pat = &expand_url_magic (shift);
  $url = &URLWithoutAnchor($url);
  ($url =~ m#^${pat}$#);	# TODO -- trap errors here
}

sub expand_url_magic {
  local ($_);
  my $url = shift;
  my ($match_url_dd, $match_url_mm, $match_url_yy, $match_url_yyyy, $match_url_Mstr);

  if (!defined $match_url_yyyy) {
    ($match_url_dd, $match_url_mm, $match_url_yyyy, $match_url_Mstr)
    				= &get_date;
    $match_url_yy = $match_url_yyyy; $match_url_yy =~ s/^\d\d//; # trim century
    $match_url_mm = "0$match_url_mm" unless ($match_url_mm =~ /^..$/);
    $match_url_dd = "0$match_url_dd" unless ($match_url_dd =~ /^..$/);
  }
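
  # e.g. on 7 Mar 2001, "[[YYYY]]/[[MM]]/[[DD]]" expands to "2001/03/07",
  # "[[D]]" to "7" and "[[Mon-1]]" to "Feb".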

  $url =~ s/\[\[YYYY\]\]/${match_url_yyyy}/g;
  $url =~ s/\[\[YY\]\]/${match_url_yy}/g;

  $url =~ s{\[\[MM([\+\-]\d+|)\]\]}{
    &offset_month($match_url_mm, $1);
  }ge;

  $url =~ s{\[\[M([\+\-]\d+|)\]\]}{
    &offset_month($match_url_mm, $1)+0;
  }ge;	# single-digit if poss

  $url =~ s{\[\[Mon([\+\-]\d+|)\]\]}{
    &mm_to_monthname (&offset_month($match_url_mm, $1));
  }ge;
  $url =~ s{\[\[mon([\+\-]\d+|)\]\]}{
    $_ = &mm_to_monthname (&offset_month($match_url_mm, $1));
    tr/A-Z/a-z/; $_;
  }ge;
  $url =~ s{\[\[MON([\+\-]\d+|)\]\]}{
    $_ = &mm_to_monthname (&offset_month($match_url_mm, $1));
    tr/a-z/A-Z/; $_;
  }ge;

  $url =~ s/\[\[DD\]\]/${match_url_dd}/g;
  $url =~ s{\[\[D\]\]}{ $match_url_dd+0; }ge;	# single-digit if poss
  $url;
}

sub get_layout_param {
  my $parmname = shift;
  my $key = shift;
  my $url = shift;
  my $ret;
  my $code;
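
  # Lookup order: ExceptionURL patterns first, then the site file's own
  # setting for this key, then any matching layout pattern (@layouts).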

  if (!defined $key) {
    print "get_layout_param with undefined key: $parmname $url ".caller()."\n";
    return undef;
  }

  # Highest priority, check for an ExceptionURL rule.
  if ($#exceptions >= 0) {
    my $pat;
    my $keyprefix = '';
    if ($key =~ /^\d+ /) { $keyprefix = $&; }
    $code = '
      foreach $pat (@exceptions) {
	next unless (match_url ($url, $pat));
	if (defined $'.$parmname.'{$keyprefix.$pat}) {
	  $ret = $'.$parmname.'{$keyprefix.$pat}; last;
	}
      }
    1';

    eval $code or die "eval failed: $@";
    return $ret if (defined $ret);
  }
  
  # check for a parameter defined in the site file for this site first.
  $code = '
    if (defined $'.$parmname.'{$key}) { $ret = $'.$parmname.'{$key}; }
  1';
  
  eval $code or die "eval failed: $@";
  return $ret if (defined $ret);

  # nope -- now check the layouts. Eval the lot for better speed.
  if ($#layouts >= 0) {
    my $pat;
    my $keyprefix = '';
    if ($key =~ /^\d+ /) { $keyprefix = $&; }
    $code = '
      foreach $pat (@layouts) {		# perky! ;)
	next unless (match_url ($url, $pat));
	if (defined $'.$parmname.'{$keyprefix.$pat}) {
	  $ret = $'.$parmname.'{$keyprefix.$pat}; last;
	}
      }
    1';

    eval $code or die "eval failed: $@";
  }

  $ret;
}

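# Add a signed offset to a two-digit month number, wrapping around the year:
# e.g. offset_month("01", "-2") gives "11", offset_month("12", "+1") gives "01".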
sub offset_month {
  my $mm = shift;
  my $offset = shift;
  if ($offset ne '') { $mm += $offset; }
  if ($mm < 1 || $mm > 12) { $mm = ((($mm-1)+12) % 12)+1; }
  $mm = "0$mm" unless ($mm =~ /^..$/);
  $mm;
}

sub writing_doc {
  ($main::outstyle == $main::OUT_DOC);
}

sub writing_html {
  ($main::outstyle == $main::OUT_HTML);
}

sub writing_text {
  ($main::outstyle == $main::OUT_TEXT);
}

#===========================================================================

package Portability;

sub MyOS {
  if (defined ($Portability::MY_OS)) { return $Portability::MY_OS; }

  # FIGURE OUT THE OS WE'RE RUNNING UNDER
  # Some systems support the $^O variable.  If not available then require()
  # the Config library.  [nicked from CGI.pm -- jmason]

  my $os;
  unless ($os) {
    unless ($os = $^O) {
      require Config;
      $os = $Config::Config{'osname'};
    }
  }

  if ($os=~/win/i) {
    $os = 'Win32';
  } elsif ($os=~/vms/i) {
    $os = 'VMS';
  } elsif ($os=~/mac/i) {
    $os = 'Mac';
  } elsif ($os=~/os2/i) {
    $os = 'OS2';
  } else {
    $os = 'UNIX';
  }
  $Portability::MY_OS = $os;
}

1;

#---------------------------------------------------------------------------

package ScoopHTTP::UserAgent;
use LWP::UserAgent;

BEGIN {
  @ISA = qw(LWP::UserAgent);
  @ScoopHTTP::UserAgent::PasswdMask =
  	unpack ("c*", "Ish0ulDReallY#BeDoING|05th1S>wiTh".
			"5omEThInG+STr0NgeR1kNoW}iKNOw!~");
}
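
# Saved site passwords are not encrypted, just obscured: each byte is XORed
# with 0xaa and with the mask string above, written out as space-separated
# numbers and padded to a multiple of 16 entries (see load_logins and
# save_logins below).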

sub new {
  my($class) = @_;
  my $self = new LWP::UserAgent;
  $self = bless $self, $class;
  $self;
}

sub get_basic_credentials {
  my ($self, $realm, $uri, $proxy) = @_;

  $ScoopHTTP::UserAgent::last_realm = $realm;

  if (defined $site_logins{$realm} && defined $site_passes{$realm}) {
    &main::verbose ("(using already-set password for $uri $realm)");

  } else {
    warn ("Need a password to access $realm ($uri).\n");
    if ($main::cgimode || !-t) { return undef; }

    print STDERR ("Username: ");
    my $user = <STDIN>; chop $user;

    print STDERR ("Password: ");
    (&Portability::MyOS eq 'UNIX') and system ("stty -echo");
    my $pass = <STDIN>; chop $pass;
    (&Portability::MyOS eq 'UNIX') and system ("stty echo"); print STDERR "\n";

    $site_logins{$realm} = $user;
    $site_passes{$realm} = $pass;
  }

  ($site_logins{$realm}, $site_passes{$realm});
}

sub load_logins {
  if (%site_logins) { return %site_logins; }

  %site_logins = ();
  %site_passes = ();
  open (IN, '< '.$main::user_tmpdir.$main::slash.'site_logins') or return undef;

  #$site_logins{'tst'} = $site_passes{'tst'} = "jmason"; &save_logins;

  while (<IN>) {
    s/[\r\n]+$//g;
    my ($ver, $user, $pass, $realm) = split (/###/);
    if (defined $realm && $ver+0 == 0) {
      $site_logins{$realm} = $user;

      my @mask = @ScoopHTTP::UserAgent::PasswdMask;
      my @input = split (' ', $pass);
      my $pass_open = '';
      my $i = 0;

      foreach $_ (@input) {
	my $ch = (($_ ^ $mask[$i++ % $#mask]) ^ 0xaa);
	last if ($ch == 0);
	$pass_open .= sprintf ("%c", $ch);
      }

      $site_passes{$realm} = $pass_open;
    }
  }
  close IN;

  #print "[", $site_logins{'tst'}, "][", $site_passes{'tst'}, "]\n"; exit;
}

sub save_logins {
  if (!%site_logins) { return; }
  my $towrite = '';

  foreach $realm (sort keys %site_logins) {
    next unless (defined $site_passes{$realm} && defined $site_logins{$realm});

    my @mask = @ScoopHTTP::UserAgent::PasswdMask;
    my @input = (unpack ("c*", $site_passes{$realm}));
    my $pass_disguised = '';
    my $i = 0;

    foreach $_ (@input) {
      $pass_disguised .= (($_ ^ 0xaa) ^ $mask[$i++ % $#mask]) . " ";
    }
    while ($i < int(($#input / 16) + 1) * 16) {
      $pass_disguised .= ((0 ^ 0xaa) ^ $mask[$i++ % $#mask]) . " ";
    }
    chop $pass_disguised;

    $towrite .= "0###". $site_logins{$realm}. "###". $pass_disguised.
    		"###". $realm. "\n";
  }

  # again, all at once to minimise contention
  open (OUT, '> '.$main::user_tmpdir.$main::slash.'site_logins') or
  	(warn ("failed to write to site_logins file!\n"), return);
  print OUT $towrite;
  close OUT or warn ("failed to write to site_logins file!\n");
}

1;

#---------------------------------------------------------------------------

package StripTablesFilter;
# almost straight from the HTML::Filter pod doc! Thanks to Gisle Aas for
# writing this.

use HTML::Filter;
use HTML::Parser;

BEGIN {
  @ISA = qw(HTML::Filter);
}

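# {td_strip} counts how deeply we are nested inside a <td> judged too narrow
# (the same width thresholds as smart_clean_table: under 250 pixels, or under
# 40%); while it is non-zero, output() discards the parsed HTML instead of
# buffering it.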
sub start {
  my $self = shift;

  if ($_[0] eq "td") {
    if (!defined ($self->{td_strip})) { $self->{td_strip} = 0; }

    if ($self->{td_strip}) {
      $self->{td_strip}++;
#print STDERR "JMD td strip on: $self->{td_strip}\n";

    } elsif (defined (${_[1]}{'width'})) {
      my $width = ${_[1]}{'width'};
      my $omit = 0;

#print STDERR "JMD td strip width: ".$width."\n";
      if ($width =~ /^(\d+)%$/i) {
	if ($1+0 < 40) { $omit = 1; }
      } else {
	if ($width+0 < 250) { $omit = 1; }
      }
      if ($omit) {
	&main::dbg ("table item ".$_[3]." omitted");
	$self->{td_strip}++;
      }
    }
  }

  $self->SUPER::start(@_);
}

sub end {
  my $self = shift;
  $self->SUPER::end(@_);

  if (defined $_[0] && $_[0] eq "td") {
    if (defined $self->{td_strip} && $self->{td_strip} > 0)
    		{ $self->{td_strip}--; }
#print STDERR "JMD td strip off: $self->{td_strip}\n";
  }
}

sub output {
  my $self = shift;

#print STDERR "JMD output: ".$self->{td_strip}." $_[0]\n";
  if (!defined $self->{td_strip} || $self->{td_strip} == 0) {
    push(@{$self->{fhtml}}, $_[0]);
  }
}

sub filtered_html {
  my $self = shift;
  join("", @{$self->{fhtml}})
}

1;

#---------------------------------------------------------------------------
# 
#CGI package ScoopCGI;
#CGI 
#CGI $cgi_cookie = undef;
#CGI 
#CGI sub set_cookie {
#CGI   my ($userid) = @_;
#CGI   $cgi_cookie = $main::cgi->cookie(-name=>'sitescooper', -value=>"$userid");
#CGI   print $main::cgi->header(-cookie=>$cgi_cookie);
#CGI }
#CGI 
#CGI sub get_cookie {
#CGI   my $cookie = $main::cgi->cookie('sitescooper');
#CGI   return unless defined ($cookie);
#CGI 
#CGI   my ($uid, $x) = split ('#', $cookie);
#CGI   ($uid =~ /(\d+)/) and ($main::userid = $1);
#CGI }
#CGI 
#CGI sub print_input_form {
#CGI   # REVISIT
#CGI }
#CGI 
#CGI sub print_results_links {
#CGI   # REVISIT
#CGI }
#CGI 
#CGI sub get_prc_file {
#CGI   # REVISIT
#CGI }

1;

# TODO:
#
# URLs at end like [1] this
# finish CGI support
# CGI: finish cookie userid support -- passwords
#
#---------------------------------------------------------------------------
# vim:sw=2:tw=74:
