#!/local/bin/perl # narf. News Article Rejection Filter. # Visit narf's web page at # http://utcc.utoronto.ca/abuse/antispam-stuff/narf/ $::NARFVERSION = "0.9301"; # global so filter can use it # # This program attempts to emulate INN enough for the INN antispam # filters to cut in and work well. Like INN, it runs forever. # However, it's not INN. It works by filtering CNews batches from # one directory (the source) to another (the destination), exploding # them into articles in transit and passing each article through # the INN antispam filter (of your choice; it uses filter_innd.pl # intact, so drop in your favorite one). Only accepted articles are # written out to the new batch files. # We use: http://www.exit109.com/~jeremy/news/antispam.html # Some portions of this code do depend on the filter being used, # especially the saveemphist and loademphist routines. # # Usage is: # -v - verbose # -V NUM - give an explicit verbosity level # -D - dryrun, don't actually remove files or write # destination batches. # -O - run only once and then exit, don't sleep # -c NUM - do only NUM batches at most in any pass # (otherwise a pass processes all batches) # -m MAXSIZE - maximum size of articles accepted (including headers) # -b BYTES - write destination batches of at most BYTES bytes # -s/-e/d DIR - set source, destination, or error batch directory # -q DIR - write certain sorts of rejected articles to DIR # -l FILE - set logfile # -w SECS - wait SECS seconds between passes. # -P PIDFILE - file to write our PID in # -C CONFFILE - configuration file loaded periodically to allow # for adjustments. # -S STOPFILE - if this file exists, stop processing and wait until # it stops existing before resuming. # -E EMPFILE - file to load and save EMP history from. 
# -X		- flip eXtra reporting about rejected messages on
#		  (or off)
# -N		- flip logging newsgroups of rejected messages on
#		  (or off)

# usage(): print a one-line usage summary and exit non-zero.
# Any arguments are ignored.
sub usage {
	print "usage: $0 [-vDOXN] [-c NUM] [-m MAXSIZE] [-b BYTES] [-s SRCDIR] [-e ERRDIR] [-d DESTDIR] [-l LOGFILE] [-w SECS] [-P PIDFILE] [-C CONFFILE] [-S STOPFILE] [-E EMPFILE] [-q DUMPDIR]\n";
	exit 1;
}
# XXX: update the getopts string down at the bottom when one updates this.

# We require a recent perl. This probably still runs under perl 5.002,
# but I'm reluctant to assert that without testing it.
use 5.004;			# tested only under perl 5.004.

# The name narf was contributed by Paul Kern. Chris Siebenmann's original
# name for it was much less interesting.
#
# Bugs: this needs more theory of operation information.
# The documentation should be in POD format instead of in comments.
#
# Digression on output batch sizes:
#  The algorithm for clipping the output batch size is naive.
#  Basically, at the end of every input batch, if the output batch
#  is equal to or larger than the batch maximum size a new output
#  batch is started.
#  A new output batch is always started for each directory run,
#  even if the last one only wrote a small batch; we don't hold a
#  batch open while we sleep.
#
# Signals:
#  Sending a SIGHUP asks the program to close and reopen files (the
#  log file and the output batch file) so you can change them. It's
#  not all that useful, because the program runs so fast that it will
#  likely do it on its own soon. SIGHUP also asks narf to save its
#  EMP history if possible.
#
#  Sending a SIGTERM asks it to exit after the next batch has finished.
#
# Speed: disgustingly fast. About 30 megs of batches a minute on an Indy.
# Memory usage: potentially large, but then it's rejecting spam for you.
# It does have to read each article into memory and potentially copy it
# around some; if this bothers you, it can be set to discard articles
# that are too big without looking at them at all.
# (Such articles won't
# have their message-id logged, since it would have to look at the
# article to find that.)
#
# Log format:
# If it starts with a -, it's a rejected article. Format:
#	- MSGID RNEWS-SRC SIZE REASON
#   MSGID may be '-' if there was none.
# If it starts with a +, it's an accepted article. Format:
#	+ MSGID RNEWS-SRC SIZE
# If it starts with a '!', it's an error. Freeform.
# If it starts with a ':', it's an information message. Freeform.
# If it starts with a '_', it's extra information about a rejected article.
# If it starts with a '=', it's extra information about an article. This is
#  mostly freeform, but it always starts with '= WHAT-SORT' and the
#  usual WHAT-SORT will have a constant format.

# Use of 'use strict' requires us to pass filehandles via '*FILEHANDLE{IO}',
# which I'm not sure I understand at all. But sages tell me that this is
# the way to go (both 'use strict' and filehandles).
use strict;

# ---------------------------------------------------------------------
# Configuration section.

# The following are the standard CNews configuration variables; we
# default to deriving the location of everything else from them.
# Generate these from your $NEWSCONFIG file.
my $NEWSCTL = "/news/lib";
my $NEWSBIN = "/news/bin";
my $NEWSARTS = "/news/spool";
my $NEWSOV = "/news/spool";
my $NEWSPATH = "/local/bin:/bin:/usr/bin:/usr/ucb";
my $NEWSUMASK = 022;
my $NEWSCONFIG = "/news/lib/bin/config";

# File and directory locations; defaults are derived from the CNews
# variables above, and most can be overridden by command-line options.
my ($FILTDIR, $SRCDIR, $DSTDIR, $BADDIR, $LOGFILE, $PIDFILE, $STOPFILE,
    $CONFFILE);

# Where to find the filter_innd.pl file.
# NOTEZ BIEN: this file will be RELOADED when CHANGED. It is possible
# to bring the entire daemon down by changing your filter_innd.pl to
# do something stupid (like 'exit').
$FILTDIR = "$NEWSCTL/bin/narf";
# Where the incoming batches live
$SRCDIR = "$NEWSARTS/in.nntp";
# Where destination batches go
$DSTDIR = "$NEWSARTS/in.coming";
# Where bad batches go.  By default this is the destination, where
# the real software may be able to demangle them. Note that the
# bad directory and the source directory have to be on the same
# filesystem, so that one can rename() a batch from $SRCDIR to
# $BADDIR.
$BADDIR = $DSTDIR;
# Where certain sorts of rejected articles are written.
# if this is not set, they are not written at all.
$::DUMPDIR = "$SRCDIR/rejected";
# The log file
$LOGFILE = "$NEWSCTL/narf.log";
# Our PID file
$PIDFILE = "$NEWSCTL/narf.pid";
# Our stop file:
$STOPFILE = "$NEWSCTL/narf.stop";
# A config file to (re)load periodically. Call this a crude IPC
# method to talk to us.
$CONFFILE = "$FILTDIR/narf.conf";
# The file used to save (and reload) the EMP history we've accumulated
# This is a global so that narf.conf can reset it after oopsies like
# a narf.emps file with the wrong ownership. (Don't laugh, it happened
# to us once.)
$::EMPFILE = "$FILTDIR/narf.emps";

# Load the filter to define variables.
# Die if it doesn't define things we'll need later.
my ($MSGHIST);
do "$FILTDIR/filter_innd.pl" or die "Filter load error";
die "Filtering functions not defined"
	unless (defined &filter_mode && defined &filter_art);

# -- The message-id cache --
#
# In INN, the filters are run on a stream of articles that has already
# had duplicates suppressed. In narf, we are running on batchfiles,
# which may come from many sources, all operating independently. For
# correct filter operation we may need to suppress duplicates manually
# within the window of articles that the filter itself keeps history
# for; to do otherwise would risk the filter erroneously deciding that
# something had been posted too many times because it saw several copies
# of the same article.
#
# *However*, this may not actually be the case. If you use the Paul
# Vixie message-id daemon (recommended) and your system can in general
# keep up, then you will only see duplicates in the batch stream in
# very rare cases (we see in the single digits a day).  If this is so
# then you can turn the message-id history cache off entirely and save
# memory. We do not recommend this unless you are *entirely sure* you
# know what you are doing. Ideally you will have real measurements of
# how often you see dups and will understand this.
my $DoMSGHist = 1;		# OFF ONLY IF YOU ARE SURE

# Narf's message id cache must be sized to (usually) be larger than
# the number of articles the filter keeps history for. This attempts
# to get it right for our filter or Jeremy Nixon's cleanfeed. Alter
# it for any other filter.
$::ArticleHistSize = 10000 if (!defined $::ArticleHistSize);
$::BIHistSize = $::ArticleHistSize if (!defined $::BIHistSize);
$MSGHIST = $::BIHistSize + 100;	# extra slop for obscure cases

# Message-IDs are cached in @msgids and %msgids, the latter for speed and
# the former for FIFO.
#
# YYY: The following is a hack, but it can save us thrashing. We know
# that the @msgids array will eventually grow to be $MSGHIST+2 large
# at most ($MSGHIST, plus the current article, plus a cancel), so we
# pre-allocate it this large. This will do an extra shift() through
# the core loop for the first $MSGHIST articles, but this is trivial
# in the long term.
my (@msgids, %msgids);
$#msgids = $MSGHIST+2 if ($DoMSGHist);

# global variables:
my ($pleasestop, $hupbing, $savestate, $runonce, $dryrun, $cancreject);
$pleasestop = 0;	# SIGTERM sets this to one, for an orderly shutdown
$hupbing = 0;		# SIGHUP, ditto. Forces buffer flushes.
$savestate = 0;		# asks us to save saveable state.

# options that can or should only be set at program startup.
$runonce = 0;		# run only once
$dryrun = 0;		# do we not do file operations?
			# (creating a real destination batchfile,
			# removing input batch files)
$cancreject = 1;	# reject cancels for spam we've rejected when
			# we can.

# options that might be tweaked while we run.
$::MAXARTSIZE = 0;	# no maximum article size.
$::waitint = 120;	# how long to sleep while waiting for work.
$::batchsize = 1300000;	# how many bytes a finished batch can be before
			# we rotate it. This is large so that we can
			# pass relaynews large batches to amortize the
			# overhead.
$::maxbatches = 0;	# how many batches we process in one pass
$::xtra = 1;		# report extra details about rejected articles
$::logngs = 0;		# log newsgroups in _ lines
$::logsigs = 1;		# log article signatures in _ lines
$::verbose = 3;		# verbosity. The meaning of verbosity:
			# 0 - log only rejections
			# 1 - also log accepted articles
			# 2 - also log the start and end of runs (with times)
			# 3 - also log the start and end of batches (with times)
			# 4 - print some debugging output
			# 5 - print some more debugging output

# Stave off memory fragmentation by only copying so much of an article
# body into the __BODY__ element of %::hdr (important in the face of
# people posting articles that are 600k, 900k, 1M, or even 3M in size).
# This is dangerous if the filter thinks it is looking at the entire
# article in __BODY__.
# If it is safe, the filter can create the global variable $::MaxArtSize,
# which is the number of bytes of article body at and above which the filter
# behaves no differently regardless how many more bytes the body has. Narf
# will automatically pick up this value.
# If your filter does not and you want to trim anyways, set $::MAXARTCOPY
# here. But you're probably better off setting $::MaxArtSize in your filter.
my ($maxartcopy);
$::MAXARTCOPY = 0;		# default; allow filter to
				# determine the right value
# $maxartcopy is the working variable. *Don't* set it here.
# (postload_hook() recomputes it from $::MAXARTCOPY and $::MaxArtSize.)

# This hash maps the lower-case version of a header to the canonically cased
# version thereof that filter_innd.pl may depend on. It is in no particular
# order and I hate it.
my (%propcase);
%propcase = qw/message-id Message-ID control Control sender Sender
	subject Subject lines Lines from From
	nntp-posting-host NNTP-Posting-Host newsgroups Newsgroups
	followup-to Followup-To content-type Content-Type
	organization Organization content-disposition Content-Disposition
	content-base Content-Base references References
	x-cancelled-by X-Cancelled-By x-canceled-by X-Canceled-By
	x-cancelbot X-Cancelbot x-mailer X-Mailer/;
# this is used later.
#$propcase = join "|", keys %propcase;

#
# We maintain a separate cache of cancel messages for spam that we
# have rejected (well, we do if cancel rejection is enabled).
# This cache is independent of the message-id cache and is sized
# in a completely different manner. Some statistical crunching on
# our logfiles has led me to this number as excellent. Crunch yours
# if desired to see for yourself.
#
# Our stats suggest that you can get half of everything with a
# 200-message cache; almost 60% with 500; almost 65% with 1000;
# somewhat under 80% with 2000; 85% with 3500; and about 90%
# with 5000. Getting to 95% requires around 20,000 entries.
# How much of a drop rate you want will depend on your feelings
# about how much you want to offload from relaynews and your control
# spool.
$::CMSGHIST = 5000;		# we're generous.
my (@cmsgids, %cmsgids);
# duplicate the message-id pre-allocation hack (see @msgids above).
$#cmsgids = $::CMSGHIST+1 if $cancreject;

# ---------------------------------------------------------------------
# Catch SIGHUP and SIGTERM and react appropriately.
# Handlers only set flags; the main loop polls them at safe points.
sub catch_shutdown {
	$pleasestop = 1;
}
sub catch_hup {
	$hupbing = 1;
	$savestate = 1;
}
$SIG{TERM} = \&catch_shutdown;
$SIG{HUP} = \&catch_hup;

# ----------------------------------------------------------------------
# Subroutines to deal with a cache of message-IDs.
# Although using 'my ($msg) = @_;' appears slightly faster, it does
# involve an extra copy and thus a bit of memory thrashing. We'll pay
# the extra time to avoid that.
# checkmsgid(MSGID): true if MSGID is in the duplicate-suppression cache.
# Always false when the cache is disabled.
sub checkmsgid {
	return $DoMSGHist && exists $msgids{$_[0]};
}
# pushmsgid(MSGID): remember MSGID in the duplicate-suppression cache.
sub pushmsgid {
	return if !$DoMSGHist;
	# I think using 'undef' to create an undefined value
	# is more efficient than setting it to '1' or the like;
	# I could be wrong.
	push @msgids, $_[0];
	$msgids{$_[0]} = undef;
}
# trimmsgids(): shrink the cache back to its configured size ($MSGHIST),
# discarding the oldest entries (FIFO via @msgids).
sub trimmsgids {
	return if !$DoMSGHist;
	local $^W = 0;		# see below
	while ($#msgids > $MSGHIST) {
		# we may attempt to delete an undefined element here,
		# because the pre-filled @msgids cache has the front
		# full of undefined elements, so if we haven't filled
		# it with real messages yet we pop one of them and
		# attempt to delete it. Fortunately this is harmless.
		delete $msgids{shift(@msgids)};
	}
}

# These variant versions of the above routines simply use a different
# cache (that is always present), the cache for cancel suppression.
sub checkcmsgid {
	return $cancreject && exists $cmsgids{$_[0]};
}
sub pushcmsgid {
	return if !$cancreject;
	push @cmsgids, $_[0];
	$cmsgids{$_[0]} = undef;
}
sub trimcmsgids {
	return if !$cancreject;
	local $^W = 0;
	while ($#cmsgids > $::CMSGHIST) {
		delete $cmsgids{shift(@cmsgids)};
	}
}
# delcmsgid(MSGID): remove MSGID from the cancel cache entirely.
# The grep rebuilds @cmsgids, so this is O(cache size).
sub delcmsgid {
	my $k = $_[0];
	delete $cmsgids{$k};
	@cmsgids = grep (($_ ne $k), @cmsgids);
}

# ----------------------------------------------------------------------
# Process a single batchfile. We are passed three filehandles; the first
# is the input batch, the second is the output batch, and the third
# is the place to report rejections to. It returns a true value if all
# went well and 0 on error (NOTE: the original comment here claimed the
# reverse; dobatch() treats a true return as success, and every error
# path below returns 0). Note that rejecting articles *is* considered
# 'going well'.
# We go to some lengths to not make new strings in common paths.
sub munchbatch {
	my ($len, $orig, $tlen, $art, $nlen);
	my ($res, $msg, $k);
	my ($inbatch, $outbatch, $rejectlog, $bname) = @_;
	my $first = 1;
	while (<$inbatch>) {
		# eat the '#! rnews ...' line, checking and parsing it.
		chomp;
		# Does it parse?
		if (!/^#! rnews \d+ \S+$/) {
			# the following is a GORY HACK.
			if (!$first || /^#! rnews /) {
				print $rejectlog "! unparseable batchfile.\n";
				($::verbose > 4) &&
					print "Bogus start line: $_\n";
				return 0;
			}
			# Since this is the first article, we assume that
			# it is a bare article generated via a POST.
			# get the file size.
			# stat should by all rights work on $inbatch, but
			# it does not appear to be a go. Either way from here
			# we lose, sigh.
			if (!defined($len = (stat($bname))[7])) {
				print $rejectlog "! cannot stat $bname for local posting: $!\n";
				($::verbose > 4) &&
					print "Could not stat $bname for local posting: $!\n";
				return 0;
			}
			# ... and make up a fake #! rnews line, using
			# the length.
			$_ = "#! rnews $len LOCAL";
			# ... and seek back to the front of the article,
			# instead of one line in.
			seek $inbatch, 0, 0;
		}
		$first = 0;	# no longer the first article.
		# Trim front ...
		s/^#! rnews //;
		# and split. Guaranteed to be valid, too.
		($len, $orig) = split(' ', $_);
		# Is this article too big?
		if ($::MAXARTSIZE > 0 && $len > $::MAXARTSIZE) {
			if (!seek($inbatch, $len, 1)) {
				print $rejectlog "! failed to seek forward $len bytes.\n";
				return 0;
			}
			# We cannot log the message ID, because we refuse to
			# read the article.
			print $rejectlog "- - ", $orig, " ", $len, " EARTTOBIG\n";
			next;
		}
		# chow down on the actual article.
		$tlen = read $inbatch, $art, $len;
		if ($tlen != $len) {
			print $rejectlog "! wanted $len bytes, got only $tlen.\n";
			return 0;
		}
		# Blow open the headers.
		&headercrack($art);
		# We care more about having a realistic value than being
		# just defined. A defined and blank message-ID is as useless
		# as an undefined one.
		# XXX: Probably we ought not to die.
		die "Null message-ID!" if ($::hdr{'Message-ID'} eq '');
		$msg = $::hdr{'Message-ID'};	# save message-ID.
		# Reject duplicate message-IDs before calling the filter.
		# we check the cancel cache first because it's far more
		# likely to hit.
		if (&checkcmsgid($msg) || &checkmsgid($msg)) {
			if (&checkcmsgid($msg)) {
				print $rejectlog "- ", $msg, " ", $orig, " ", $len, " unneeded/dup cancel\n";
# XXX: although a clever idea for making the cache stretch longer, this
# turns out to have an unfortunate performance impact.
#				&delcmsgid($msg);	# streeetch longer
			} elsif ($::hdr{'Control'}) {
				print $rejectlog "- ", $msg, " ", $orig, " ", $len, " dup cmsg\n";
			} else {
				print $rejectlog "- ", $msg, " ", $orig, " ", $len, " dup\n";
			}
			next;
		} else {
			# remember that we've seen it.
			# We don't bother pushing the message IDs of control
			# messages, because the filter won't consider them
			# spam in any case (well, it had better not or it will
			# delete things we don't want it to). So we might as
			# well save space. This also corresponds to what the
			# filter really tracks, which is what we care about for
			# dup suppression.
			(!defined($::hdr{'Control'})) && &pushmsgid($msg);
		}
		# Call the filtering stuff; the header cracking has given us
		# the %hdr associative array that filter_art() wants.
		$res = &filter_art();
		if (length($res) > 0) {
			# remove any newlines that the filter may have
			# accidentally stuck in $res. This usually happens
			# because in perl, \s matches newlines too.
			$res =~ tr /\n//d;
			# and print information.
			print $rejectlog "- ",$msg, " ", $orig, " ", $len, " rejected: ", $res, "\n";
			&dumpxtra($rejectlog, $msg, $res) if ($::xtra);
			&dumpxtranemp($rejectlog, $msg, $res)
				if ($::xtra && $res =~ /New EMP/);
			&dumpbadart($rejectlog, $msg, $res, \$art, $orig)
				if (!$dryrun && ($::xtra || $orig eq 'LOCAL')
				    && $::DUMPDIR);
			# The following is a heuristic hack. We reject spam;
			# spam is often cancelled; we have no need of cancels
			# for things we've already rejected.
			# We don't do this for things that the filter has
			# learned, because if the filter is restarted they
			# could sneak through again before they're relearned.
			# Nor do we bother immunizing against control messages
			# (including cancels themselves, of course)
			if ($res !~ /EMP/ && !$::hdr{'Control'} && $cancreject) {
# NOTE(review): the source text is CORRUPTED from this point.  A span of
# the original code — apparently everything from a literal '<' in the
# substitution below up to the '>' of a later '($::verbose > 3)' test —
# was stripped (the same HTML-tag-stripping damage seen elsewhere in this
# file).  The missing span contained the end of munchbatch() and the
# head of sub headercrack() (which is why $sepin, $blen, $hl and $i below
# have no visible declarations).  The surviving tokens are preserved
# verbatim; recover the lost code from a pristine copy of narf 0.9301
# before attempting to run this file.
				$msg =~ s/^ 3) && print "Canonicalizing header $i\n"; } }
	# The INN perl patches expect the body in __BODY__ as a
	# header hash element. Supply it.
	if ($maxartcopy && $maxartcopy < $blen) {
		($::verbose > 3) &&
			print "Only creating $maxartcopy bytes from a true body of $blen bytes.\n";
		$::hdr{'__BODY__'} = substr($_[0], $sepin+2, $maxartcopy);
	} else {
		$::hdr{'__BODY__'} = substr($_[0], $sepin+2);
	}
	# Create a Lines: header if we don't have one.
	# (I think INN may do this automatically? I don't know. But the
	# filter depends on it being there. Hmm, inspection suggests that
	# INN does not add one automatically.)
	if (!$::hdr{'Lines'}) {
		my $lcnt;
		# now it gets complicated.
		# if we have truncated the article body via $maxartcopy,
		# then we cannot just count newlines in __BODY__.
		if ($maxartcopy && $maxartcopy < $blen) {
			$lcnt = ($_[0] =~ tr /\n//) - 2 - $hl;
		} else {
			# simple is OK.
			$lcnt = ($::hdr{'__BODY__'} =~ tr/\n//);
		}
		$::hdr{'Lines'} = $lcnt;
		# happens too often on control messages to natter about.
		($::verbose > 3 && !$::hdr{'Control'}) &&
			print "Created header - Lines: $lcnt\n";
	}
	# We record the size of the article itself because this information
	# is not readily recoverable from just the %::hdr array's contents,
	# even neglecting $maxartcopy. It's also cheap.
	$::hdr{'__SIZE__'} = length($_[0]);
}

# Identify an article as being from an interesting place, based on various
# bits of information. This is a separate routine so that it can be
# dynamically reloaded. As such it can't use external my()'d variables, just
# true globals.
# Return null ("") if no special ID can or should be attributed to the
# article.
# dumpxtraid(REASON): classify a rejected article by origin, from the
# rejection reason and the Path: header. Returns a tag string ('NETZILLA')
# or '' when no special origin applies.
sub dumpxtraid {
	my ($res) = @_;
	if ($res =~ /^Netzilla/ ||
	    $::hdr{'Path'} =~ /newsfeeds\.com|news\.jam\.comm?|news\.cm\.org!bcarh189\.bnr\.ca|bcarh8c\.bnr\.ca!(?:enzo\.grapa\.it|news\.primenet\.com)|!207\.70\.214\.[0-9]{1,3}!|!199\.5\.259\.66!|!metro.goldwyy?n.au!/) {
		return 'NETZILLA';
	}
	# no ID possible.
	return '';
}

# Print extra information about a rejected article.
# format is:
# _ {CONTROL-CMD|-} PATH-TAIL {NNTP-POSTING-HOST|_} SBI {BODYHASH|_} {NEWSGROUPS|NEWSGROUP-COUNT|_} {SPECIAL-ORIGIN}
# new things in new narf revisions are added at the END of this.
# The fields are space-separated; embedded whitespace in values is
# converted to '_' so the line stays machine-parseable.
sub dumpxtra {
	my ($rejectlog, $msg, $res) = @_;
	my ($hash, $spec, $sbi);
	my $pathtail = $::hdr{'Path'};
	my $nntph = $::hdr{'NNTP-Posting-Host'};
	my $cntl = $::hdr{'Control'};
	my $ngs = $::hdr{'Newsgroups'};

	$sbi = &calcsbi();
	# construction begins:
	print $rejectlog "_ ", $msg;
	# control message?
	if ($cntl) {
		$cntl =~ s/\s.*$//;
		print $rejectlog ' ', $cntl;
	} else {
		print $rejectlog ' -';
	}
	# turn whitespace into an _ in $pathtail. otherwise people
	# explode horribly all over the floor. I hate people who
	# have whitespace there...
	$pathtail =~ tr / \t/_/;
	# tail of path, if long enough.
	# includes the special Netzilla exemption to smoke out the people
	# who feed them.
	$pathtail = '...' . $1
		if ($pathtail =~ /(![^!]+![^!]+!(?:207\.70\.214\.\d+|[^!]*\bnewsfeeds\.com)!.*)$/ ||
		    $pathtail =~ /((?:!+[^!]+){7})$/);
	print $rejectlog ' ', $pathtail;
	# NNTP posting host.
	$nntph = ($nntph =~ /^\S+$/) ? $nntph : '_';
	print $rejectlog ' ', $nntph;
	# What's the (S)BI of this article? More than 2 decimal points is
	# a waste of space and besides, that's how many Ed Falk uses.
	if ($sbi && (int($sbi) == $sbi)) {
		print $rejectlog ' ', $sbi;
	} elsif ($sbi) {
		printf $rejectlog ' %-.2f', $sbi;
	}
	# hello dolly. welcome to hash-land.
	$hash = &filter_bodyhash()
		if (defined &filter_bodyhash && !$cntl && $::logsigs);
	if ($hash) {
		# change spaces, if any, since we insist on the _ line
		# being parseable.
		$hash =~ tr / /_/;
		print $rejectlog ' ', $hash;
	} else {
		print $rejectlog ' _';
	}
	# log newsgroups if desired.
	if ($::logngs) {
		$ngs =~ tr/ \t//d;	# shouldn't be any, but just in case...
		print $rejectlog ' ', $ngs;
	} else {
		# since SBI looks at Followup-To:, the count of newsgroups
		# may contain non-redundant information.
		my $ngc = ($ngs =~ tr /,//) + 1;
		print $rejectlog ' ', $ngc;
	}
	# this spot is dedicated to Netzilla with love and affection and
	# the best filters we can devise. No, really.
	# it can be generalized to match on people who attempt to hide their
	# origin. The intent is that logging scripts can pull out a true
	# count for them.
	$spec = &dumpxtraid($res);
	if ($spec) {
		print $rejectlog ' ', $spec;
	} else {
		print $rejectlog ' _';
	}
	# ... and finish off.
	print $rejectlog "\n";
}

# Dump extra speshul information about new EMPs.
# All lines start with '= CLASS'. The only class now is
# 'new-emp'.
sub dumpxtranemp {
	my ($rejectlog, $msg, $res) = @_;
	my ($hash, $l);

	print $rejectlog "= ", $msg, " new-emp";
	$hash = &filter_bodyhash() if (defined &filter_bodyhash);
	if ($hash) {
		$hash =~ tr / /_/;
	} else {
		$hash = '_';
	}
	print $rejectlog ' ', $hash;
	if ($::hdr{'Sender'}) {
		print $rejectlog ' ', $::hdr{'Sender'};
	} else {
		print $rejectlog ' ', $::hdr{'From'};
	}
	print $rejectlog ' | ', $::hdr{'Subject'}, ' | ', $::hdr{'Lines'};
	# and wind up
	print $rejectlog "\n";
}

# Calculate the SBI for a given article.
# BI: square root of newsgroup count (ngc)
# BI2: (ngc + sqrt(ngc))/2
# SBI: as BI2, but using min(ngc, followup-to count)
sub calcsbi {
	my $ngc;
	my $ftc;
	$ngc = ($::hdr{'Newsgroups'} =~ tr /,//) + 1;
	if ($::hdr{'Followup-To'}) {
		$ftc = ($::hdr{'Followup-To'} =~ tr /,//) + 1;
		$ngc = $ftc if $ftc < $ngc;
	}
	return ($ngc + sqrt($ngc))/2;
}

# Dump a rejected article to a per-category file in $::DUMPDIR, if
# the filter (via filter_logname) or the LOCAL-posting rule says so.
# $aref is a reference to the article text (passed by reference to
# avoid copying potentially large articles).
# On write failure, $::DUMPDIR is unset to disable further dumping.
sub dumpbadart {
	my ($rejectlog, $msg, $res, $aref, $src) = @_;
	my $fname;
	my $alen = length($$aref);

	# do we dump it anywhere?
	$fname = &filter_logname($res, $src) if (defined &filter_logname);
	# rejected local postings are ALWAYS saved. Always always.
	return unless ($fname || ($src eq "LOCAL"));
	# rejected local postings are marked specially.
	$fname = "local" . ($fname ? '-' . $fname : '') if ($src eq "LOCAL");
	if (!open(ARTDUMP, ">>$::DUMPDIR/$fname")) {
		warn "Cannot open $::DUMPDIR/$fname: $!";
		$::DUMPDIR=();
		return;
	}
	if (!(print ARTDUMP "#! rnews ", $alen, " ", $fname, "\n", $$aref)) {
		warn "Could not write $::DUMPDIR/$fname: $!";
		$::DUMPDIR = ();
		return;
	}
	close ARTDUMP;
}

#
# Deal with a single batchfile. At this level we are shuffling around
# files; munchbatch will eat the spam itself.
# We are passed in the name of the batchfile, the output batch filehandle,
# and the report file handle, in that order. We return munchbatch's
# result (true on success, 0 on error).
# The batch filename is sans directories.
sub dobatch {
	my ($bname, $outbatch, $rejectlog) = @_;
	my ($time, $bfile, $res);

	# perform checks and open.
	$bfile = "$SRCDIR/$bname";
	if (! -f $bfile) {
		print $rejectlog "! $bfile doesn't exist or isn't a file.\n";
		return 0;
	}
	unless (open IBATCH, $bfile) {
		print $rejectlog "! can't open batchfile $bfile: $!\n";
		return 0;
	}
	# It's open, so go:
	$time = localtime;
	($::verbose > 2) && print $rejectlog ": start $bfile at $time\n";
	$res = &munchbatch(*IBATCH{IO}, $outbatch, $rejectlog, $bfile);
	# whatever happens, we want to close it.
	close IBATCH;
	$time = localtime;
	if ($res) {
		# We have succeeded. Log if verbose
		($::verbose > 2) && print $rejectlog ": end $bfile at $time\n";
	} else {
		# There has been an error. Failure has already been logged.
		($::verbose > 2) && print $rejectlog ": bad $bfile at $time\n";
	}
	return $res;
}

##
# Scan the source directory looking for work files. If found, process up
# to so many of them. How many of them is the sole argument. This is the
# major work process, and so it pays attention to $pleasestop, $hupbing,
# and so on.
# In order to gracefully handle being shut down in the middle of life,
# we don't delete the input batch files until we're sure that the output
# batch file has been committed: ie, it has been closed and renamed to the
# permanent name.
sub rundir {
	my ($max) = @_;
	my ($tm, $ent, $time, $rollover, $res, $dest);
	my (@batches, $batch, @handled);
	my ($grade, $ngrade);

	# Get to where we should be.
	chdir $SRCDIR or die "Cannot chdir to $SRCDIR: $!";
	# Begin the scan.
	opendir(BDIR, ".") or die "Cannot opendir $SRCDIR: $!";
	while (defined($ent = readdir(BDIR))) {
		# skip over bad entries.
		$ent eq "." || $ent eq ".." and next;
		# Pick up only valid CNews batchfiles
		$ent !~ /^\d[\d.]+\.t$/ and next;
		push @batches, $ent;
	}
	closedir BDIR;
	# No work to do? return immediately.
	return if (scalar(@batches) == 0);
	# To do this properly, we must now sort the list of batches
	# into lexical (aka arrival time) order.
	@batches = sort @batches;
	# Take the first N of them, if desired. We do this by
	# truncating the list.
	if ($max > 0 && $#batches >= $max) {
		my ($t) = scalar(@batches);
		($::verbose > 4) &&
			print "Have too many batches ($t), truncating to $max of them.\n";
		$#batches = $max-1;
	}
	# We now have some collection of batches.
	# In order to process them, we need to open an output batch
	# and the log.
	open LOG, ">>$LOGFILE" or die "Cannot open $LOGFILE: $!";
	select LOG; $| = 1; select STDOUT;
	# We open under a temporary safe name.
	if ($dryrun) {
		$dest = "/dev/null";	# dryrun doesn't write a real batch
	} else {
		$dest = "$DSTDIR/temp-$$";
	}
	# We always TRUNCATE partial output batches that may be left behind.
	# This can never lose articles (the input batches for a partial output
	# batch won't have been removed) and guarantees us a good batchfile.
	# A lingering destination batchfile might be in pretty much any state.
	open DBATCH, ">$dest" or die "Cannot open destination batch $dest: $!";
	# Run over each batch file.
	$time = localtime;
	($::verbose > 1) && print LOG ": starting run at $time\n";
	foreach $batch (@batches) {
		# Does this input batch have a grade?
		undef $ngrade;
		$ngrade = $1 if ($batch =~ /^(\d\.)/);
		# Does this batch have a different grade than the last batch?
		# If so, we need to force an output batch rollover now.
		$rollover++ if (defined($grade) && $grade ne $ngrade);
		# Has the destination batch file grown big enough to warrant
		# rotation? (Or any of the other reasons to roll it over.)
		# We roll at the top of the loop instead of at the bottom
		# in order to properly handle grades.
		if ($::batchsize <= (stat($dest))[7] ||
		    ($rollover && -s $dest)) {
			close DBATCH or
				die "Error closing destination batch: $dest: $!";
			$tm = time;
			($::verbose > 3) &&
				print "rolling $dest to $DSTDIR/$grade$tm.t due to large size.\n";
			if (!$dryrun) {
				rename($dest, "$DSTDIR/$grade$tm.t") or
					die "Error renaming $dest to $DSTDIR/$tm.t: $!";
			}
			# With the output batchfile flushed and properly
			# renamed, we can now purge all the handled input
			# batches.
			&delbatches(@handled);
			@handled = ();		# null the list.
			# Reopen the batchfile, of course.
			open DBATCH, ">$dest" or
				die "Cannot reopen destination batch $dest: $!";
		}
		$rollover = 0;	# Output batch rolled if necessary.
		# Set the grade of the current batch.
		$grade = $ngrade;
		# Call dobatch.
		($::verbose > 4) && print "Invoking dobatch on $batch\n";
		$res = &dobatch($batch, *DBATCH{IO}, *LOG{IO});
		# Handle success and failure.
		if ($res) {
			# Success: push the done batchfile.
			push @handled, $batch;
		} else {
			# Failure causes an immediate batchfile flush.
			$rollover++;
			# plus saving of the bad batchfile.
			($::verbose > 1) &&
				print LOG ": bad batchfile $batch at $time\n";
			($::verbose > 3) &&
				print "saving bad batchfile $SRCDIR/$batch to $BADDIR/$batch\n";
			if (!$dryrun) {
				$time = localtime;
				rename("$SRCDIR/$batch", "$BADDIR/$batch") or
					print LOG "! failed to rename $SRCDIR/$batch to $BADDIR/$batch: $!\n";
			}
		}
		# Force a flush of data to the outgoing batch file, since
		# we will soon check its size.
		# (We cannot avoid this by using '-s DBATCH'; thanks acres,
		# perl.)
		select DBATCH; $| = 1; select STDOUT;
		select DBATCH; $| = 0; select STDOUT;
		# After each batch is processed we check some important
		# cases. First: shutdown signalled?
		if ($pleasestop || -e $STOPFILE) {
			($::verbose > 1) &&
				print LOG ": ending run on signal or stopfile.\n";
			($::verbose > 3) &&
				print "Ending run due to signal or stopfile.\n";
			# Fall through the bottom of the loop.
			last;
		}
		# Do we want to redo the log file?
		if ($hupbing) {
			($::verbose > 3) &&
				print "reopening $LOGFILE from signal\n";
			close LOG or warn "Error closing $LOGFILE: $!\n";
			open LOG, ">>$LOGFILE" or die "Cannot open $LOGFILE: $!";
			select LOG; $| = 1; select STDOUT;
			$hupbing = 0;
			$rollover++;	# roll the output batch too.
		}
		# batch rollover is handled at the top of the loop, because
		# of grades.
	}
	# Our run has ended. We must close files and roll the destination
	# batch over, if it's non-zero.
	close DBATCH or die "Error closing destination batch $dest: $!";
	if (-z $dest && -f $dest) {
		# We were SO efficient at despamming we deleted EVERYTHING.
		# Remove the destination.
		($::verbose > 3) &&
			print "deleting zero-length destination batch $dest\n";
		if (!$dryrun) {
			unlink($dest) or die "Cannot unlink $dest: $!\n";
		}
	} else {
		$tm = time;
		($::verbose > 3) &&
			print "renaming $dest to $DSTDIR/$grade$tm.t\n";
		if (!$dryrun) {
			rename($dest, "$DSTDIR/$grade$tm.t") or
				die "Error renaming $dest to $DSTDIR/$tm.t: $!\n";
		}
	}
	# Having flushed and successfully renamed the output batchfile, we
	# purge the input batchfiles.
	&delbatches(@handled);
	# Close log and clean up the shop.
	$time = localtime;
	($::verbose > 1) && print LOG ": run ending at $time\n";
	close LOG or warn "Error closing $LOGFILE: $!\n";
	return;
}

# Delete handled batches.
# Remove source batchfiles that have been fully processed and flushed
# to a destination batch.  Takes a list of batch file names relative
# to $SRCDIR.  In dryrun mode nothing is removed.  Removal failures
# are logged (to LOG) but are not fatal: a leftover source batch will
# simply be reprocessed on a later pass.
sub delbatches {
	my (@handled) = @_;
	my ($t);

	while (scalar(@handled)) {
		$t = pop @handled;
		($::verbose > 3) && print "Removing handled batchfile $t\n";
		if (!$dryrun) {
			unlink "$SRCDIR/$t" or
			    print LOG "! Cannot remove source batchfile $SRCDIR/$t: $!\n";
		}
	}
}

# --------------------------------------------------------------------
# Save and reload the EMP history. This is deep friendship with the
# internals of the INN filter we use ... but it's very nice to have
# the history preserved over shutdowns.

# Load the saved EMP history from $::EMPFILE into the filter's
# %::EMP hash and @::EMP_Hist list (one key per line).  Dies if the
# history file cannot be opened; callers only invoke this when the
# file exists and is non-empty.
sub loademphist {
	($::verbose > 3) && print "Loading EMP history from $::EMPFILE.\n";
	open EMPHIST, "<$::EMPFILE" or
		die "Cannot open EMP history $::EMPFILE: $!";
	# BUGFIX: the read loop had lost its filehandle ('while ()'),
	# so it never actually read anything; read each key from EMPHIST.
	while (<EMPHIST>) {
		chomp;
		$::EMP{$_} = -1;	# -1 serves as a convenient 'loaded
					# from narf' indicator, and it's still
					# true in perl.
		push(@::EMP_Hist, $_);
	}
	close EMPHIST;	# if it errors on close, who cares?
}

# Write the filter's current EMP history out to $::EMPFILE so it can
# be reloaded on the next startup.  All failures here are warnings,
# never fatal; losing the EMP history is an inconvenience, not a
# disaster.
sub saveemphist {
	my ($k);

	# Force sync if necessary.
	&filter_sync() if (defined &filter_sync);
	# BUGFIX: 'defined %hash' / 'defined @array' are deprecated and
	# fatal errors in modern perl; a plain truth test (non-empty)
	# expresses the same intent and works everywhere.
	return unless (%::EMP && @::EMP_Hist);
	($::verbose > 3) && print "Saving EMP history to $::EMPFILE.\n";
	if (!(open EMPHIST, ">$::EMPFILE")) {
		warn "Can't open EMP history $::EMPFILE: $!";
		return;
	}
	foreach $k (@::EMP_Hist) {
		if (!(print EMPHIST $k, "\n")) {
			warn "Can't save an EMP key: $!";
			close EMPHIST;
			return;
		}
	}
	# (typo fix: 'historry' -> 'history')
	close EMPHIST or warn "Can't close EMP history $::EMPFILE: $!";
}

## --------------------------------------------
# reset anything as necessary or useful after a filter reload.
# This is also run when we start up.
sub postload_hook {
	# maxartcopy is the lower of $::MAXARTCOPY and $::MaxArtSize if both
	# are defined, or whichever one of them is defined, or isn't defined.
	if ($::MAXARTCOPY && $::MaxArtSize) {
		$maxartcopy = ($::MAXARTCOPY < $::MaxArtSize) ?
			$::MAXARTCOPY : $::MaxArtSize;
	} elsif ($::MaxArtSize) {
		$maxartcopy = $::MaxArtSize;
	} elsif ($::MAXARTCOPY) {
		# filter has set no max but the configuror claims to know
		# better.
		$maxartcopy = $::MAXARTCOPY;
	} else {
		# The filter asserts that every byte is sacred and the
		# installer has not overridden it. Believe it.
		$maxartcopy = 0;
	}
	($::verbose > 3) && do {
		print "MAXARTCOPY ",
		    $maxartcopy ? "wired to $maxartcopy\n" : "off\n";
	};
}

## --------------------------------------------
# Main code.
# There actually isn't much here.
# I guess I did all the work earlier.

# Random geegaws assembled with a chainsaw.
use Getopt::Std;
getopts('V:vDc:m:b:s:d:e:l:w:OP:C:S:E:Xq:N') || &usage(0);

# Handle entirely too many options.
# odd || usage is to shut up perl -w ("used only once" warnings).
$dryrun ++ if ($::opt_D || $::opt_D);
$::verbose ++ if ($::opt_v || $::opt_v);
$runonce++ if ($::opt_O || $::opt_O);
$::xtra = !$::xtra if ($::opt_X || $::opt_X);
$::logngs = !$::logngs if ($::opt_N || $::opt_N);
$::verbose = $::opt_V if (defined $::opt_V);
$::maxbatches = $::opt_c if ($::opt_c);
$::MAXARTSIZE = $::opt_m if (defined $::opt_m);
$::batchsize = $::opt_b if ($::opt_b);
$SRCDIR = $::opt_s if ($::opt_s);
$DSTDIR = $::opt_d if ($::opt_d);
$BADDIR = $::opt_e if ($::opt_e);
$LOGFILE = $::opt_l if ($::opt_l);
$::waitint = $::opt_w if ($::opt_w);
$PIDFILE = $::opt_P if ($::opt_P);
$CONFFILE = $::opt_C if ($::opt_C);
$STOPFILE = $::opt_S if ($::opt_S);
$::EMPFILE = $::opt_E if ($::opt_E);
$::DUMPDIR = $::opt_q if ($::opt_q);

# Enforce that we have, like, no command line arguments. Really.
@ARGV != 0 && usage(0);

# This may not be required, but it's sane.
warn "$0: Warning: -D specified without -O, this may not behave well.\n"
	if ($dryrun && !$runonce);
warn "$0: PID filename '$PIDFILE' not absolute, we hope you know what you're doing.\n"
	unless ($PIDFILE =~ /^\//);

# Insist on a few basic sane things, like the directories existing.
# Probe the configured directories by chdir'ing into each one; any
# missing or unreachable directory is fatal at startup rather than
# mid-run.  NOTE(review): relative directory names would compound
# across these chdirs -- presumably all four are absolute; confirm.
chdir $DSTDIR or
	die "Cannot chdir to batch destination directory $DSTDIR: $!\n";
chdir $BADDIR or
	die "Cannot chdir to bad batch directory $BADDIR: $!\n";
chdir $SRCDIR or
	die "Cannot chdir to batch source directory $SRCDIR: $!\n";
if ($::DUMPDIR) {
	chdir $::DUMPDIR or
		die "Cannot chdir to rejected article directory $::DUMPDIR: $!\n";
} else {
	warn "DUMPDIR not configured; rejected local posts will not be saved.\n";
}

# Sanity checks:
die "Wait interval must be > 0" unless ($::waitint > 0);
die "Batch size must be > 0" unless ($::batchsize > 0);

# Save our process ID for other people to use.
# (Hopefully it's an absolute file name.)
open PID, ">$PIDFILE" or die "Cannot open PID file $PIDFILE: $!";
print PID $$, "\n" or die "Cannot write PID to $PIDFILE: $!";
close PID or die "Cannot close PID file $PIDFILE: $!";

# hi there, fans and fanatics
($::verbose > 3) && print "narf version $::NARFVERSION starting at " .
	localtime, "\n";

my ($omt, $oct, $mt, $ct);	# what I put up with...

# Save the mod time of the filtering script so we know when to reload it.
$omt = (stat("$FILTDIR/filter_innd.pl"))[9];
# Ditto for the 'conf' file.
$oct = (stat($CONFFILE))[9] if (-e $CONFFILE);
# ... and load, if present.
do $CONFFILE if (-e $CONFFILE);

# Initialize filtering.
&filter_mode();		# takes arguments?
&postload_hook();

# Reload our EMP history if we have one.
if (-s $::EMPFILE) {
	&loademphist();
}
# Are we going to be able to save to it?  Probe by opening for append
# (which creates but never truncates).
# BUGFIX: only close the handle when the open actually succeeded; the
# original unconditionally closed EMPHIST even after a failed open.
if (open EMPHIST, ">>$::EMPFILE") {
	close EMPHIST;
} else {
	($::verbose > 3) &&
		print "EMP history file $::EMPFILE unwriteable, no saves available: $!\n";
	undef $::EMPFILE;	# disables all later saveemphist calls.
}

# Now, run forever.
while (!$pleasestop) {
	# Is the filter stale? If so, reload.
	$mt = (stat("$FILTDIR/filter_innd.pl"))[9];
	if ($omt != $mt && -f "$FILTDIR/filter_innd.pl") {
		my $time = localtime;
		($::verbose > 3) &&
			print "Reloading $FILTDIR/filter_innd.pl.\n";
		# Attempt to trap errors: retry until the filter both
		# loads cleanly and defines the entry points we need.
		while (!(do "$FILTDIR/filter_innd.pl") ||
		       !defined &filter_mode || !defined &filter_art) {
			warn "Error reloading filter at $time";
			($::verbose > 3) &&
				print "Filter load error, retrying after $::waitint seconds.\n";
			sleep $::waitint;	# wait, try again.
		};
		&filter_mode();		# takes arguments?
		&postload_hook();
		$omt = $mt;
	}
	# By contrast, we could hardly care less if the conf file exists
	# or loads with errors.
	if (-e $CONFFILE) {
		$ct = (stat($CONFFILE))[9];
		if ($ct != $oct) {
			($::verbose > 3) &&
				print "Loading conf file $CONFFILE.\n";
			do $CONFFILE;
			$oct = $ct;
			# oh, okay, we're nice...
			&postload_hook();
		}
	} else {
		$oct = 0;	# no file forces reload if it reappears
	}

	# Do a run.
	# ... but only if the stop file doesn't exist.
	if (! -e $STOPFILE) {
		($::verbose > 4) &&
			print "Scanning, maxbatches $::maxbatches\n";
		&rundir($::maxbatches);
	} else {
		($::verbose > 4) &&
			print "Stopped due to $STOPFILE existing.\n";
	}
	# A signal asked us to checkpoint the EMP history.
	if ($savestate && $::EMPFILE) {
		($::verbose > 3) &&
			print "Saving EMP history due to signal.\n";
		&saveemphist();
		$savestate = 0;
	}
	# break out if desired.
	last if ($runonce || $pleasestop);
	# Sleep for a while.
	($::verbose > 4) &&
		print "Going to sleep for $::waitint seconds\n";
	sleep $::waitint;
	# repeat forever.
}

# XXX: consistency check for memory leaks.
&consistchk if (defined &consistchk);

# Save the EMP history to date. We know there are no newlines in the
# keys.
if ($::EMPFILE) {
	($::verbose > 3) && print "Saving EMP history on exit\n";
	&saveemphist();
}
($::verbose > 3) && print "Exiting due to shutdown signal.\n";
exit 0;