pjs/webtools/newsbot/newsbot.pl

296 строки
8.7 KiB
Perl
Исходник Обычный вид История

1999-03-20 11:19:28 +03:00
#!/usr/bonsaitools/bin/perl -w
# -*- Mode: perl; indent-tabs-mode: nil -*-
#
1999-11-02 02:33:56 +03:00
# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
1999-03-20 11:19:28 +03:00
# The Original Code is NewsBot
1999-11-02 02:33:56 +03:00
#
1999-03-20 11:19:28 +03:00
# The Initial Developer of the Original Code is Netscape Communications
1999-11-02 02:33:56 +03:00
# Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#
1999-03-20 11:19:28 +03:00
# Contributor(s): Dawn Endico <endico@mozilla.org>
# Harvest pointers to news articles and their summaries from mailbox file
# and write html and rdf files from it.
1999-03-22 03:55:53 +03:00
#
# usage: newsbot [mailfile1, mailfile2...] [rdffile]
# each mail file is standard mbox format such as a sendmail spool file.
# rdffile is where to put the generated rdf.
# output is written to standard output.
1999-03-20 11:19:28 +03:00
require 5.00397;
use strict;
use Mail::Folder::Mbox;
use Mail::Address;
1999-03-22 03:55:53 +03:00
my $rdffile = pop (@ARGV); #name of file to write rdf data
1999-03-20 11:19:28 +03:00
unless (@ARGV) {
# command line argument should be list of mail files to
# process. If none given, use this file.
my $mailfile = "/var/mail/newsbot";
die("No mail\n") if (!-f $mailfile);
push(@ARGV, $mailfile);
}
for my $file (@ARGV) {
my $folder = new Mail::Folder('AUTODETECT', $file);
unless ($folder) {
warn("can't open $folder: $!");
next;
}
printheader();
1999-03-22 03:55:53 +03:00
my %articlehash;
1999-03-20 11:19:28 +03:00
my @articles;
my $index=0;
for my $msg (sort { $a <=> $b } $folder->message_list) {
my $entity = $folder->get_mime_message($msg);
my $submitter = $entity->get('From'); chomp($submitter);
$submitter =~ s/&/&amp;/g;
1999-03-20 11:19:28 +03:00
$submitter =~ s/</&lt;/g;
$submitter =~ s/>/&gt;/g;
my $submitdate = $entity->get('Date'); chomp($submitdate);
#
# This is important for weeding out junk. Submissions must be multipart
# mime messages as created by Messenger when forwarding a news article.
# the first part is text/html or text/plain. The second part should be
# type message/rfc822. This format preserves the header information of
# the original news article (especially the Message-ID). It also makes
# it more difficult for random junk mailed to newsbot to litter the
# web page. We don't want someone to post an article, cc newsbot and
# have a big thread begin where the rest of the messages continue
# being cc'd to newsbot.
#
# 99-05-27: allow message/news in addition to message/rfc822 for the
# second part as this is what communicator 3.x uses. -endico
1999-03-20 11:19:28 +03:00
if ($entity->parts < 2) {
next;
}
my @parts = $entity->parts;
if ( !($parts[0]->head->mime_type =~ /text\/html/) &&
!($parts[0]->head->mime_type =~ /text\/plain/) ) {
next;
}
if ( !( ($parts[1]->head->mime_type =~ /message\/rfc822/) ||
1999-05-28 03:38:06 +04:00
($parts[1]->head->mime_type =~ /message\/news/) )
) {
1999-03-20 11:19:28 +03:00
next;
}
my $IO;
my $summary = "";
if ($IO = $entity->parts(0)->open("r")) {
$summary = $summary . $_ while (defined($_ = $IO->getline));
$IO->close;
if ( $entity->parts(0)->head->mime_type =~ /text\/plain/ ) {
# line beginning with -- is a signature seperator. Delete the sig
$summary =~ s/^--.*//ms;
$summary =~ s/&/&amp;/g;
$summary =~ s/</&lt;/mg;
$summary =~ s/>/&gt;/mg;
1999-03-20 11:19:28 +03:00
$summary =~ s/(http:\/\/([\S])+)/<A HREF=\"$1\">$1<\/A>/mg;
$summary =~ s/(ftp:\/\/([\S])+)/<A HREF=\"$1\">$1<\/A>/mg;
$summary =~ s/&lt;(([\S])+@([\S])+)&gt;/&lt;<A HREF=\"mailto:$1\">$1<\/A>&gt;/mg;
}
}
my $news = "";
if ($IO = $entity->parts(1)->open("r")) {
$news = $news . $_ while (defined($_ = $IO->getline));
$IO->close;
}
# check to make sure this is a news article. If not, skip it.
$news =~ /^Newsgroups: ([^\n]+)/m ;
my $newsgroups = $1;
if (!$newsgroups) {
next;
}
$newsgroups =~ s/(netscape.public.mozilla.([\w-.])+)/\n<A HREF="news:\/\/news.mozilla.org\/$1\">\n $1<\/A>/g;
1999-03-20 11:19:28 +03:00
$news =~ /^Message-ID: <([^>]+)/m;
my $MID = $1;
$news =~ /^From: ([^\n]+)/m;
my $from = $1;
$news =~ /^Subject: ([^\n]+)/m;
my $subject = $1;
$subject =~ s/^Re://ig;
$subject =~ s/^Fwd://ig;
$subject =~ s/&/&amp;/g;
$subject =~ s/</&lt;/g;
$subject =~ s/>/&gt;/g;
1999-03-20 11:19:28 +03:00
$news =~ /^Date: ([^\n]+)/m;
my $date = $1;
my %article;
if (! %articlehash->{"$MID"}) {
%articlehash->{"$MID"}=\%article;
$index += 1;
$articles[$index]=\%article;
}
%article->{'Message-ID'} = $MID;
%article->{'Subject'} = $subject;
%article->{'Date'} = $date;
%article->{'From'} = $from;
%article->{'Newsgroups'} = $newsgroups;
%article->{'Summary'} = $summary;
%article->{'submitter'} = $submitter;
%article->{'submitdate'} = $submitdate;
$entity->purge;
} #for loop
$folder->close;
for (my $i=$index; $i > 0 ; $i--) {
printarticle ($articles[$i]);
}
printfooter();
1999-03-22 03:55:53 +03:00
if ($rdffile) {
printrdf (\@articles, $rdffile);
}
1999-03-20 11:19:28 +03:00
}
1999-03-22 03:55:53 +03:00
sub printrdf() {
my ($ref, $rdffile) = @_;
my @articles = @{$ref};
unless (open (RDFFILE,">$rdffile") ){
die "Couldn\'t open rdf file:\"$rdffile\"\n";
}
select RDFFILE;
my $header =<<'RDFHEAD';
1999-03-23 10:41:19 +03:00
<?xml version="1.0"?>
1999-03-23 10:54:15 +03:00
<rdf:RDF
1999-03-22 03:55:53 +03:00
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
1999-03-23 10:54:15 +03:00
xmlns="http://my.netscape.com/rdf/simple/0.9/">
1999-03-22 03:55:53 +03:00
<channel>
<title>Mozilla NewsBot</title>
1999-03-22 03:55:53 +03:00
<link>http://www.mozilla.org/newsbot/</link>
<description>Pointers to the hottest mozilla newsgroup threads.</description>
1999-03-22 03:55:53 +03:00
</channel>
<image>
<title>Mozilla</title>
1999-03-22 12:46:05 +03:00
<url>http://www.mozilla.org/images/hack.gif</url>
<link>http://www.mozilla.org/newsbot/</link>
1999-03-22 03:55:53 +03:00
</image>
RDFHEAD
print $header;
my $index = @articles - 1;
# only print newest 15 articles
my $min = 0;
if ($index > 15) {
$min = $index - 15;
}
for (my $i=$index; $i > $min ; $i--) {
1999-03-22 03:55:53 +03:00
print (" <item>\n");
print (" <title>" . $articles[$i]->{'Subject'} . "</title>\n");
print (" <link>http://http://groups.google.com/groups?q=msgid:" . $articles[$i]->{'Message-ID'} . "&amp;ic=1</link>\n");
1999-03-22 03:55:53 +03:00
print (" </item>\n\n");
}
1999-03-23 10:58:12 +03:00
print "</rdf:RDF>\n";
1999-03-22 03:55:53 +03:00
} #end printrdf()
1999-03-20 11:19:28 +03:00
sub printarticle() {
my ($artref) = @_;
my %article = %{$artref};
print "\n<P>\n";
print "<A NAME=\"" . %article->{'Message-ID'} . "\"></A>\n";
1999-03-20 11:19:28 +03:00
print "<TABLE border=0 width=100%><TR><TD><B><FONT SIZE=+1>\n";
print %article->{'Subject'} ."\n";
print "</B></FONT>\n";
print "</TD></TR><TR><TD>\n";
print %article->{'Summary'};
1999-10-22 04:14:43 +04:00
print "</TD></TR><TR><TD ALIGN=\"right\">\n";
1999-03-20 11:19:28 +03:00
print "<FONT SIZE=-1>\nPosted: " . %article->{'Date'} ."\n</FONT>";
print "<BR>";
print %article->{'Newsgroups'} . "\n";
print "<BR>\n";
print "<A HREF=\"http://groups.google.com/groups?q=msgid:" . %article->{'Message-ID'} . "&amp;ic=1\">\n";
print "View Article</A>\n";
1999-03-20 11:19:28 +03:00
print "<!--Submitted to NewsBot by: " . %article->{'submitter'} . "-->\n";
print "<!--" . %article->{'submitdate'} . "-->\n";
print "</TD></TR>\n";
print "</TABLE>\n";
}
sub printheader() {
my $header =<<'ENDHEAD';
<HTML>
<HEAD>
<TITLE>newsbot</TITLE>
</HEAD>
<BODY>
<H1>newsbot</H1>
Since not everyone has a chance to keep up with all the mozilla
<A HREF="http://www.mozilla.org/community.html">news groups</A>,
newsbot is here to collect pointers to some of the more important
announcements, discussions, and goings-<WBR>on.
<P>
When you see an article of interest to the general mozilla community
forward it to <A HREF="mailto:newsbot@mozilla.org">newsbot@mozilla.org</A>
and write a summary of the article. Newsbot will add your summary to this
1999-03-22 03:55:53 +03:00
page and make pointers back to the original article and its thread in DejaNews.
For My Netcape users we also have a
<A HREF="http://www.mozilla.org/my-mozilla.html">channel</A>
for newsbot.
1999-03-20 11:19:28 +03:00
<BLOCKQUOTE><FONT SIZE=-1>
For Netscape Communicator users, this means pressing the <I>Forward</I>
button and writing a summary in the message window. (Forwarding as "quoted"
1999-03-20 11:19:28 +03:00
or "inline" confuses newsbot. Be sure to forward as attachment.) For users
of other clients, the forwarded message should be a multipart MIME message
where the first part is text/plain or text/html and contains your summary,
and the second part is type message/rfc822 and contains the news article.
</FONT></BLOCKQUOTE>
<P>
ENDHEAD
print $header
}
sub printfooter() {
my $footer =<<'ENDFOOT';
<P>
<FONT SIZE=-1>
Send newsbot feedback
to <A HREF="mailto:endico@mozilla.org">Dawn Endico</a>.
</FONT>
</BODY>
</HTML>
ENDFOOT
print $footer;
}