As some of you know, I use Hamster, a free news server with many other features. I also use Hamster to download all postings from various groups on Yahoo. This is great when a group has a lot of traffic and there is much I would like to keep. It fetches the postings and puts them neatly into a local newsgroup here on my computer, so that I can use my normal news reader to read everything. It sorts them by subject and so on, and I can use the much more pleasant search of my news reader. If you have ever tried searching for a post on groups.yahoo, you know what I mean.
Here is the Perl script I use to download into my news server:
#!/usr/local/bin/perl
# $Header: E:/RCS/yahoo.pl 1.4 2001/05/16 21:14:34 prupe Exp prupe $
use HTML::Entities;
use HTTP::Request;
use HTTP::Headers;
use LWP::UserAgent;
use Win32::OLE;
# logf FORMAT, ARGS...
#
# Debug logger.  Silent unless $opt{debug} is set; otherwise writes the
# prefix "---Main: ", the printf-formatted message and a newline to STDERR.
sub logf (@)
{
    return unless $opt{debug};

    my @message = @_;    # printf format followed by its arguments
    print STDERR '---Main: ';
    printf STDERR @message;
    print STDERR "\n";
}
# get_url URL
#
# Issues a single HTTP GET for URL and returns whatever
# LWP::UserAgent->request returns (the caller checks ->is_success on it).
#
# The request carries hand-built headers: the $USERAGENT string, a Referer
# and Host derived from the host part of URL, and the login $COOKIE.  Both
# $USERAGENT and $COOKIE are package globals read here, not parameters.
# When $opt{debug} is set the header block is printed to STDOUT first.
sub get_url ($)
{
    my ($url) = @_;

    # Reduce the URL to its host part: drop the "scheme://" prefix, then
    # everything from the first "/" onwards.  The class has to be \w; a bare
    # "w" never matches "http", which left $host as "http:".
    (my $host = $url) =~ s{^\w+://}{};
    $host =~ s{/.*}{};

    my $h = HTTP::Headers->new;
    $h->user_agent($USERAGENT);
    $h->referer("http://$host/");
    $h->header('Host'            => $host);
    $h->header('Connection'      => 'close');
    $h->header('Accept'          => '*/*');
    $h->header('Accept-Language' => 'en');    # was misspelled "Accept-Lanugage"
    $h->header('Cookie'          => $COOKIE);
    print($h->as_string) if $opt{debug};

    my $req = HTTP::Request->new('GET' => $url, $h);
    my $ua  = LWP::UserAgent->new;
    return $ua->request($req);
}
# Command line:  perl yahoo.pl [-l] [<starting num> [<ending num>]]
#   -l   enable debug logging ($opt{debug})
# The first remaining argument becomes $STARTINGNUM, the next $ENDINGNUM;
# both are forced to numbers with "0 +".
%opt = ( );
foreach my $arg (@ARGV)
{
    if ($arg eq "-l")
    {
        $opt{debug} = 1;
        next;
    }
    if (!$STARTINGNUM)
    {
        $STARTINGNUM = 0 + $arg;
        next;
    }
    if (!$ENDINGNUM)
    {
        $ENDINGNUM = 0 + $arg;
        next;
    }
}

# User-Agent string sent with every request.
$USERAGENT = 'SpaceBison/0.01 [fu] (Win67; X; ShonenKnife)';

# sprintf template for a message page; %d receives the message number.
# This must be one unbroken line: a line break inside the quotes becomes
# part of the URL and of the generated X-Source-URL header.
$URLPATTERN = 'http://groups.yahoo.com/group/prox-list/message/%d?source=1';

# Defaults used when no message numbers are given on the command line.
$STARTINGNUM = 1 if (!$STARTINGNUM);
$ENDINGNUM = 6436 if (!$ENDINGNUM);
# !!!! IMPORTANT !!!!
#
# This script uses several Perl 5 modules. Make sure they are in your Perl
# library path. SET PERL5LIB=E:\PERL\LIB;E:\PERL\LIB\POD works for me.
# (NOTE: the backslashes in that path were lost in transcription and have
# been restored by guesswork; use the directories of your own installation.)
# After that, the command
#   perl yahoo.pl <starting num> <ending num>
# should work. Messages that Hamster fails to import for some reason will be
# dumped to failed.txt in the current directory.
#
# Edit the following lines before running this script or it will not work.
# $COOKIE: Use your Yahoo login cookie here. To find this go to
#   http://groups.yahoo.com/group/prox-list/ with Prox's log window open.
#   Look for the line in the outgoing headers that starts with "Cookie:".
#   Copy the entire line except for the text "Cookie: " to the variable
#   assignment below.
# $NEWSGROUP: Whatever newsgroup in Hamster you want to import the messages
#   into. I use pr.proxomitron.archive, but this is arbitrary.
$COOKIE = 'HERE';
$NEWSGROUP = 'pr.proxomitron.archive';
# Tolerate a cookie that was pasted with its "Cookie:" header name attached.
# (\s* strips the blanks after the colon; a bare "s*" would only strip
# literal letters "s".)
$COOKIE =~ s/^Cookie:\s*//;

# Refuse to run without a real cookie.  'HERE' is the placeholder value the
# script ships with, so it counts as "not set" as well.
if (!$COOKIE || $COOKIE eq 'HERE')
{
    # \$COOKIE is escaped so the message names the variable instead of
    # interpolating its (empty) value.
    print "You must set the variable \$COOKIE in $0\n";
    print "See the section labeled IMPORTANT\n";
    exit 1;
}
# Host part of the archive URL; used later for the Xref and Path headers and
# for completing truncated addresses.  Strip the "scheme://" prefix (leading
# whitespace tolerated), then everything from the first "/" onwards.  The
# class has to be \w: a bare "w" never matches "http".
$host = $URLPATTERN;
$host =~ s,^\s*\w+://,,;
$host =~ s,/.*,,;

# Obtain Hamster's OLE automation object; nothing can be imported without it.
$hamster = Win32::OLE->new("Hamster.App");
if (!$hamster)
{
    print "Could not connect to Hamster. Ensure that it is running.\n";
    exit 1;
}
# Main loop: for every message number in the requested range, fetch the
# message page, cut the raw message out of its <pre> element, turn it into a
# news article and hand it to Hamster.  Articles Hamster rejects are appended
# to failed.txt in the current directory.
for my $n ($STARTINGNUM .. $ENDINGNUM)
{
    my $url = sprintf $URLPATTERN, $n;
    print "Getting message #$n\n";
    my $res = get_url($url);
    if (!$res->is_success)
    {
        print STDERR $res->error_as_HTML;
        next;
    }
    my $txt = $res->content;
    logf('Read %d bytes', length $txt);
    logf('');

    # Keep only what lies between the first <pre> and the following </pre>.
    my @parts = split /<pre>/, $txt;
    if (!$parts[1])
    {
        print STDERR "---<pre> match failed (#$n) Either this message no longer exists or the login cookie is incorrect.\n";
        next;
    }
    $txt = $parts[1];
    @parts = split m{</pre>}, $txt;
    if (!$parts[1])
    {
        print STDERR "---</pre> match failed (#$n)\n";
        next;
    }
    $txt = $parts[0];

    # Back to plain text: drop remaining markup, decode entities in place,
    # normalise line ends to bare LF.  The pattern must be an explicit \r:
    # an empty pattern would silently reuse the last successful regex.
    $txt =~ s/<[^>]*>//g;
    HTML::Entities::decode_entities($txt);
    $txt =~ s/\r//g;

    # Remove a first line that starts with "From" (mbox-style envelope line).
    # NOTE(review): this also matches a leading "From:" header - confirm the
    # page always starts with the envelope line.
    $txt =~ s/^From[^\n]+\n//g;

    # Headers end at the first blank line.  Skip the message if there is
    # none, rather than reusing header/body text of an earlier message.
    my $blank = index $txt, "\n\n";
    if ($blank < 0)
    {
        print STDERR "---no blank line between header and body (#$n)\n";
        next;
    }
    my $hdr = substr $txt, 0, $blank + 1;    # keeps the final "\n"
    my $bdy = substr $txt, $blank + 2;

    # Replace a literal "...>" in the headers with ".<host>>".
    # NOTE(review): presumably Yahoo truncates addresses this way - confirm.
    $hdr =~ s/\.\.\.>/.${host}>/g;

    # Drop the list tag "[prox]" / "[prox-list]" from the Subject line.
    $hdr =~ s/\nSubject:([^\n]*)\[prox(?:-list)?\]\s*([^\n]+)\n/\nSubject:$1$2\n/;

    # Headers for the local import.
    $hdr .= "Newsgroups: $NEWSGROUP\n";
    $hdr .= "Xref: $host ${NEWSGROUP}:$n\n";
    $hdr .= "Path: $host!not-for-mail\n";
    $hdr .= "X-Source-URL: $url\n";
    my $line_count = ($bdy =~ tr/\n//);     # count newlines, changes nothing
    $hdr .= sprintf "Lines: %d\n", $line_count;

    # Assemble the article and convert line ends to CRLF for the import.
    (my $art = "$hdr\n$bdy") =~ s/\n/\r\n/g;

    if (!$hamster->NewsImport($art, "", 0, 0))
    {
        print STDERR "---NewsImport failed\n";
        if (open my $out, '>>', 'failed.txt')
        {
            syswrite $out, $art, length $art;
            close $out;
        }
        else
        {
            print STDERR "---cannot append to failed.txt: $!\n";
        }
    }
}
# Release the OLE object.
undef $hamster;
Best wishes
Arne
Imici username: Arne