#!/usr/bin/perl
# Ija - Access deja.com from command line.
# Copyright 1999 Asher Blum - licensed under the GPL.
# based on Randal Schwarz's alta script
# Release identifier: sent as part of the User-Agent string and compared by
# the 'v' command against the version string fetched from the project site.
my $VERSION='0.4.15'; # 13-Aug-2003 - adapt to a google change in results
# Hint appended to the fatal error messages raised when a fetch fails or a
# downloaded page cannot be parsed.
my $fail = "\nTry the 'v' command to see if the version is current.\n";
use strict;
use LWP::UserAgent;
use URI::Escape; # Escape a string into a URL query
# File-scoped state shared by the output formats, the command loop and the
# subroutines. $ref, $date, $author, $group and $subject double as the fields
# of the row currently being written through the STDOUT format.
my ( $date,
     $subject,
     $group,
     $author,
     $ref,           # ref no. (0-n) of current article
     $offset,        # ref no. of first article in current list
     $chunk_size,    # how many articles per screen (must match DN)
     $verbose,       # print diagnostic messages?
     $start_url,
     $next_msg_url,  # url to get next list of articles
     $prev_msg_url,  # url to get previous list of articles
     $page,          # the current html document
     $page_thresh,   # msg size which triggers pager
     $pager,         # program, e.g. 'less'
     $total_hits,    # no. hits, including (exactly|about)
     @result,        # current results list, \0 separated
     @article_no,    # AN number (big) keyed to ref no. (small)
     $QBASE,         # base url for query
     $query_start,   # first part of query
     $query_end,     # third part of query (after meat)
     $aquery_end,    # tail of an author-search query; the name is appended
);

$| = 1;    # view output incrementally

# Defaults, used unless a usable terminal height is found in the environment.
$chunk_size  = 25;
$page_thresh = 24;

# When LINES is exported, size the listing to the terminal: four lines are
# reserved (three header lines plus the "Results ..." footer). Heights of 120
# or more are only trusted when TERM looks like an xterm.
my $lines = $ENV{ LINES };
if ($lines > 5 && ($lines < 120 || $ENV{ TERM } =~ /xterm/)) {
    $chunk_size  = $lines - 4;
    $page_thresh = $lines;
    # print"CS = $chunk_size pt = $page_thresh\n";
}

$pager = $ENV{PAGER} || 'less';
$= = $chunk_size + 3;    # make FORMAT work correctly
$verbose = 0;

$QBASE       = 'http://groups.google.com/';
$query_start = 'groups?q=';

# The assignment below replaced an earlier, longer suffix that was still being
# stored and then immediately overwritten. It is kept here for reference only:
#   "&num=$chunk_size&btnG=Google+Search&as_oq=&as_epq=&as_eq=&as_ugroup=&as_usubject=&as_uauthors=+&as_umsgid=&lr="
$query_end = "&btnG=Search&meta=site%3Dgroups&num=$chunk_size";

# author search:
# NOTE(review): the literal "as_uauthors=+abigail" looks like a leftover test
# value; the requested author is appended after the final "as_uauthors=+".
# Confirm against the search service before changing it.
$aquery_end = "&num=$chunk_size&btnG=Google+Search&as_oq=&as_epq=&as_eq=&as_ugroup=&as_usubject=&as_uauthors=+abigail&as_umsgid=&lr=&as_uauthors=+";
# Page header for the results listing. It occupies three lines, which is why
# the page length ($=) is kept at $chunk_size+3: one header plus one full
# chunk of rows makes exactly one page.
format STDOUT_TOP =
Search Results
Ref Date From Newsgroup Subject
--------------------------------------------------------------------------------
.
# One row of the results listing. The fields are read from the file-scoped
# variables that show_results fills in before each call to write.
format STDOUT =
@## @>>>>>>>>>>> @<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<
$ref,$date, $author, $group, $subject
.
# Optional first command-line argument: a query to run before the command
# loop starts. Any remaining arguments are left in @ARGV.
my $first_target = shift @ARGV;

# Single user agent shared by every request made through fetch().
my $ua = LWP::UserAgent->new;
$ua->agent("ija/$VERSION");
if ($ENV{ HTTP_PROXY }) {
    $ua->env_proxy();
}

if ($first_target) {
    my $escaped = uri_escape($first_target);
    $verbose && print"query=$escaped\n";
    if ($escaped =~ /./) {
        $start_url = join '', $QBASE, $query_start, $escaped, $query_end;
        get_results($start_url);
    }
    # The listing is shown from ref 1 whether or not a search was issued.
    $offset = 1;
    show_results($offset);
}
# Main Loop:
# Read commands from user, one per line, and dispatch on the leading
# character(s). The 'h' command prints the command summary.
#
# Failures of the purely local or auxiliary commands (save to file, print via
# lpr, version check) are reported and the loop continues; errors raised by
# searching or loading an article still terminate the program.
while (<>) {
    chomp;
    if (/^n/ && $next_msg_url =~ /./) {
        # Next page of results (only when the last page had a "Next" link).
        get_results($next_msg_url);
        $offset += $chunk_size;
        show_results($offset);
    }
    elsif (/^p/ && $prev_msg_url =~ /./) {
        # Previous page of results (only when a "Previous" link was seen).
        get_results($prev_msg_url);
        $offset -= $chunk_size;
        show_results($offset);
    }
    elsif (/^l/) {
        show_results($offset);
    }
    elsif (/^m\s*(\d*)/) {
        if (!($1)) {
            print"Max hits = $chunk_size\n";
        }
        elsif ($1 > 100) {
            print"Cannot increase max hits above 100.\n";
        }
        else {
            # Copy $1 first: the substitutions below reset the capture buffers.
            $chunk_size = $1;
            $query_end  =~ s/num=\d+/num=$chunk_size/;
            $aquery_end =~ s/num=\d+/num=$chunk_size/;
            $= = $chunk_size + 3;    # make FORMAT work correctly
        }
    }
    elsif (/^s\s*(.+)/) {
        # Append the current article to a file. Three-argument open keeps the
        # user-supplied name from being read as part of the open mode (e.g.
        # "&STDOUT"); trailing blanks are trimmed as two-argument open did.
        (my $file = $1) =~ s/\s+\z//;
        if (open(my $out, '>>', $file)) {
            print {$out} $page;
            if (close $out) {
                print"Wrote ", length($page), " bytes to $file.\n";
            }
            else {
                warn "Can't write to $file: $!\n";
            }
        }
        else {
            warn "Can't open $file for output: $!\n";
        }
    }
    elsif (/^t\s*(\d*)/) {
        if (!length($1)) {
            print"Page threshold = $page_thresh. ".
                ($page_thresh ? "Set it to 0 to disable paging.\n" : "Paging disabled.\n");
        }
        else {
            $page_thresh = $1;
        }
    }
    elsif (/^r/) {
        # Send the current article to the printer.
        if (open(my $lpr, '|-', 'lpr')) {
            print {$lpr} $page;
            close $lpr or warn "lpr did not complete successfully.\n";
        }
        else {
            warn "Can't open pipe to lpr: $!\n";
        }
    }
    elsif (/^f\s*(.+)/) {
        $start_url = "$QBASE$query_start" . uri_escape($1) . "$query_end";
        get_results($start_url);
        $offset = 1;
        show_results($offset);
    }
    elsif (/^X/) {
        # Ask the xterm to resize itself to fit one chunk of results.
        my $new_height = $chunk_size + 4;
        print $ENV{ TERM } eq 'xterm' ?
            chr(27) . "[8;$new_height;80t" : "Can't: TERM != xterm.\n";
        show_results($offset);
    }
    elsif (/^[H\?h]/) {
        print <<EOH;
a [<name>] Author search.
f <query> Search for a new query string.
h Print help message.
l List current group of messages.
m <1-100> Set max hits returned. (25 normally)
n List next group of messages.
p List previous group of messages.
q Quit.
r Print the current message.
s <file> Save the current message to a file.
t <number> Set the paging threshold - longer articles will be paged.
v Version - check if there's a newer version.
X Size your xterm for the current max hits
<ref> Enter a reference number to view a message.
EOH
    }
    elsif (/^a\s*(.*)/) {
        $start_url = "$QBASE$query_start$aquery_end" . uri_escape($1);
        get_results($start_url);
        $offset = 1;
        show_results($offset);
    }
    elsif (/^(\d{1,3})/) {
        # A bare number selects an article from the current listing.
        $ref = $1;
        if ($article_no[$ref] !~ /./) {
            print"Invalid number\n";
        }
        else {
            load_article($article_no[$ref]);
        }
    }
    elsif (/^q/) {
        exit(0);
    }
    elsif (/^v/) {
        print "Ija version $VERSION - ";
        # The version check is auxiliary: an unreachable host must not end
        # the session, so the fetch error is trapped here.
        my $c_ver = eval { fetch('http://wildspark.com/asher/ija/version') };
        if (!defined $c_ver) {
            print "unable to check for a newer version.\n";
            $verbose && print $@;
        }
        else {
            $c_ver =~ s/\s+\z//;    # drop trailing newlines / CRs
            print $VERSION eq $c_ver ? "you have the latest.\n" :
                "$c_ver is the latest.\nhttp://wildspark.com/asher/ija/ija-current.gz\n";
        }
    }
    else {
        print"Invalid command. Type 'h' for help.\n";
    }
} # end of main loop
# author_profile([AUTHOR])
#
# Fetch the deja.com posting profile for AUTHOR (default: the file-scoped
# $author, i.e. the author of the last article loaded) and print one
# "count  label" line per profile row that matches, followed by the total
# message count and the author address reported by the server.
#
# Dies (via fetch) when the page cannot be retrieved.
sub author_profile {
    my $x_author   = shift;
    my $r_author   = 'UNKNOWN';    # author returned by server
    my $total_msgs = 0;            # num. returned by server
    my $s_author   = $x_author ? $x_author : $author;

    # The author string usually contains blanks, '<', '>' or '@', so it has to
    # be escaped before it is placed in the query string.
    my $url = 'http://www.deja.com/profile.xp?author='
            . uri_escape($s_author) . '&ST=PS';
    my $page = fetch($url);

    # Rows are separated by blank lines; each row is joined onto one line
    # before matching.
    for my $row (split(/\n\n+/, $page)) {
        $row =~ s/\n//g;
        if ($row =~ /^<.{78,88}>\s*(\d+)<.*>(.+)<.{15,23}$/) {
            printf("%6d %s\n", $1, $2);
        }
        if ($row =~ /There are (\d+) unique messages by/) {
            $total_msgs = $1;
        }
        if ($row =~ /^<a href="mailto:([^"]+)"/i) {
            $r_author = $1;
        }
    }

    # Decode the HTML entities that can appear in the address.
    $r_author =~ s/&quot;/"/ig;
    $r_author =~ s/&lt;/</ig;
    $r_author =~ s/&gt;/>/ig;
    printf("%6d by %s\n", $total_msgs, $r_author);
}
# load_article(AN)
#
# Fetch the article identified by AN (the value stored in @article_no by
# get_results), reduce the HTML page to plain text and print it, through
# $pager when the text is longer than $page_thresh lines.
#
# Side effects: leaves the plain text in the file-scoped $page (used by the
# 's' and 'r' commands) and sets $author from the article's From: line.
# Dies when the fetch fails, when no From: header is found in the page, or
# when the pager cannot be started.
sub load_article {
    my $an = shift;

    $verbose && print"******************** Article $an ********************\n";
    my $url = "http://groups.google.com/groups?q=xxxxxx&num=25&lr=&safe=off&rnum=1&ic=1&selm=$an";
    $page = fetch($url);

    # save for debugging if necessary (only when the file already exists):
    if (-f 'ija-article.html' && open(my $dbg, '>', 'ija-article.html')) {
        print {$dbg} $page;
        close $dbg;
    }

    my $orig_len = length($page);

    # Split the page at every <table> tag and strip the markup from each piece.
    my @el = map { unhtml($_) } split(/<table[^>]*>/i, $page);
    # print"*** $_ : $el[ $_ ]\n" for 0..$#el;

    # Index of the first piece holding the header. Test with defined(): index
    # 0 is a valid hit and must not be mistaken for "not found".
    my ($header_el) = grep { $el[ $_ ] =~ /\nFrom:/ } 0 .. $#el;
    die "Header not found in message.$fail" unless defined $header_el;
    # print"Header el = $header_el\n";

    $el[ $header_el + 1 ] =~ s/ This is the only article in this thread //;
    $el[ $header_el + 1 ] =~ s/\s*View complete thread\s*//;
    # the meat is in these two elements - don't need the rest
    $page = $el[ $header_el ] . $el[ $header_el + 1 ];
    $page =~ s/$_/\n$_/ for qw( Subject: Date: );
    $page =~ s/(Date:.\S+ \S+ \S+)/$1\n\n/;
    $page =~ s/View: Original Format//;
    $page =~ s/^(From:.*)\((\S+\@\S+)\)/$1<$2>/m;
    $page =~ s/^Search result \d+\n//i;
    $page =~ s/\[view thread\]//i;
    $page =~ s/\s+$//;
    $page =~ s/\n\n\n/\n\n/;
    $page =~ s/View:.*Original Format.*//;

    # Decide on paging from the text that will actually be shown; the raw
    # HTML line count says little about the length of the article.
    my $line_count = ($page =~ tr/\n//) + 1;
    my $pager_fh;
    if ($page_thresh && $line_count > $page_thresh) {
        open($pager_fh, '|-', $pager) || die"Can't open $pager :$!";
    }
    my $out = $pager_fh || \*STDOUT;

    print {$out} "-" x 70, "\n";
    # Calculate the 'compression' - ratio of message bytes to web page bytes.
    # $orig_len cannot be zero here: an empty page has no From: header.
    my $new_len = length($page);
    my $percent_compress = int(100 * $new_len / $orig_len);
    print {$out} $page;
    print {$out} "\n[Ref:$ref AN:$an $percent_compress% txt]\n";

    # Remember the author of the article just shown.
    if ($page =~ /^From:\s*([^\n]+)\n/) {
        $author = $1;
    }

    # Closing the pipe waits for the pager to exit.
    close $pager_fh if $pager_fh;
}
# show_results(START)
#
# Write one screen of the current result list through the STDOUT format,
# covering refs START .. START+$chunk_size-1, then print the summary line.
# Always emits exactly $chunk_size rows so that each listing fills one format
# page. Prints "No match" and returns 0 when there are no hits.
sub show_results {
    my $first = shift;

    if (!$total_hits) {
        print"No match\n";
        return 0;
    }

    my $last = $first + $chunk_size - 1;
    for my $row ($first .. $last) {
        # The format reads the file-scoped variables, so they are filled in
        # here rather than through the loop variable.
        $ref = $row;
        ($date, $author, $group, $subject) = split(/\0/, $result[$row]);
        write;
    }

    print"Results $offset through ", $offset + $chunk_size - 1, " of $total_hits\n";
    $verbose && print"Next=$next_msg_url\nPrevious=$prev_msg_url\n";
}
# get_results(URL)
#
# Fetch a search-results page and rebuild the file-scoped result state:
#   @result       - "\0"-joined date/author/group/subject, indexed by ref no.
#   @article_no   - article id (the selm= value) indexed by ref no.
#   $total_hits   - total as printed on the page, 0 when nothing matched
#   $next_msg_url / $prev_msg_url - absolute links to the neighbouring pages,
#                   or '' when the page has none
#
# Returns 0 when the page reports no matches. Dies when the fetch fails or
# when the page carries neither a "Results" line nor a no-match notice.
sub get_results {
    my $url = shift;
    my $an;

    $next_msg_url = '';
    $prev_msg_url = '';
    # Empty the arrays. Assigning {} would instead store a hash reference in
    # element 0, which then passes for a valid article number.
    @result     = ();
    @article_no = ();

    # Lexical on purpose: the file-scoped $page keeps the current article.
    my $page = fetch($url);

    # Save a copy of the page for debugging (only when the file exists):
    if (-f 'deja.html' && open(my $dbg, '>', 'deja.html')) {
        print {$dbg} $page;
        close $dbg;
    }

    # Flatten the page so the patterns below can span former line breaks.
    $page =~ s/\n//g;
    $page =~ s|</?b>||gi;
    $page =~ s/<br>//gis;

    if ($page =~ /Results (\d+) - (\d+) of.*?([\d\,]+)/) {
        $ref        = $1;    # ref no. of the first hit on this page
        $total_hits = $3;
    }
    elsif ($page =~ /did not match any documents/) {
        $total_hits = 0;
        return 0;
    }
    else {
        die "Results tag not found.$fail";
    }

    while ($page =~ m|
        <a\s+href=[^>]+selm= ([^>]+) # Article Number
        [^>]*> ([^<]+) # subject
        .*?
        <a[^>]+group=[^>]+> ([^<]+) # group
        </a>\s+\-\s+ (\w\w\w\.?\s+\d{1,2}\,\s+\d\d\d\d) # date
        \s+by\s+ ([^<]+) # author
        |igxs) {
        ($an, $subject, $group, $date, $author) = ($1, $2, $3, $4, $5);
        for ($date, $an, $ref, $subject, $group, $author) {
            s/^\s+//;
            s/\s+$//;
        }
        # "Mon. D, YYYY" -> "D.Mon.YY"
        $date =~ s/(\w\w\w)\. (\d{1,2})\, \d\d(\d\d)/$2.$1.$3/;
        $article_no[$ref] = $an;
        $result[$ref]     = join("\0", $date, $author, $group, $subject);
        # print "** $result[$ref]\n";
        $ref++;
    }

    $next_msg_url = abs_url($1) if $page =~ /<a\s+href=([^>]+)>(<[^>]+>\s*)*Next</i;
    $prev_msg_url = abs_url($1) if $page =~ /<a\s+href=([^>]+)>(<[^>]+>\s*)*Previous</i;
}
# unhtml(HTML)
#
# Return HTML as plain text: every tag is removed, then the character
# entities &gt; &lt; &quot; and &amp; are decoded (case-insensitively).
# &amp; is decoded last so that text such as "&amp;lt;" comes out as the
# literal "&lt;" instead of being decoded twice.
sub unhtml {
    my $p = shift;
    $p =~ s/<[^>]+>//g;
    $p =~ s/&gt;/>/gi;
    $p =~ s/&lt;/</gi;
    $p =~ s/&quot;/"/gi;
    $p =~ s/&amp;/&/gi;
    $p;
}
# abs_url(LINK)
#
# Return LINK unchanged when it starts with "http"; otherwise drop one
# leading "/" and prefix the result with $QBASE.
sub abs_url {
    my ($link) = @_;

    if ($link =~ /^http/) {
        return $link;
    }

    (my $relative = $link) =~ s|^/||;
    return $QBASE . $relative;
}
# fetch(URL)
#
# GET URL through the shared user agent $ua and return the response body.
# Dies with the URL, the error rendered as HTML and the $fail hint when the
# response is not a success.
sub fetch {
    my ($url) = @_;

    my $response = $ua->request(HTTP::Request->new('GET', $url));
    unless ($response->is_success) {
        die "$url failed: ", $response->error_as_HTML, $fail;
    }
    return $response->content;
}
# syntax highlighted by Code2HTML, v. 0.9.1