# ports//news/ija/work/ija

#!/usr/bin/perl
# Ija - Access deja.com from command line.
# Copyright 1999 Asher Blum - licensed under the GPL.
# based on Randal Schwartz's alta script

# Release identifier: sent as part of the User-Agent string and compared
# against the published version by the 'v' command.
my $VERSION='0.4.15'; # 13-Aug-2003 - adapt to a google change in results

# Hint appended to fatal error messages (fetch failures, unparsable pages).
my $fail = "\nTry the 'v' command to see if the version is current.\n";

use strict;
use LWP::UserAgent;
use URI::Escape; # Escape a string into a URL query

# ---------------------------------------------------------------------------
# Shared state.  These file-scoped lexicals are read and written by the main
# loop, by the subs below and by the STDOUT formats.
# ---------------------------------------------------------------------------

# Fields of the result row currently being formatted.
my ($date, $subject, $group, $author);

my $ref;                  # ref no. (0-n) of current article
my $offset;               # ref no. of first article in current list
my $chunk_size  = 25;     # how many articles per screen (must match DN)
my $page_thresh = 24;     # msg size which triggers pager
my $verbose     = 0;      # print diagnostic messages?
my $start_url;
my $next_msg_url;         # url to get next list of articles
my $prev_msg_url;         # url to get previous list of articles
my $page;                 # the current html document
my $total_hits;           # no. hits, including (exactly|about)
my @result;               # current results list, \0 separated
my @article_no;           # AN number (big) keyed to ref no. (small)

$| = 1; # view output incrementally

# Fit one screen of results to the terminal when LINES looks believable:
# more than 5 rows, and either under 120 rows or running in an xterm.
my $lines = $ENV{ LINES };
if ($lines > 5) {
    if ($lines < 120 || $ENV{ TERM } =~ /xterm/) {
        $chunk_size  = $lines - 4;
        $page_thresh = $lines;
    }
}

# Program used to page long articles, e.g. 'less'.
my $pager = $ENV{PAGER} ? $ENV{PAGER} : 'less';

$= = $chunk_size+3; #make FORMAT work correctly

# A search URL is assembled as  $QBASE . $query_start . <terms> . $query_end.
my $QBASE       = 'http://groups.google.com/';   # base url for query
my $query_start = 'groups?q=';                   # first part of query

# Third part of query (after meat).  An older, longer parameter list used to
# be assigned here first and was immediately overwritten by this value.
my $query_end = "&btnG=Search&meta=site%3Dgroups&num=$chunk_size";

# author search: the escaped author name is appended to this string.
# NOTE(review): the embedded "as_uauthors=+abigail" looks like a leftover
# test value - confirm before removing.
my $aquery_end = "&num=$chunk_size&btnG=Google+Search&as_oq=&as_epq=&as_eq=&as_ugroup=&as_usubject=&as_uauthors=+abigail&as_umsgid=&lr=&as_uauthors=+";
# Page header for result listings: title, column captions and a rule.
# Perl emits it automatically whenever write() starts a new page on STDOUT.
format STDOUT_TOP = 
                                 Search Results
Ref Date     From           Newsgroup              Subject
--------------------------------------------------------------------------------
.
# One result row.  show_results() loads $ref, $date, $author, $group and
# $subject and then calls write(); fields longer than their picture are
# truncated.
format STDOUT = 
@## @>>>>>>>>>>> @<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<
$ref,$date,  $author,       $group,                $subject
.

# ---------------------------------------------------------------------------
# Startup: build the shared user agent and, if a query was given on the
# command line, run it and show the first screen of results.
# ---------------------------------------------------------------------------

# First command-line argument, if any, is the initial search query.
my $first_target = shift;

# One user agent is shared by every request made through fetch().
my $ua = LWP::UserAgent->new;
$ua->agent("ija/$VERSION");

# Let LWP pick up proxy settings from the environment unconditionally.
# Guarding this on upper-case HTTP_PROXY alone meant the conventional
# lower-case http_proxy (and no_proxy) were ignored; env_proxy() does
# nothing when no proxy variables are set.
$ua->env_proxy();

if ($first_target) {
    my $query = uri_escape($first_target);
    $verbose && print"query=$query\n";
    if ($query =~ /./) {
        $start_url = "$QBASE$query_start$query$query_end";
        &get_results($start_url);
    }

    $offset = 1;
    &show_results($offset);
}

# Main Loop:
# Read commands from user, one per line, and dispatch on the leading
# character(s).  Type 'h' at the prompt for the command summary.
#
# Local failures (an unwritable save file, no lpr, an unreachable version
# host) are reported and the loop continues, so a typo does not end the
# session.  Search/article fetch failures still die inside fetch().
while (<>) {
    chomp;
    if (/^n/ && $next_msg_url =~ /./) {
        # next screen of results
        &get_results($next_msg_url);
        $offset += $chunk_size;
        &show_results($offset);
    }
    elsif (/^p/ && $prev_msg_url =~ /./) {
        # previous screen of results
        &get_results($prev_msg_url);
        $offset -= $chunk_size;
        &show_results($offset);
    }
    elsif (/^l/) {
        &show_results($offset);
    }
    elsif (/^m\s*(\d*)/) {
        # show or set the number of hits requested per screen
        my $wanted = $1;
        if (!$wanted) {
            print"Max hits = $chunk_size\n";
        }
        elsif ($wanted > 100) {
            print"Cannot increase max hits above 100.\n";
        }
        else {
            $chunk_size = $wanted;
            $query_end  =~ s/num=\d+/num=$chunk_size/;
            $aquery_end =~ s/num=\d+/num=$chunk_size/;
            $= = $chunk_size+3; #make FORMAT work correctly
        }
    }
    elsif (/^s\s*(.+)/) {
        # append the current article to a file
        my $file = $1;
        $file =~ s/\s+$//;    # 2-arg open used to strip trailing blanks
        if (open(my $out, '>>', $file)) {
            print $out $page;
            if (close $out) {
                print"Wrote ", length($page), " bytes to $file.\n";
            }
            else {
                print"Error writing $file: $!\n";
            }
        }
        else {
            print"Can't open $file for output: $!\n";
        }
    }
    elsif (/^t\s*(\d*)/) {
        # show or set the paging threshold
        if (!length($1)) {
            print"Page threshold = $page_thresh.  ".
                ($page_thresh ? "Set it to 0 to disable paging.\n" : "Paging disabled.\n");
        }
        else {
            $page_thresh = $1;
        }
    }
    elsif (/^r/) {
        # send the current article to the printer
        if (open(my $lpr, '|-', 'lpr')) {
            print $lpr $page;
            close $lpr;
        }
        else {
            print"Can't open pipe to lpr: $!\n";
        }
    }
    elsif (/^f\s*(.+)/) {
        # new search
        $start_url = "$QBASE$query_start".uri_escape($1)."$query_end";
        &get_results($start_url);
        $offset = 1;
        &show_results($offset);
    }
    elsif (/^X/) {
        # resize the xterm to fit one screen of results; same TERM test as
        # the startup sizing code, so e.g. xterm-256color qualifies too
        my $new_height = $chunk_size + 4;
        print( ($ENV{ TERM } =~ /xterm/)
            ? chr(27) . "[8;$new_height;80t"
            : "Can't: TERM != xterm.\n" );
        &show_results($offset);
    }
    elsif (/^[H\?h]/) {
        print <<EOH;
a [<name>] Author search.
f <query>  Search for a new query string.
h          Print help message.
l          List current group of messages.
m <1-100>  Set max hits returned. (25 normally)
n          List next group of messages.
p          List previous group of messages.
q          Quit.
r          Print the current message.
s <file>   Save the current message to a file.
t <number> Set the paging threshold - longer articles will be paged.
v          Version - check if there's a newer version.
X          Size your xterm for the current max hits
<ref>      Enter a reference number to view a message.

EOH
    }
    elsif (/^a\s*(.*)/) {
        # author search
        $start_url = "$QBASE$query_start$aquery_end" . uri_escape($1);
        &get_results($start_url);
        $offset = 1;
        &show_results($offset);
    }
    elsif (/^(\d{1,3})/) {
        # view the article with this reference number
        $ref = $1;
        if ($article_no[$ref] !~ /./) {
            print"Invalid number\n";
        }
        else {
            &load_article($article_no[$ref]);
        }
    }
    elsif (/^q/) {
        exit(0);
    }
    elsif (/^v/) {
        print "Ija version $VERSION - ";
        # fetch() dies on HTTP failure; trap it so an unreachable version
        # host does not terminate the session.
        my $c_ver = eval { fetch('http://wildspark.com/asher/ija/version') };
        if (!defined $c_ver) {
            print "unable to check for a newer version.\n$@";
        }
        else {
            chomp $c_ver;
            chomp $c_ver;
            print $VERSION eq $c_ver ? "you have the latest.\n" :
                "$c_ver is the latest.\nhttp://wildspark.com/asher/ija/ija-current.gz\n";
        }
    }
    else {
        print"Invalid command.  Type 'h' for help.\n";
    }
} # end of main loop

# author_profile([$x_author])
#
# Fetch the deja.com posting profile for an author and print it: one
# "count  text" line per profile row that matches the expected markup,
# followed by the total message count and the author address reported by
# the server.
#
#   $x_author - author to look up; when empty, the author of the most
#               recently viewed article ($author) is used.
#
# Dies (inside fetch) if the HTTP request fails.
# NOTE(review): no caller is visible in this file, and the URL still points
# at deja.com - confirm whether this sub is still needed.
sub author_profile {
    my $x_author   = shift;
    my $r_author   = 'UNKNOWN';    # author returned by server
    my $total_msgs = 0;            # num. returned by server
    my $s_author   = $x_author ? $x_author : $author;

    # Author strings contain spaces, '<', '@' and the like, so they must be
    # escaped before being placed in the query string.
    my $url = 'http://www.deja.com/profile.xp?author='
            . uri_escape($s_author) . '&ST=PS';
    my $page = fetch($url);

    # Rows are separated by blank lines; flatten each one before matching.
    my @row = split(/\n\n+/, $page);
    for (@row) {
        s/\n//g;
        if (/^<.{78,88}>\s*(\d+)<.*>(.+)<.{15,23}$/) {
            printf("%6d %s\n", $1, $2);
        }
        if (/There are (\d+) unique messages by/) {
            $total_msgs = $1;
        }
        if (/^<a href="mailto:([^"]+)"/i) {
            $r_author = $1;
        }
    }

    # Decode the few entities that can appear in the address.
    $r_author =~ s/&quot;/"/ig;
    $r_author =~ s/&lt;/</ig;
    $r_author =~ s/&gt;/>/ig;
    printf("%6d by %s\n", $total_msgs, $r_author);
}
# load_article($an)
#
# Fetch one article from Google Groups, reduce the HTML page to plain text
# and print it, followed by a "[Ref:.. AN:.. N% txt]" footer.
#
#   $an - article identifier as captured from the "selm=" link of a result.
#
# Side effects: leaves the plain-text article in the global $page (used by
# the 's' and 'r' commands) and sets $author from its From: line.  When
# $page_thresh is non-zero and the article text has more lines than that,
# output is piped through $pager instead of going straight to STDOUT.
#
# Dies if the request fails, if no From: header can be found in the page,
# or if the pager cannot be started.
sub load_article {
    my $an = shift;
    $verbose && print"******************** Article $an ********************\n";

    my $url = "http://groups.google.com/groups?q=xxxxxx&num=25&lr=&safe=off&rnum=1&ic=1&selm=$an";
    $page = fetch($url);

    # save for debugging if necessary (only when the file already exists):
    if (-f 'ija-article.html' && open(my $dbg, '>', 'ija-article.html')) {
        print $dbg $page;
        close $dbg;
    }

    my $orig_len = length($page);

    # The page is a set of tables; strip the markup from each piece.
    my @el = map unhtml($_), (split /<table[^>]*>/i, $page);

    # Index of the first piece holding the article header.  Test with
    # defined(): index 0 is a legitimate answer and must not be mistaken
    # for "not found".
    my ($header_el) = grep { $el[$_] =~ /\nFrom:/ } 0 .. $#el;
    defined $header_el
        or die "Header not found in message.$fail";

    $el[ $header_el + 1 ] =~ s/ This is the only article in this thread //;
    $el[ $header_el + 1 ] =~ s/\s*View complete thread\s*//;

    # the meat is in these two elements - don't need the rest
    $page = $el[ $header_el ] . $el[ $header_el + 1 ];
    $page =~ s/$_/\n$_/ for qw( Subject: Date: );
    $page =~ s/(Date:.\S+ \S+ \S+)/$1\n\n/;
    $page =~ s/View: Original Format//;
    $page =~ s/^(From:.*)\((\S+\@\S+)\)/$1<$2>/m;
    $page =~ s/^Search result \d+\n//i;
    $page =~ s/\[view thread\]//i;
    $page =~ s/\s+$//;
    $page =~ s/\n\n\n/\n\n/;
    $page =~ s/View:.*Original Format.*//;

    # Calculate the 'compression' - ratio of message bytes to web page bytes.
    # $orig_len cannot be 0 here: a From: header was found in the page.
    my $new_len = length($page);
    my $percent_compress = int(100 * $new_len / $orig_len);

    # Decide on paging from the text that is actually shown, not from the
    # line count of the raw HTML.
    my @text_lines = split /\n/, $page;
    my $out = \*STDOUT;
    my $paging_mode = 0;
    if ($page_thresh && @text_lines > $page_thresh) {
        open(my $pipe, '|-', $pager) || die"Can't open $pager :$!";
        $out = $pipe;
        $paging_mode = 1;
    }

    print {$out} "-" x 70, "\n";
    print {$out} $page;
    print {$out} "\n[Ref:$ref AN:$an  $percent_compress% txt]\n";

    if ($page =~ /^From:\s*([^\n]+)\n/) {
        $author = $1;
    }

    # Closing the pipe waits for the pager to exit.
    close $out if $paging_mode;
}
# show_results($start)
#
# Print one screen of the current result list through the STDOUT format,
# beginning at reference number $start, followed by a "Results a through b
# of N" summary line.
#
# Only rows that actually exist in @result are printed, and the summary
# reports the last row shown, so a short final page no longer produces
# blank numbered rows or an overstated range.
#
# Returns 0 after printing "No match" when the last search had no hits.
# Side effect: overwrites the globals $ref, $date, $author, $group and
# $subject, which the format reads.
sub show_results {
    my $start = shift;
    unless ($total_hits) {
        print"No match\n";
        return 0;
    }

    # Force the column header at the top of every listing.  Without this a
    # listing shorter than the page length would leave lines "left over" on
    # the format page and the next listing would start without a header.
    $- = 0;

    my $last_shown;
    for my $r ($start .. $start + $chunk_size - 1) {
        next unless defined $result[$r];
        ($date, $author, $group, $subject) = split(/\0/, $result[$r]);
        $ref = $r; #Can't use "my" variable as for index
        write;
        $last_shown = $r;
    }

    if (defined $last_shown) {
        print"Results $start through $last_shown of $total_hits\n";
    }
    else {
        print"No results to display.\n";
    }
    $verbose && print"Next=$next_msg_url\nPrevious=$prev_msg_url\n";
}
# get_results($url)
#
# Fetch one page of Google Groups search results and parse it into the
# shared tables:
#   @result      - "date\0author\0group\0subject" keyed by reference number
#   @article_no  - article identifier (the "selm" value) keyed likewise
#   $total_hits  - hit count as printed by Google (may contain commas)
#   $next_msg_url / $prev_msg_url - links to the adjacent result pages,
#                  or '' when the page has none
# Reference numbers start at the first number of the page's
# "Results a - b of ..." line.  The global $ref is used as the running
# index and is left one past the last parsed result.
#
# Returns 0 when the page reports no matches, 1 otherwise.
# Dies if the request fails or the results line cannot be found.
sub get_results {
    my $url = shift;
    my $an;

    $next_msg_url = '';
    $prev_msg_url = '';

    # Really empty the tables.  Assigning {} would store a hash reference
    # in slot 0, which then passes for a valid article at reference 0.
    @result     = ();
    @article_no = ();

    my $page = &fetch($url);

    # Save a copy of the page for debugging (only when the file exists):
    if (-f 'deja.html' && open(my $dbg, '>', 'deja.html')) {
        print $dbg $page;
        close $dbg;
    }

    # Flatten the page and drop inline markup that would split the fields.
    $page =~ s/\n//g;
    $page =~ s|</?b>||gi;
    $page =~ s/<br>//gis;

    if ($page =~ /Results (\d+) - (\d+) of.*?([\d\,]+)/) {
        $ref = $1;
        $total_hits = $3;
    }
    elsif ($page =~ /did not match any documents/) {
        $total_hits = 0;
        return 0;
    }
    else {
        die "Results tag not found.$fail";
    }

    while ($page =~ m|
        <a\s+href=[^>]+selm=    ([^>]+)   # Article Number
        [^>]*>                  ([^<]+)   # subject
        .*?
        <a[^>]+group=[^>]+>     ([^<]+)   # group
        </a>\s+\-\s+            (\w\w\w\.?\s+\d{1,2}\,\s+\d\d\d\d) # date
        \s+by\s+                ([^<]+)   # author
        |igxs) {
        ($an, $subject, $group, $date, $author) = ($1, $2, $3, $4, $5);

        for ($date, $an, $subject, $group, $author) {
            s/^\s+//;
            s/\s+$//;
        }

        # "Aug. 13, 2003" -> "13.Aug.03".  The period after the month is
        # optional here just as in the match above, so unabbreviated months
        # such as "May" are rewritten too.
        $date =~ s/(\w\w\w)\.?\s+(\d{1,2})\,\s+\d\d(\d\d)/$2.$1.$3/;

        $article_no[$ref] = $an;
        $result[$ref]     = join("\0", $date, $author, $group, $subject);
        $ref++;
    }

    $next_msg_url = abs_url($1) if $page =~ /<a\s+href=([^>]+)>(<[^>]+>\s*)*Next</i;
    $prev_msg_url = abs_url($1) if $page =~ /<a\s+href=([^>]+)>(<[^>]+>\s*)*Previous</i;

    return 1;
}

# unhtml($html)
#
# Return a copy of $html with every <...> tag removed and the entities
# &gt; &lt; &quot; &amp; (any letter case) replaced by their characters.
# &amp; is decoded last so that text such as "&amp;lt;" comes out as the
# literal "&lt;" rather than "<".  The argument itself is not modified.
sub unhtml {
    my ($text) = @_;
    for ($text) {
        s/<[^>]+>//g;
        s/&gt;/>/gi;
        s/&lt;/</gi;
        s/&quot;/"/gi;
        s/&amp;/&/gi;
    }
    return $text;
}
# abs_url($url)
#
# Turn a link taken from a results page into an absolute URL.  Anything
# that already begins with "http" is returned unchanged; otherwise one
# leading "/" is dropped and the remainder is appended to $QBASE.
sub abs_url {
    my ($link) = @_;
    unless ($link =~ /^http/) {
        $link =~ s|^/||;
        $link = $QBASE . $link;
    }
    return $link;
}
# fetch($url)
#
# GET $url through the shared user agent $ua and return the response body.
# Dies with the URL, the response's HTML error page and the $fail hint when
# the response is not a success.
sub fetch {
    my ($url) = @_;
    my $response = $ua->request(HTTP::Request->new('GET', $url));
    unless ($response->is_success) {
        die "$url failed: ", $response->error_as_HTML, $fail;
    }
    return $response->content;
}
# syntax highlighted by Code2HTML, v. 0.9.1