#!/usr/perl/perl580/bin/perl -w use strict; use Algorithm::Cluster qw/kcluster/; my $file = "../../data/cyano.txt"; my $i = 0; my $j = 0; my (@orfname,@orfdata,@weight,@mask); open(DATA,"<$file") or die "Can't open file $file: $!"; #------------------ # Read in the data file, and save the data to @orfdata # We know that the file is intact and has no holes, # so just set the mask to 1 for every item. # We don't check for errors in this case, because the file # is short and we can spot errors by eye. # my $firstline = ; # Skip the title line while() { chomp(my $line = $_); my @field = split /\t/, $line; $orfname[$i] = $field[0]; $orfdata[$i] = [ @field[2..5] ]; $mask[$i] = [ 1,1,1,1 ]; ++$i; } close(DATA); #------------------ # Make a reverse-lookup index of the @orfnames hash: # my %orfname_by_rowid; $i=0; $orfname_by_rowid{$i++} = $_, foreach(@orfname); @weight = (1.0) x 4; #------------------ # Define the params we want to pass to kcluster my %params = ( nclusters => 6, transpose => 0, npass => 100, method => 'a', dist => 'e', data => \@orfdata, mask => \@mask, weight => \@weight, ); #------------------ # Here is where we invoke the library function! # my ($clusters, $error, $found) = kcluster(%params); # #------------------ #------------------ # Create a reverse index of the ORF names, by cluster ID # my %orfname_by_cluster; $i=0; foreach(@{$clusters}) { push @{$orfname_by_cluster{$_}}, $orfname_by_rowid{$i++}; } #------------------ # Print out a list of the ORFs, grouped by cluster ID, # as returned by the kcluster() function. # for ($i = 0; $i < $params{"nclusters"}; $i++) { print "------------------\n"; printf("Cluster %d: %d ORFs\n\n", $i, scalar(@{$orfname_by_cluster{$i} }) ); print "\t$_\n", foreach( sort { $a cmp $b } @{$orfname_by_cluster{$i} } ); print "\n"; } #------------------ # Print out the resulting within-cluster sum of distances. # print "------------------\n"; printf("Within-cluster sum of distances: %f\n\n", $error); #------------------ # Try this again with a specified initial clustering solution # my @initialid = (0,1,2,3,4,5) x 15; # choice for the initial clustering; the data file contains 90 genes. %params = ( nclusters => 6, transpose => 0, method => 'a', dist => 'e', data => \@orfdata, mask => \@mask, weight => \@weight, initialid => \@initialid, ); printf("Executing k-means clustering with a specified initial clustering\n"); ($clusters, $error, $found) = kcluster(%params); printf("Within-cluster sum of distances: %f\n\n", $error);