#!/usr/local/bin/perl -w

use strict;
use Algorithm::Cluster qw/kcluster/;

my $file = "US_data/US.txt";
my $i = 0;
my $j = 0;
my (@name,@data,@weight,@mask);

open(DATA,"<$file") or die "Can't open file $file: $!";


#------------------
# Read in the data file, and save the data to @orfdata
# We know that the file is intact and has no holes, 
# so just set the mask to 1 for every item.
# We don't check for errors in this case, because the file
# is short and we can spot errors by eye. 
#
#my $firstline = <DATA>;  # Skip the title line
while(<DATA>) {
#    s/^\s//;
    chomp(my $line = $_);
    my @field     = split /\;/, $line;
    $name[$i]  =   $field[0]."(".$field[7].")";
    $data[$i]  = [ @field[1..2] ];
    $mask[$i]     = [ 1,1 ];
    ++$i;
}
close(DATA);

#------------------
# Make a reverse-lookup index of the @orfnames hash:
#
my %name_by_rowid;
$i=0;
$name_by_rowid{$i++} = $_, foreach(@name);

@weight = (1.0) x 2;


#------------------
# Define the params we want to pass to kcluster
my %params = (
    nclusters =>         49,
    transpose =>         0,
    npass     =>       1000,
    method    =>       'a',
    dist      =>       'e',
    data      =>    \@data,
    mask      =>       \@mask,
    weight    =>     \@weight,
);


#------------------
# Here is where we invoke the library function!
#
my ($clusters, $error, $found) = kcluster(%params);
#
#------------------



#------------------
# Create a reverse index of the ORF names,  by cluster ID
#
my %name_by_cluster;
$i=0;
foreach(@{$clusters}) {
    push @{$name_by_cluster{$_}}, $name_by_rowid{$i++};
}


#------------------
# Print out a list of the ORFs, grouped by cluster ID,
# as returned by the kcluster() function.
#
for ($i = 0; $i < $params{"nclusters"}; $i++) {
    print "------------------\n";
    printf("Cluster %d:  %d puntos\n\n",
           $i, scalar(@{$name_by_cluster{$i} })
    );

    print "$_\n", foreach( sort { $a cmp $b } @{$name_by_cluster{$i} } );
    print "\n";
}


#------------------
# Print out the resulting within-cluster sum of distances.
#
print "------------------\n";
printf("Within-cluster sum of distances:  %f\n\n", $error); 

exit 1;

__END__
  
Este módulo en Perl y C es muy eficiente. Probado con 189900 poblaciones,
  sólo tardó unos 5 días en producir 80 clusters coherentes.
  
