#!/usr/local/bin/perl -w
#
# Cluster US population data with Algorithm::Cluster's k-means (kcluster):
# read a ';'-delimited data file, partition the rows into 49 clusters,
# print each cluster's members, then the within-cluster sum of distances.
#
use strict;
use warnings;

use Algorithm::Cluster qw/kcluster/;

my $file = "US_data/US.txt";

my (@name, @data, @mask);

# NOTE(review): the original used an unchecked 2-arg open onto the bareword
# DATA handle, which shadows Perl's special DATA handle; use a lexical
# filehandle with the 3-arg form instead.
open my $fh, '<', $file or die "Can't open file $file: $!";

#------------------
# Read in the data file, and save the data to @data.
# We know that the file is intact and has no holes,
# so just set the mask to 1 for every item.
# We don't check for errors in this case, because the file
# is short and we can spot errors by eye.
#
#my $firstline = <$fh>; # Skip the title line (left disabled, as in the original)
my $i = 0;
while (my $line = <$fh>) {
    chomp $line;
    my @field = split /;/, $line;
    # Label each row as "name(field7)"; fields 1-2 are the two numeric coords.
    # NOTE(review): assumes every row has at least 8 ';'-separated fields —
    # TODO confirm against the data file.
    $name[$i] = $field[0] . "(" . $field[7] . ")";
    $data[$i] = [ @field[1 .. 2] ];
    $mask[$i] = [ 1, 1 ];    # every value is present
    ++$i;
}
close $fh or die "Can't close $file: $!";

#------------------
# Make a reverse-lookup index: row id -> name.
#
my %name_by_rowid;
$name_by_rowid{$_} = $name[$_] for 0 .. $#name;

my @weight = (1.0) x 2;    # both coordinates weighted equally

#------------------
# Define the params we want to pass to kcluster:
# 49 clusters, 1000 passes, arithmetic-mean centroids ('a'),
# Euclidean distance ('e').
my %params = (
    nclusters => 49,
    transpose => 0,
    npass     => 1000,
    method    => 'a',
    dist      => 'e',
    data      => \@data,
    mask      => \@mask,
    weight    => \@weight,
);

#------------------
# Here is where we invoke the library function!
#
my ($clusters, $error, $found) = kcluster(%params);

#------------------
# Create a reverse index of the names, by cluster ID.
#
my %name_by_cluster;
$i = 0;
foreach my $cluster_id ( @{$clusters} ) {
    push @{ $name_by_cluster{$cluster_id} }, $name_by_rowid{ $i++ };
}

#------------------
# Print out a list of the rows, grouped by cluster ID,
# as returned by the kcluster() function.
#
for my $cluster_id ( 0 .. $params{nclusters} - 1 ) {
    # Guard against an empty cluster so we never dereference undef.
    my $members = $name_by_cluster{$cluster_id} || [];
    print "------------------\n";
    printf "Cluster %d: %d puntos\n\n", $cluster_id, scalar @{$members};
    print "$_\n" foreach sort { $a cmp $b } @{$members};
    print "\n";
}

#------------------
# Print out the resulting within-cluster sum of distances.
#
print "------------------\n";
printf "Within-cluster sum of distances: %f\n\n", $error;

# Exit with success; the original 'exit 1' signalled failure to the shell.
exit 0;

__END__
Este módulo en Perl y C es muy eficiente. Probado con 189900 poblaciones,
sólo tardó unos 5 días en producir 80 clusters coherentes.