#!/usr/bin/perl -w use 5.032; # local $/ = undef; my @cont; my @dest; my $content; my $file = $ARGV[0] // "AMillionRandomDigits.bin"; if (-s $file > 16777216) { die "Tamaño no soportado"; } open IN, '<' , $file or die $!; while (){ $content .= $_; } close IN; my $l0 = length($content); say "origen $l0"; # my $pos = 0; my $DEBUG = 1; my $SIZE = 20; # importante my %hc; my %ho; # my $s; my $copia = $content; while (my $chunk = substr ($content, 0, $SIZE, '')){ # extraigo 5*8 = 40 bits $hc{$chunk}++; if (length($chunk) < $SIZE){ $hc{$chunk} = 0; # esto es importante para que el cacho del final, que es más corto, quede siempre } # en último lugar de la ordenación. Si no, corrompe las cadenas } # for my $i (keys %hc){ # $s += $hc{$i}; # } # say "$s chunks" if $DEBUG; my @odict = sort { $hc{$b} <=> $hc{$a} } keys %hc; # DICCIONARIO PRINCIPAL open OUT, ">", "millionbits$SIZE.dict" or die $!; for (@odict){ print OUT $_; } close OUT; say scalar(@odict), " number of ordered chunks" if $DEBUG; ### compresión my $kounter = 0; for (@odict){ # say bin($kounter) if $DEBUG; # $ho{$_} = pack "L", ($kounter); $ho{$_} = part($kounter); # 2 niveles de compresión: en %ho no están todos los chunk, sino solo los que no se repiten $kounter++; # el segundo nivel es que el binario asociado sea más corto para los más frecuentes. } open OUT, ">", "millionbits$SIZE.comp" or die $!; while (my $chunk = substr ($copia, 0, $SIZE, '')){ print OUT $ho{$chunk}; # escribo números de 24 bits en caractéres } close OUT; ### descompresión use Fcntl; my @dict; my $bytes; my $k = 0; sysopen IN, "millionbits$SIZE.dict", O_RDONLY or die $!; do { sysread IN, $bytes, $SIZE; $dict[$k] = $bytes; $k++; }until (length($bytes) < $SIZE); close IN; open IN, "<", "millionbits$SIZE.comp" or die $!; my $comp; while (){ $comp .= $_; } close IN; my $num; open OUT, ">", "millionbits$SIZE" or die $!; while (my $chunk = substr ($comp, 0, 3, '')){ # debo leer números de 24 bits (3*8 = 24) # $num = unpack "L" , $chunk; if (length($chunk)<3){ $chunk = "\0" x (3-length($chunk)) . $chunk; } $num = unpart($chunk); print OUT $dict[$num]; } close OUT; if ($DEBUG){ say "OK" if (1230 == unpart(part(1230))); } exit 1; sub bin{ # esto debe sustituir a pack/unpack "L" que es 4 bytes, por algo de solo 3 bytes my $n = shift; # posteriormente en caractéres pack "CCC" my $bin = ""; for (0..23){ if ($n & 2**$_){ $bin .= "0"; }else{ $bin .= "1"; } last if $n <= 2**$_; } return $bin; } sub part { my $n = shift; my $n1 = int ($n / (256**2)); my $r1 = $n - $n1*(256**2); die "Número desbocado" if $n1 > 255; my $n2 = int ($r1 / 256); my $r2 = $n -$n1*256**2 - $n2*256; my $n3 = $r2 / 1 ; return chr($n1).chr($n2).chr($n3); } sub unpart { my $s = shift; my $n1 = 256**2 * ord(substr ($s, 0, 1)); my $n2 = 256 * ord(substr ($s, 1, 1)); my $n3 = ord(substr ($s, 2, 1)); return $n1+$n2+$n3; }