#!/usr/bin/perl -w

use 5.032;

# local $/ = undef;

my @cont;
my @dest;
my $content;
my $file = $ARGV[0] // "AMillionRandomDigits.bin";
if (-s $file > 16777216) { die "Tamaño no soportado"; }
open IN, '<' , $file or die $!;
while (<IN>){
    $content .= $_;
}
close IN;

my $l0 = length($content);

say "origen  $l0";


# my $pos = 0;
my $DEBUG = 1;
my $SIZE = 20;                              # importante
my %hc;
my %ho;
# my $s;
my $copia = $content;
while (my $chunk = substr ($content, 0, $SIZE, '')){           # extraigo 5*8 = 40 bits
    $hc{$chunk}++;
    if (length($chunk) < $SIZE){
	$hc{$chunk} = 0;                         # esto es importante para que el cacho del final, que es más corto, quede siempre
    }                                            # en último lugar de la ordenación. Si no, corrompe las cadenas
}
# for my $i (keys %hc){
#    $s += $hc{$i};
# }
# say "$s chunks" if $DEBUG;
my @odict = sort { $hc{$b} <=> $hc{$a} } keys %hc;                      # DICCIONARIO PRINCIPAL
open OUT, ">", "millionbits$SIZE.dict" or die $!;
for (@odict){
    print OUT $_;
}
close OUT;
say scalar(@odict), " number of ordered chunks" if $DEBUG; 


### compresión

my $kounter = 0;
for (@odict){
#    say bin($kounter) if $DEBUG;
#    $ho{$_} = pack "L", ($kounter);
    $ho{$_} = part($kounter);                        # 2 niveles de compresión: en %ho no están todos los chunk, sino solo los que no se repiten
    $kounter++;                                      # el segundo nivel es que el binario asociado sea más corto para los más frecuentes.
}
open OUT, ">", "millionbits$SIZE.comp" or die $!;
while (my $chunk = substr ($copia, 0, $SIZE, '')){
    print OUT $ho{$chunk};                                 # escribo números de 24 bits en caractéres
}
close OUT;


### descompresión

use Fcntl;

my @dict;
my $bytes;
my $k = 0;
sysopen IN, "millionbits$SIZE.dict", O_RDONLY or die $!;
do {
    sysread IN, $bytes, $SIZE;
    $dict[$k] = $bytes;
    $k++;
}until (length($bytes) < $SIZE);
close IN;

open IN, "<", "millionbits$SIZE.comp" or die $!;
my $comp;
while (<IN>){
    $comp .= $_;
}
close IN;

my $num;
open OUT, ">", "millionbits$SIZE" or die $!;
while (my $chunk = substr ($comp, 0, 3, '')){                 # debo leer números de 24 bits  (3*8 = 24)
#    $num = unpack "L" , $chunk;
    if (length($chunk)<3){
	$chunk = "\0" x (3-length($chunk)) . $chunk;
    }
    $num = unpart($chunk);
    print OUT $dict[$num];
}
close OUT;

if ($DEBUG){
    say "OK" if (1230 == unpart(part(1230)));
}
    
exit 1;

sub bin{                                        # esto debe sustituir a pack/unpack "L" que es 4 bytes, por algo de solo 3 bytes
    my $n = shift;                              # posteriormente en caractéres pack "CCC"
    my $bin = "";
    for (0..23){
	if ($n & 2**$_){
	    $bin .= "0";
	}else{
	    $bin .= "1";
	}
	last if $n <= 2**$_;
    }
    return $bin;
}

sub part {
    my $n = shift;
    my $n1 = int ($n / (256**2));
    my $r1 = $n - $n1*(256**2);
    die "Número desbocado" if $n1 > 255;
    my $n2 = int ($r1 / 256);
    my $r2 = $n -$n1*256**2 - $n2*256;
    my $n3 = $r2 / 1 ;
    return chr($n1).chr($n2).chr($n3);
}

sub unpart {
    my $s = shift;
    my $n1 = 256**2 * ord(substr ($s, 0, 1));
    my $n2 = 256 * ord(substr ($s, 1, 1));
    my $n3 = ord(substr ($s, 2, 1));
    return $n1+$n2+$n3;
}
