#!/usr/bin/perl -w

use 5.032;

# use Compress::Zstd;

# local $/ = undef;

my @cont;
my @dest;
my $content;
my $file = $ARGV[0] // "AMillionRandomDigits.bin";
if (-s $file > 16777216) { die "Tamaño no soportado"; }
open IN, '<' , $file or die $!;
while (<IN>){
    $content .= $_;
}
close IN;

my $l0 = length($content);

say "origen  $l0";



goto C;


goto B;

A:
say "FASE A";

my $p = int $l0/10;

my $r = $l0 - $p*10; 

my ($sum, $sum2);
my @dest2;
for my $i (0..9){
    $cont[$i] = substr $content, 0, $p, "";
    if ($i == 9) {
	$cont[$i] .= $content;
    }
    say "origen $i = ", length($cont[$i]);
#    system "echo $cont[$i] > temp.bin";
    open OUT, '>', "temp.bin" or die $!;
    print OUT $cont[$i];
    close OUT;
    
    $dest2[$i] = `xz -9e --stdout temp.bin`; 
    $dest[$i] = compress($cont[$i], 100);
    say "destino $i = ", length($dest[$i]);
    say "destino2 $i = ", length($dest2[$i]);
    $sum += length($dest[$i]);
    $sum2 += length($dest2[$i]);
}

say "suma = $sum";
say "suma2 = $sum2";

# my $dest = compress($content, 100);





say "FASE B";
B:
goto C;

my %hash;
my @eq;
my $tmp;
system "xxd AMillionRandomDigits.bin > hex.txt";
open IN, '<', "hex.txt" or die $!;
while (my $line = <IN>){
    chomp $line;
    my ($counter, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $p8, $tmp) = split /\s/, $line;
    $counter =~ s/://;
    $counter =~ s/^0+//;
    my $c = 0;
    for ($p1, $p2, $p3, $p4, $p5, $p6, $p7, $p8){
	$c++;
	if (defined $_){
	    $hash{$_}++;
	    push @{ $eq[hex $_] }, $counter.$c;  # (hex + no. 1-8) 
	}
    }
}
close IN;


for my $i (keys %hash){
    say $hash{$i};
    say "@{ $eq[hex $i] }";
#    say "";
}

# while (my ($i,$j) = each %hash){

# my @oeq = sort { scalar(@$a) <=> scalar(@$b) } @eq;

# for my $j (@oeq){
#    for my $k (@$j){
#	say $k;
#    }
#    say "";
# }

open OUT, '>', "position.txt" or die $!; 
for my $i (keys %hash){
    if ($hash{$i} >= 2){
	my $k = 0;
	for my $j (@{ $eq[hex $i] }){
	    if ($j ne @{ $eq[hex $i] }[0]){
		print OUT hex($j)-hex(@{ $eq[hex $i] }[$k]), " ";
		$k++;
	    }else{
		print OUT $j," ";   
	    }
	}
	say OUT "";
    }else{
        say OUT @{ $eq[hex $i] }[0];
    }
}
close OUT;





C:
  say "FASE C";
# my $pos = 0;
my $DEBUG = 1;
my $SIZE = 10;    # importante
my %hc;
my %ho;
# my $s;
my $copia = $content;
while (my $chunk = substr ($content, 0, $SIZE, '')){           # extraigo 5*8 = 40 bits
    $hc{$chunk}++;
    if (length($chunk) < $SIZE){
	$hc{$chunk} = 0;                         # esto es importante para que el cacho del final, que es más corto, quede siempre
    }                                            # en último lugar de la ordenación. Si no, corrompe las cadenas
}
# for my $i (keys %hc){
#    $s += $hc{$i};
# }
# say "$s chunks" if $DEBUG;
my @odict = sort { $hc{$b} <=> $hc{$a} } keys %hc;                      # DICCIONARIO PRINCIPAL
open OUT, ">", "millionbits9.dict" or die $!;
for (@odict){
    print OUT $_;
}
close OUT;
say scalar(@odict), " number of ordered chunks" if $DEBUG; 


### compresión

my $kounter = 0;
for (@odict){
#    say bin($kounter) if $DEBUG;
#    $ho{$_} = pack "L", ($kounter);
    $ho{$_} = part($kounter);                        # 2 niveles de compresión: en %ho no están todos los chunk, sino solo los que no se repiten
    $kounter++;                                      # el segundo nivel es que el binario asociado sea más corto para los más frecuentes.
}
open OUT, ">", "millionbits9.comp" or die $!;
while (my $chunk = substr ($copia, 0, $SIZE, '')){
    print OUT $ho{$chunk};                                 # escribo números de 24 bits en caractéres
}
close OUT;


### descompresión

use Fcntl;

my @dict;
my $bytes;
my $k = 0;
sysopen IN, "millionbits9.dict", O_RDONLY or die $!;
do {
    sysread IN, $bytes, $SIZE;
    $dict[$k] = $bytes;
    $k++;
}until (length($bytes) < $SIZE);
close IN;

open IN, "<", "millionbits9.comp" or die $!;
my $comp;
while (<IN>){
    $comp .= $_;
}
close IN;

my $num;
open OUT, ">", "millionbits9" or die $!;
while (my $chunk = substr ($comp, 0, 3, '')){                 # debo leer números de 24 bits  (3*8 = 24)
#    $num = unpack "L" , $chunk;
    if (length($chunk)<3){
	$chunk = "\0" x (3-length($chunk)) . $chunk;
    }
    $num = unpart($chunk);
    print OUT $dict[$num];
}
close OUT;

if ($DEBUG){
    say "OK" if (1230 == unpart(part(1230)));
}
    
exit 1;

sub bin{                                        # esto debe sustituir a pack/unpack "L" que es 4 bytes, por algo de solo 3 bytes
    my $n = shift;                              # posteriormente en caractéres pack "CCC"
    my $bin = "";
    for (0..23){
	if ($n & 2**$_){
	    $bin .= "0";
	}else{
	    $bin .= "1";
	}
	last if $n <= 2**$_;
    }
    return $bin;
}

sub part {
    my $n = shift;
    my $n1 = int ($n / (256**2));
    my $r1 = $n - $n1*(256**2);
    die "Número desbocado" if $n1 > 255;
    my $n2 = int ($r1 / 256);
    my $r2 = $n -$n1*256**2 - $n2*256;
    my $n3 = $r2 / 1 ;
    return chr($n1).chr($n2).chr($n3);
}

sub unpart {
    my $s = shift;
    my $n1 = 256**2 * ord(substr ($s, 0, 1));
    my $n2 = 256 * ord(substr ($s, 1, 1));
    my $n3 = ord(substr ($s, 2, 1));
    return $n1+$n2+$n3;
}
