#!/usr/bin/perl

use 5.040;

my $DEBUG =  0;

# A la versión 3 va la vencida :-)

# Example usage:
my $N = 200;

my @x;
@x = 0 .. $N-1;
my @y;
my $cc = 0;
$y[$cc++] = $_ * rand() for (@x);

my ($alpha, $beta);
($alpha, $beta) = simple_linear_regression(\@x, \@y);
say "alpha = $alpha   beta = $beta";

my $new_x = $N;

my $predicted_y = $alpha + $beta * $new_x;
#   confianza 0.95;   # que equivale 1.96 desviaciones
my $confidence_interval = 1.96 * stddev();     # * $variance
say "Confidence interval = + or - $confidence_interval";
say "Interval for x=$new_x: [",$predicted_y - $confidence_interval, " , ", $predicted_y + $confidence_interval,"]";
# my $str = <STDIN>;
my $c = 0;
my ($ok, $ko,$ko1, $ko2) = (0,0,0,0);
my $isoutlier;
while (++$c < $N * 10){
    my $new_y = $N * rand();  # Example value for the new data point's y-value
    
#    my $is_outlier = evaluate_new_data($alpha, $beta, $new_x, $new_y);

    if (abs($predicted_y - $new_y) > $confidence_interval) {
        $isoutlier = 1;  # The new data point is an outlier
    }else{
	$isoutlier = 0;
    }

    if ($isoutlier) {
	$ko++;
	print "$c -> $ko) The new data point (x = $new_x, y = $new_y) is an outlier at a 95% confidence level. ";
	if ($new_y > $predicted_y){
	    say "Excess";
	    $ko1++;
	}else{
	    say "Undervalue";
	    $ko2++;
	}
    } else {
	$ok++;
#	say "$c -> $ok) The new data point (x = $new_x, y = $new_y) is NOT an outlier at a 95% confidence level.";
    }
}
say "\n$ok counts on the interval, $ko outliers, ",100*$ko/($ok+$ko), "%";
say "$ko1 are exceding the interval, $ko2 lowering";
exit 3;


sub simple_linear_regression {
    my @x = @{$_[0]};
    my @y = @{$_[1]};

    my $n = scalar(@x);

    # Calculate sums
    my $sum_x = sum(@x);
    my $sum_y = sum(@y);

    my $sum_xy = 0;
    my $sum_x_squared = 0;
    for (my $i = 0; $i < $n; $i++) {
        $sum_xy += $x[$i] * $y[$i] if (defined $x[$i] && defined $y[$i]);
        $sum_x_squared += $x[$i] * $x[$i];
    }

    # Calculate beta
    my $beta;
    if ($n*$sum_x_squared-($sum_x**2) != 0){
	$beta = ($n * $sum_xy - $sum_x * $sum_y) / ($n * $sum_x_squared - $sum_x * $sum_x); 
    }else{
	die "error de 0";
    }
    # Calculate alpha
    my $alpha = ($sum_y - $beta * $sum_x) / $n;

    return ($alpha, $beta);
}

sub sum {
    my $total = 0;
    $total += $_ for @_;
    return $total;
}

sub mean {
    my $sum = 0;
    my $total = 0;
    $total = sum(@y);
    return ($total / scalar(@y));
}

sub stddev {
    my $sumd = 0;
    my $degrees = $N-1;

    for my $i (@y){
	$sumd += ($i - mean())**2;
    }
    my $sd = sqrt($sumd/$degrees);
    return $sd;
}

__END__
  So as outliers are common, it may be useful to label
  quantitative data.
  
