#!/usr/bin/perl use 5.040; my $DEBUG = 0; # A la versión 3 va la vencida :-) # Example usage: my $N = 200; my @x; @x = 0 .. $N-1; my @y; my $cc = 0; $y[$cc++] = $_ * rand() for (@x); my ($alpha, $beta); ($alpha, $beta) = simple_linear_regression(\@x, \@y); say "alpha = $alpha beta = $beta"; my $new_x = $N; my $predicted_y = $alpha + $beta * $new_x; # confianza 0.95; # que equivale 1.96 desviaciones my $confidence_interval = 1.96 * stddev(); # * $variance say "Confidence interval = + or - $confidence_interval"; say "Interval for x=$new_x: [",$predicted_y - $confidence_interval, " , ", $predicted_y + $confidence_interval,"]"; # my $str = ; my $c = 0; my ($ok, $ko,$ko1, $ko2) = (0,0,0,0); my $isoutlier; while (++$c < $N * 10){ my $new_y = $N * rand(); # Example value for the new data point's y-value # my $is_outlier = evaluate_new_data($alpha, $beta, $new_x, $new_y); if (abs($predicted_y - $new_y) > $confidence_interval) { $isoutlier = 1; # The new data point is an outlier }else{ $isoutlier = 0; } if ($isoutlier) { $ko++; print "$c -> $ko) The new data point (x = $new_x, y = $new_y) is an outlier at a 95% confidence level. "; if ($new_y > $predicted_y){ say "Excess"; $ko1++; }else{ say "Undervalue"; $ko2++; } } else { $ok++; # say "$c -> $ok) The new data point (x = $new_x, y = $new_y) is NOT an outlier at a 95% confidence level."; } } say "\n$ok counts on the interval, $ko outliers, ",100*$ko/($ok+$ko), "%"; say "$ko1 are exceding the interval, $ko2 lowering"; exit 3; sub simple_linear_regression { my @x = @{$_[0]}; my @y = @{$_[1]}; my $n = scalar(@x); # Calculate sums my $sum_x = sum(@x); my $sum_y = sum(@y); my $sum_xy = 0; my $sum_x_squared = 0; for (my $i = 0; $i < $n; $i++) { $sum_xy += $x[$i] * $y[$i] if (defined $x[$i] && defined $y[$i]); $sum_x_squared += $x[$i] * $x[$i]; } # Calculate beta my $beta; if ($n*$sum_x_squared-($sum_x**2) != 0){ $beta = ($n * $sum_xy - $sum_x * $sum_y) / ($n * $sum_x_squared - $sum_x * $sum_x); }else{ die "error de 0"; } # Calculate alpha my $alpha = ($sum_y - $beta * $sum_x) / $n; return ($alpha, $beta); } sub sum { my $total = 0; $total += $_ for @_; return $total; } sub mean { my $sum = 0; my $total = 0; $total = sum(@y); return ($total / scalar(@y)); } sub stddev { my $sumd = 0; my $degrees = $N-1; for my $i (@y){ $sumd += ($i - mean())**2; } my $sd = sqrt($sumd/$degrees); return $sd; } __END__ So as outliers are common, it may be useful to label quantitative data.