#!/usr/bin/perl -w use strict; =pod =head1 NAME B - sum up numbers from text files =head1 SYNOPSIS B [I] [I ...] =head1 DESCRIPTION B is a more sophisticated replacement for Suso Banderas' B. It adds up numbers embedded in text files, discarding all non-numerical data. In particular it can add up tabulated ASCII numerical data separated by commas or white space. B processes the files given on the command line, or standard input if the first file argument is "-" or none is given. The options B<-f>, B<-g> and B<-i> determine which kind of numbers B looks for - floating point in decimal and optionally exponential notation or just integers. Most other options control its output - just the total, or row and/or column sums and optionally partial sums. The B<-s> option gives you mean and standard deviation instead of sums. The current version uses compensated (Kahan) summation for sums and the Knuth/Welford algorithm for computing the mean and standard deviation to minimise numerical errors. =head1 OPTIONS =over =item B<-a> Print requested sums for all files separately, instead of a grand total at the end. This applies to totals (B<-t>) and column sums (B<-c>). It has no effect when reading from stdin. B<+a> disables this feature. =item B<-c> Print column sums, separated by tabulator characters. B<+c> disables this feature. =item B<-f> Sum up floating-point numbers in decimal (not exponential) notation. "e" and "E" are regarded as part of record separators. =item B<-g> Sum up floating-point numbers in decimal or exponential notation. This is the default. =item B<-i> Sum up integers. Period characters are taken as part of record separators, not decimal points. =item B<-I> Print input data, without chaff but in its original format. If B<-p> is also given, input columns and cumulative sums alternate, starting with input data. Columns which are not present in a line but for which partial sums exist are printed as 0 to keep the alignment predictable, but do not count as a zero datum towards the mean for B<-s>. B<+I> disables this feature. =item B<-p> Print partial column sums as columns are added up. If B<-a> is given, the partial sums are file by file, otherwise they are cumulative across files. If B<-s> is given, the mean and standard deviation of each column's data so far is printed instead of the partial sum. B<+p> disables this feature. =item B<-r> Print row sums. For rows which do not contain any numbers, an empty line (not 0) is output. If B<-p> is also given, the row sum is the last column output (or for B<-s>, the last two columns), after the partial column sums. B<+r> disables this feature. =item B<-s> Instead of each sum, print mean and standard deviation. Affects all of B<-c>, B<-p>, B<-r> and B<-t>. Two values will be printed everywhere in place of the single sum. B<+s> disables this feature. =item B<-t> Print overall total. This is enabled by default. B<+t> disables it. =back =head1 FREQUENTLY ANTICIPATED QUESTIONS (FAQ) =over =item Why can't I set the record separator? Because B ignores all non-numerical data anyway, this is superfluous. =item Why can't I tell B to print specific row/column sums only? Because you can use B to pick out rows and B or B to select columns. =item Why can't B print the integer/fractional part of its sum as B can? Because B can do that after B has printed the sum. (C resp. C<...% 1>) =item How can I select the complete contiguous output of B after pasting something into its stdin? Use B by Simon Tatham, L. =back You get the drift. =head1 SEE ALSO B(1), B(1), B(1), B(1), B(1) =head1 COPYLEFT B is Copyright (c) 2009-2010 Volker Schatz. It may be copied and/or modified under the same terms as Perl. =cut my $numopts= qr/[ifg]/; my $modeopts= qr/[acIprst]/; my $numfmt= "g"; my %mode= ( "t" => 1 ); while( @ARGV && $ARGV[0] =~ /^[-+]./ ) { my $opt= shift @ARGV; while( length($opt) > 1 ) { if( $opt =~ /^-($numopts)/o ) { $numfmt= $1; } elsif( $opt =~ /^-($modeopts)/o ) { $mode{$1}= 1; } elsif( $opt =~ /^\+($modeopts)/o ) { $mode{$1}= 0; } else { print STDERR "nsum: Unknown option ", substr($opt,0,2), ".\n" unless $opt =~ /^-h/; print STDERR < nsum.1 EOF exit 1; } $opt =~ s/^-./-/; $opt =~ s/^\+./\+/; } } if( ! grep $_ =~ /^[cCIprt]$/ && $mode{$_}, keys %mode ) { print STDERR "nsum: All sums disabled, nothing to do.\n"; exit 1; } # Shortcut for deciding if any sum is to be computed $mode{"+"}= grep $_ =~ /^[cprt]$/ && $mode{$_}, keys %mode; # Shortcut for deciding if anything is to be printed in each row $mode{"-"}= grep $_ =~ /^[Ipr]$/ && $mode{$_}, keys %mode; # Output field separator my $sep= "\t"; # Nomenclature: # Without -s: *sum sum of values, *aux error compensation variable # With -s: *sum mean of values, *aux auxiliary sum for computing variance, *num # number of values added up # Grand totals: my $tsum= 0; my $taux= 0; my $tnum= 0; # Column sums: my @csums; my @cauxs; my @cnums; my %numregexes= ( "i" => qr/-?\d+/, "f" => qr/-?(?:\d+(?:\.\d*)?|\.\d+)/, "g" => qr/-?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?/i ); my $nre= $numregexes{$numfmt}; # Add a new value to a sum using compensated (Kahan) summation. # -> New value # Reference to sum # Reference to error carry sub addtosum { my ($val, $sum, $delta)= @_; $val -= $$delta; my $oldsum= $$sum; $$sum += $val; $$delta= ($$sum - $oldsum) - $val; } # Add another value to the running statistical variables using the algorithm by # Knuth and Welford. # -> New value # Reference to number of values # Reference to mean of values # Reference to auxiliary variable for variance sub addtostat { my ($val, $mean, $m2, $num)= @_; ++$$num; my $delta= $val - $$mean; $$mean += $delta / $$num; $$m2 += $delta * ($val - $$mean); } # Add the elements of one array to another, extending the destination as # needed. If -s is active, the source values are added to the three running # statistical variable arrays. # -> Reference to source array # Reference to destination array / array of means (for -s) # Reference to array of variables for error compensation / variance (if -s) # Reference to array containing counts (if -s is active) sub addary { my ($src, $dest, $aux, $cnt)= @_; if( $#$src > $#$dest ) { my $news= $#$src - $#$dest; push @$dest, (0) x $news; push @$aux, (0) x $news; push @$cnt, (0) x $news; } if( $mode{"s"} ) { for my $ind (0..$#$src) { addtostat $$src[$ind], \$$dest[$ind], \$$aux[$ind], \$$cnt[$ind]; } } else { for my $ind (0..$#$src) { addtosum $$src[$ind], \$$dest[$ind], \$$aux[$ind]; } } } # Print sum or (for -s) mean and standard deviation, separated by space. # -> Sum of values / mean of values (if -s is active) # Auxiliary variable for error compensation / variance (if -s is active) # Number of values (used if -s is active) sub printsum { my ($sum, $aux, $num)= @_; if( $mode{"s"} ) { print $sum, $sep, sqrt($aux/$num); } else { print $sum-$aux; } } # Print output data for one row. sub printrow { my ($in, $csums, $cauxs, $cnums, $rsum, $raux, $rnum)= @_; my $nextsep= ""; if( defined($csums) ) { for my $ind (0..$#$csums) { if( defined($in) ) { print $$in[$ind] || 0, $sep; } printsum $$csums[$ind], $$cauxs[$ind], $$cnums[$ind]; print $sep unless $ind == $#$csums; } $nextsep= $sep; } elsif( defined($in) ) { print join($sep, @$in); $nextsep= $sep; } if( defined($rsum) ) { print $nextsep; printsum $rsum, $raux, $rnum; } print "\n"; } # Process one file. # -> Reference to IO handle sub procfile { my ($IN)= @_; # File totals: my $fsum= 0; my $faux= 0; my $fnum= 0; my $fsumref= $mode{"a"}? \$fsum : \$tsum; my $fauxref= $mode{"a"}? \$faux : \$taux; my $fnumref= $mode{"a"}? \$fnum : \$tnum; # File column sums: my $csumref= $mode{"a"}? [] : \@csums; my $cauxref= $mode{"a"}? [] : \@cauxs; my $cnumref= $mode{"a"}? [] : \@cnums; my $addvalue= $mode{"s"}? \&addtostat : \&addtosum; while( <$IN> ) { my @nums= /.*?($nre)/og; # Row sums: my $rsum= 0; my $raux= 0; my $rnum= 0; if( @nums ) { if( $mode{"c"} || $mode{"p"} ) { addary \@nums, $csumref, $cauxref, $cnumref; } if( $mode{"r"} ) { for (@nums) { &$addvalue($_, \$rsum, \$raux, \$rnum); } } if( $mode{"t"} ) { for (@nums) { &$addvalue($_, $fsumref, $fauxref, $fnumref); } } printrow $mode{"I"}? \@nums : undef, $mode{"p"}? $csumref : undef, $cauxref, $cnumref, $mode{"r"}? $rsum : undef, $raux, $rnum if $mode{"-"}; } elsif( $mode{"-"} ) { print "\n"; } } if( $mode{"a"} ) { printrow undef, $csumref, $cauxref, $cnumref if $mode{"c"}; if( $mode{"t"} ) { printsum $fsum, $faux, $fnum; print "\n"; } } } if( !@ARGV || $ARGV[0] eq "-" ) { $mode{"a"}= 0; procfile *STDIN{IO}; } else { for (@ARGV) { open( my $hdl, $_ ); unless( $hdl ) { print STDERR "nsum: Cannot open `$_'. Skipping.\n"; next; } procfile $hdl; close $hdl; } } unless( $mode{"a"} ) { printrow undef, \@csums, \@cauxs, \@cnums if $mode{"c"}; if( $mode{"t"} ) { printsum $tsum, $taux, $tnum; print "\n"; } }