tblmerge

#!/usr/bin/perl
# tblmerge: join CSV tables
# Copyright(c) 2008 EURAC, Institute of Genetic Medicine
use strict;
use warnings;
use locale;
use Getopt::Std;

# Command line state shared by the whole script:
#   %flags          - options parsed by getopts (described in the usage text)
#   $SEP            - column separator: the TBLSEP environment variable, or
#                     a TAB when it is unset or empty
#   $mergeOn        - label of the column the tables are joined on
#   $file1, $file2  - the two table files
my %flags;
my $optsOk = getopts("t12L:C:E:vkig:nch", \%flags);
my $SEP = ($ENV{'TBLSEP'} || "\t");
my ($mergeOn, $file1, $file2) = @ARGV;

# Print the usage and exit with status 2 when help is requested (-h), when
# getopts rejected an option, or when a positional argument is missing.
# Arguments are tested for definedness/emptiness rather than truth, so that
# a key column or a file literally named "0" is accepted.
if(!$optsOk || defined($flags{'h'})
|| grep({ !defined($_) || $_ eq "" } ($mergeOn, $file1, $file2)))
{
  print(STDERR qq{$0 [-t12LCEivkgnc] key file1 file2:
Perform a (full) join on 'key' of two CSV files, performing a comparison of
common columns. CSV files are TAB separated, containing column labels on the
first row. You can change the column separator by setting the TBLSEP
environment variable.

  -c:		common records only, all records otherwise
  -t:		strip spaces from cells
  -n:		normalize all numbers
  -1:		use first file as reference for different values
  -2:		use second file as reference
  -g N:		guess column pairs by comparing N cells, then exit
  -v:		verbose (show duplicate references anyway)
  -k:		keep going as far as possible on errors conditions
  -i:		ignore empty cells for comparisons
  -L expr:	run Perl 'expr' expression on each label
  -C expr:	run Perl 'expr' expression on each cell
  -E expr:	run Perl 'expr' expression on all cells and labels
  -h:		help summary
});
  exit(2);
}

# Strip leading and trailing white space from a cell, in place.
#
# The cell is modified through its alias in @_, so the sub works on $_ as
# well as on any other scalar variable passed as argument.
# The return value is not meaningful.
sub stripSpaces($)
{
  # alias the argument to $_ for the substitutions below
  for($_[0])
  {
    s/\s+$//;
    s/^\s+//;
  }
  return;
}

# Rewrite a numeric cell into a canonical form, in place, so that equal
# numbers written differently compare equal as strings.
#
# Only cells made of an optional sign, digits with an optional decimal point
# and an optional trailing '%' are touched; anything else is left alone.
# Examples: " 007 " => "7", "00.5" => "0.5", ".5" => "0.5",
# "1.50%" => "1.5%", "3.0" => "3", "0.00" => "0".
#
# The cell is modified through its alias in @_, so the sub works on $_ as
# well as on any other scalar variable passed as argument.
# The return value is not meaningful.
sub normNumbers($)
{
  # alias the argument to $_ for the substitutions below
  for($_[0])
  {
    # strip spaces (this match also decides whether the cell is a number)
    if(s/^\s*([+-]|)\s*([0-9]*\.?[0-9]*)\s*(%|)\s*$/$1$2$3/)
    {
      # remove leading zeros, keeping the last digit of the integer part
      s/^([+-]|)0+(?=[0-9])/$1/;

      # .x => 0.x
      s/^([+-]|)\./${1}0./;

      # 0.0 => 0
      s/^([+-]|)0+(?:\.0*)?(%|)$/${1}0$2/;

      # trailing zeros
      s/(\.[0-9]*[1-9]+)0+(%|)$/$1$2/;

      # x.0 => x
      s/\.0*(%|)$/$1/;
    }
  }
  return;
}

# Split one line of a table into cells and post-process them.
#
# The line is split on the separator $SEP, taken as a literal string, and
# trailing empty cells are kept. Depending on the command line each cell is
# then stripped (-t), normalized as a number (-n) and run through the user
# expression (-E), in this order. The user expression sees the cell as $_.
#
# Returns the list of cells.
sub fetchRow($)
{
  my ($line) = @_;

  # quote the separator: "|" or "." must not be interpreted as a pattern
  my @cols = split(/\Q$SEP\E/, $line, -1);

  if(defined($flags{'t'})) { stripSpaces($_) foreach(@cols); }
  if(defined($flags{'n'})) { normNumbers($_) foreach(@cols); }

  # NOTE(review): -E is Perl code taken verbatim from the command line, and
  # errors raised while evaluating it ($@) are not checked here.
  if(defined($flags{'E'})) { eval($flags{'E'}) foreach(@cols); }

  return @cols;
}

# Tell whether a cell is empty: undefined, or a false value that is not
# made of digits only (so that "0" counts as a value).
# The cell is received by reference because of the (\$) prototype.
sub isempty(\$)
{
  my ($cell) = @_;
  return !0 unless(defined($$cell));
  return !1 if($$cell);
  return !($$cell =~ /^\d+$/);
}

# Return the value of a cell, or the empty string when the cell is
# undefined or holds a false value that is not made of digits only.
# The cell is received by reference because of the (\$) prototype.
sub notundef(\$)
{
  my ($cell) = @_;
  my $value = $$cell;
  return "" unless(defined($value) && ($value || $value =~ /^\d+$/));
  return $value;
}

# Return the first value, or the second one when the first is empty
# according to isempty(). Both values are received by reference because of
# the (\$\$) prototype.
sub aorb(\$\$)
{
  my ($first, $second) = @_;
  return $$second if(isempty($$first));
  return $$first;
}

# Load a table from a file into a table hash.
#
# Parameters:
#   $tbl  - reference to the hash to fill
#   $file - name of the file to read, taken literally (no pipes or mode
#           characters), or "-" for standard input
#
# The first line holds the column labels (optionally rewritten by the -L
# expression); every following line is a record (cells optionally rewritten
# by the -C expression). On return $tbl contains:
#   'cols'   - the distinct column labels, in order of first appearance
#   'data'   - hash of label => array of cell values, one per record
#   'height' - number of records
#
# Columns sharing a label are reduced to one. Their cells must agree,
# otherwise the difference is reported and the script dies, unless -k is
# given, in which case the first value is kept. With -i an empty cell never
# conflicts with a non-empty one. With -v agreeing duplicates are reported
# as well.
#
# Dies when the file cannot be opened, when it has no header line, or when
# a record does not have as many cells as the header.
sub loadTbl($$)
{
  my ($tbl, $file) = @_;

  # the read loops below use $_: do not clobber the caller's copy
  local $_;

  my $fd;
  if($file eq '-')
  { $fd = \*STDIN; }
  else
  { open($fd, '<', $file) or die("cannot open $file: $!\n"); }

  # headers
  $_ = <$fd>;
  defined($_) or die("empty table in $file\n");
  s/[\r\n]+$//;
  my @labels = fetchRow($_);
  if(defined($flags{'L'})) { eval($flags{'L'}) foreach(@labels); }

  # distinct labels in header order; every column gets its (possibly empty)
  # data array so that a table without records still exposes its columns
  my %seen;
  my @cols = grep({ !$seen{$_}++ } @labels);
  $tbl->{'data'}{$_} = [] foreach(@cols);

  my $l = 0;
  while(<$fd>)
  {
    s/[\r\n]+$//;
    my @data = fetchRow($_);
    die("line error at $file:$.:$#data") if($#data != $#labels);
    if(defined($flags{'C'})) { eval($flags{'C'}) foreach(@data); }

    my $n = 0;
    foreach my $col(@labels)
    {
      if(!defined($tbl->{'data'}{$col}[$l])
      || (defined($flags{'i'}) && isempty($tbl->{'data'}{$col}[$l])))
      {
        # first cell for this label, or replacement of an ignored empty one
        $tbl->{'data'}{$col}[$l] = notundef($data[$n]);
      }
      elsif(defined($flags{'i'}) && isempty($data[$n]))
      {
        # repeated label with an empty cell: keep the value already stored
      }
      else
      {
        my $cond = ($tbl->{'data'}{$col}[$l] ne $data[$n]);
        if($cond || defined($flags{'v'}))
        {
          print(STDERR "column $col: differing values while performing column reduction\n" .
                "  old: $tbl->{'data'}{$col}[$l]\n" .
                "  new: $data[$n]\n");
          die("cannot continue") if($cond && !defined($flags{'k'}));
        }
      }
      ++$n;
    }
    ++$l;
  }
  $tbl->{'height'} = $l;
  $tbl->{'cols'} = [@cols];

  close($fd) unless($file eq '-');
}

# Return the cells of record $l of a table as a list, following the order
# of the table's 'cols' array.
sub getRow($$)
{
  my ($tbl, $l) = @_;
  my @cells = map({ $tbl->{'data'}{$_}[$l] } @{$tbl->{'cols'}});
  return @cells;
}

# Index a table by the values of one column.
#
# Parameters:
#   $tbl   - reference to a loaded table hash
#   $tblNo - table number, only used in messages
#   $col   - label of the column to index
#
# Fills $tbl->{'map'} with value => index of the first record holding that
# value. When a value shows up again and the two records differ, both are
# reported and the script dies unless -k is given. Identical duplicates are
# accepted silently, or reported too with -v.
sub genMap($$$)
{
  my ($tbl, $tblNo, $col) = @_;

  # make sure the map exists even when the table has no records
  $tbl->{'map'} ||= {};

  foreach my $l(0 .. ($tbl->{'height'} - 1))
  {
    # index on the column received as parameter
    my $mv = $tbl->{'data'}{$col}[$l];
    if(!defined($tbl->{'map'}{$mv}))
    { $tbl->{'map'}{$mv} = $l; }
    else
    {
      my $ol = $tbl->{'map'}{$mv};
      my $oline = join($SEP, getRow($tbl, $ol));
      my $line = join($SEP, getRow($tbl, $l));
      my $cond = ($oline ne $line);
      if($cond || defined($flags{'v'}))
      {
        print(STDERR "duplicate $col in table $tblNo: $mv (old position: $ol, new: $l)\n");
        print(STDERR "  old: $oline\n");
        print(STDERR "  new: $line\n");
        die("cannot continue") if($cond && !defined($flags{'k'}));
      }
    }
  }
}

# Append record $l of table $tbl to the result table $res.
# Every result column is filled; cells the source table does not provide
# become empty strings (see notundef). The result height grows by one.
sub pushSimpleRecord($$$)
{
  my ($res, $tbl, $l) = @_;
  my $row = $res->{'height'};

  $res->{'data'}{$_}[$row] = notundef($tbl->{'data'}{$_}[$l])
    foreach(@{$res->{'cols'}});

  ++$res->{'height'};
}

# Append to the result table $res the merge of record $l1 of $tbl1 and
# record $l2 of $tbl2.
#
# Columns not listed in $cmp are not compared: the cell of table 1 is taken
# unless it is empty, the cell of table 2 otherwise. The same rule applies
# to compared columns when -i is given and one of the two cells is empty.
# For the remaining columns the two cells must be equal; a difference is
# reported and settled in favour of table 1 (-1) or table 2 (-2), and the
# script dies when neither flag is given.
sub pushSharedRecord($$$$$$)
{
  my ($res, $cmp, $tbl1, $l1, $tbl2, $l2) = @_;
  my $row = $res->{'height'};

  foreach my $col(@{$res->{'cols'}})
  {
    # is this column exempt from the comparison?
    my $free = !defined($cmp->{$col});
    $free ||= (isempty($tbl1->{'data'}{$col}[$l1])
            || isempty($tbl2->{'data'}{$col}[$l2]))
      if(defined($flags{'i'}));

    if($free)
    {
      my $left = notundef($tbl1->{'data'}{$col}[$l1]);
      my $right = notundef($tbl2->{'data'}{$col}[$l2]);
      $res->{'data'}{$col}[$row] = aorb($left, $right);
      next;
    }

    # compared column: table 1 provides the cell unless told otherwise
    my ($src, $line) = ($tbl1, $l1);
    if($tbl1->{'data'}{$col}[$l1] ne $tbl2->{'data'}{$col}[$l2])
    {
      print(STDERR "difference in $tbl1->{'data'}{$mergeOn}[$l1]/$col: " .
            "(1:$l1:$tbl1->{'data'}{$col}[$l1])" .
            "\t(2:$l2:$tbl2->{'data'}{$col}[$l2])\n");

      die("cannot continue")
        if(!defined($flags{'1'}) && !defined($flags{'2'}));
      ($src, $line) = ($tbl2, $l2) if(!defined($flags{'1'}));
    }
    $res->{'data'}{$col}[$row] = notundef($src->{'data'}{$col}[$line]);
  }

  ++$res->{'height'};
}

# Fill the result table with the join of two indexed tables.
#
# Parameters:
#   $res      - reference to the result table hash ('cols' and 'height' set)
#   $cmp      - reference to the hash of column labels to compare
#   $mergeOn  - label of the column the tables are joined on
#   $tbl1/2   - references to the two loaded and indexed tables
#
# Records are emitted in a stable order: first the records of table 1, in
# their input order (merged with their match in table 2 when there is one),
# then the records only found in table 2, in their input order. With -c
# only the records found in both tables are emitted.
#
# Returns a list of [key value, table number] pairs, one for each record
# found in a single table and emitted.
sub buildRecords($$$$$)
{
  my ($res, $cmp, $mergeOn, $tbl1, $tbl2) = @_;
  my @uindex;

  # indexed record numbers, sorted so that the output follows the input
  # order instead of the arbitrary order of the key maps
  my @rows1 = sort({ $a <=> $b } values(%{$tbl1->{'map'} || {}}));
  my @rows2 = sort({ $a <=> $b } values(%{$tbl2->{'map'} || {}}));

  # all records of table 1, merged with table 2 when the key is shared
  foreach my $l(@rows1)
  {
    my $mv = $tbl1->{'data'}{$mergeOn}[$l];
    if(!defined($tbl2->{'map'}{$mv}))
    {
      if(!defined($flags{'c'}))
      {
        push(@uindex, [$mv, 1]);
        pushSimpleRecord($res, $tbl1, $l);
      }
    }
    else
    {
      # matching map
      my $l2 = $tbl2->{'map'}{$mv};
      pushSharedRecord($res, $cmp, $tbl1, $l, $tbl2, $l2);
    }
  }

  # records only found in table 2
  if(!defined($flags{'c'}))
  {
    foreach my $l(@rows2)
    {
      my $mv = $tbl2->{'data'}{$mergeOn}[$l];
      if(!defined($tbl1->{'map'}{$mv}))
      {
        push(@uindex, [$mv, 2]);
        pushSimpleRecord($res, $tbl2, $l);
      }
    }
  }

  return @uindex;
}

# Return the smaller of two numbers (the first one when they are equal).
sub min($$)
{
  my ($x, $y) = @_;
  return $x if($x <= $y);
  return $y;
}

# Look for columns of the two tables that carry different labels but mostly
# equal values, and print the findings on STDERR.
#
# Parameters:
#   $records - number of records to compare; when false or larger than one
#              of the tables, the height of the smaller table is used
#   $tbl1/2  - references to the two loaded and indexed tables
#
# Record $l of table 1 is compared with the record of table 2 indexed under
# the same merge key, or with record $l of table 2 when the key is unknown
# there. A pair of labels is reported when more than half of the compared
# cells are equal, together with a label expression built from the reported
# pairs. Pairs are reported in sorted label order.
sub guessColumnPairs($$$)
{
  my ($records, $tbl1, $tbl2) = @_;
  my %stats;

  if(!$records
  || $records > $tbl1->{'height'}
  || $records > $tbl2->{'height'})
  {
    $records = min($tbl1->{'height'}, $tbl2->{'height'});
    print(STDERR "guessing on $records records\n");
  }

  foreach my $col1(@{$tbl1->{'cols'}})
  {
    foreach my $col2(@{$tbl2->{'cols'}})
    {
      next if($col1 eq $col2);

      # a pair is counted once, whatever table each label comes from
      my @id = sort(($col1, $col2));
      next if(defined($stats{$id[0]}{$id[1]}));

      foreach my $l(0 .. ($records - 1))
      {
        # record 0 is a valid match: test for definedness, not for truth
        my $rl = $tbl2->{'map'}{$tbl1->{'data'}{$mergeOn}[$l]};
        $rl = $l if(!defined($rl));
        if($tbl1->{'data'}{$col1}[$l] eq $tbl2->{'data'}{$col2}[$rl])
        { ++$stats{$id[0]}{$id[1]}; }
      }
    }
  }

  my @expr;
  foreach my $col1(sort(keys(%stats)))
  {
    foreach my $col2(sort(keys(%{$stats{$col1}})))
    {
      if($stats{$col1}{$col2} > $records / 2)
      {
        my $pc = int($stats{$col1}{$col2} * 100 / $records);
        print(STDERR "possible column match: $col1 = $col2 ($pc% confidence)\n");
        push(@expr, "s/^\Q$col2\E\$/\Q$col1\E/;");
      }
    }
  }

  print(STDERR "labels expression: " . join(' ', @expr) . "\n") if(@expr);
}

# Return the position of the first element of an array that is string-equal
# to $elem, or -1 when there is none. The array is received by reference
# because of the (\@$) prototype.
sub find(\@$)
{
  my ($arr, $elem) = @_;
  my $pos = 0;
  foreach my $item(@{$arr})
  {
    return $pos if($item eq $elem);
    ++$pos;
  }
  return -1;
}


# load both tables, in order, and index each of them on the merge column
my (%tbl1, %tbl2);
foreach my $job([1, \%tbl1, $file1], [2, \%tbl2, $file2])
{
  my ($no, $tbl, $file) = @{$job};
  loadTbl($tbl, $file);
  defined($tbl->{'data'}{$mergeOn}) or die("column $mergeOn not available in table $no\n");
  genMap($tbl, $no, $mergeOn);
}

# -g: only report the likely column aliases, no merge is performed
if(defined($flags{'g'}))
{
  guessColumnPairs($flags{'g'}, \%tbl1, \%tbl2);
  exit(0);
}

# columns found in both tables: these are the ones compared while merging
my %inTbl2;
$inTbl2{$_} = 1 foreach(@{$tbl2{'cols'}});
my %cmp;
foreach my $c(@{$tbl1{'cols'}})
{
  $cmp{$c} = 1 if($inTbl2{$c});
}

# result table: the columns of table 1 followed by the new ones of table 2
my %res = ('height' => 0, 'cols' => []);
my %known;
foreach my $c(@{$tbl1{'cols'}}, @{$tbl2{'cols'}})
{
  push(@{$res{'cols'}}, $c) unless($known{$c}++);
}

# stats
my @cmpArr = keys(%cmp);
my $joinKind = (defined($flags{'c'})? "inner": "full");
print(STDERR "$joinKind join on $mergeOn\n");
print(STDERR "joining $tbl1{'height'} + $tbl2{'height'} rows\n");
print(STDERR "shared columns: " . join(' ', @cmpArr) . "\n");

# columns found in one table only
my @uniq = grep({ !exists($cmp{$_}) } @{$res{'cols'}});
print(STDERR "unique columns: " . join(' ', @uniq) . "\n");

# build the resulting table
my @uindex = buildRecords(\%res, \%cmp, $mergeOn, \%tbl1, \%tbl2);
print(STDERR "resulting rows: $res{'height'}\n");
print(STDERR "resulting columns:\n  " . join("\n  ", sort({ lc($a) cmp lc($b)} @{$res{'cols'}})) . "\n");

# more stats: keys found in one table only, sorted ignoring case
if(@uindex)
{
  print(STDERR "unique indexes:\n");
  @uindex = sort({ lc($a->[0]) cmp lc($b->[0]) } @uindex);
  print(STDERR "  $_->[0] (table $_->[1])\n") foreach(@uindex);
}

# output table: the labels first, then one line per record
print(join($SEP, @{$res{'cols'}}) . "\n");
foreach my $row(0 .. ($res{'height'} - 1))
{
  print(join($SEP, getRow(\%res, $row)) . "\n");
}