#!/usr/bin/perl -w
# FILE: find-nearest-tx
# AUTHOR: William Stafford Noble, Scott Kuehn
# CREATE DATE: 11/11/2004. Modified, 1/31/2006 by S. Kuehn.
use strict;

# Message printed when the script is invoked with the wrong argument count.
my $usage = "USAGE: featDistance <BED> <gene track>\n";

# Tables shared by the rest of the script.  @ids lists the entry IDs in the
# order they were read; the hashes are keyed by those IDs.
my (@ids, %chrs, %starts, %ends, %tx_starts);

# Exactly two arguments are required: the BED file and the gene track file.
unless (@ARGV == 2) {
  print STDERR $usage;
  exit(1);
}
my $bed_file  = $ARGV[0];
my $gene_file = $ARGV[1];

# Read the BED file into memory.
# The three-argument open with a lexical handle keeps characters such as
# ">" or "|" in the file name from being interpreted as an open mode.
open(my $bed_fh, "<", $bed_file) || die("Can't open $bed_file: $!");
my $lines = 0;

# Read it line by line.
while (my $line = <$bed_fh>) {
  chomp($line);

  # Count every physical line, including skipped ones, so that the suffix
  # appended to each ID below is the line's position in the file.
  $lines++;

  # Blank lines carry no entry.
  next if ($line =~ m/^\s*$/);

  # Parse the line.
  my($chr, $start, $end, $id) = split("\t", $line);

  # An entry needs at least a chromosome, a start and an end.  Lines without
  # them (for example a header line with no tabs) are reported and skipped
  # rather than stored as entries with undefined coordinates.
  if (!defined($end)) {
    print(STDERR "Skipping line $lines of $bed_file: ",
          "fewer than 3 tab-separated fields.\n");
    next;
  }

  # The name column may be absent; treat it as an empty name.
  $id = "" if (!defined($id));

  # If the chromosome ID is missing a "chr," add it on.
  if (substr($chr, 0, 3) ne "chr") {
    $chr = "chr$chr";
  }

  # Add a unique line number delimiter, so that repeated names in the BED
  # file still yield distinct hash keys.
  $id = $id . "-" . $lines;

  # Store the starts and ends.
  push(@ids, $id);
  $chrs{$id} = $chr;
  $starts{$id} = $start;
  $ends{$id} = $end;

  # Placeholder that stays in place until a gene on this chromosome is seen.
  $tx_starts{$id} = "No-data-" . $chr;
}
close($bed_fh);
printf(STDERR "Found %d BED entries.\n", scalar(@ids));

# Open the gene file.
# The list form of a piped open runs gunzip directly, without a shell, so
# spaces or shell metacharacters in the file name are passed through intact.
open(my $gene_fh, "-|", "gunzip", "-c", $gene_file)
  || die("Can't open $gene_file: $!");

# Group the entry IDs by chromosome so that each gene is compared only with
# the entries on its own chromosome.  Input order is kept within each group,
# so ties are resolved exactly as in a scan over all of @ids.
my %ids_by_chr;
foreach my $id (@ids) {
  push(@{$ids_by_chr{$chrs{$id}}}, $id);
}

# Read it line by line.
my $num_genes = 0;
while (my $line = <$gene_fh>) {
  chomp($line);

  # Every line read is counted, whether or not it can be used below.
  $num_genes++;

  # Parse the line by tabs.  The first column (the gene ID) is not needed.
  my(undef, $chr, $strand, $tx_left, $tx_right) = split("\t", $line);
  next if (!defined($chr) || !defined($strand));

  # Choose the transcription start site based on the strandedness.
  my $tx_start = ($strand eq "-") ? $tx_right : $tx_left;
  next if (!defined($tx_start));

  # Consider each entry on this gene's chromosome.
  my $candidates = $ids_by_chr{$chr};
  next if (!defined($candidates));
  foreach my $id (@{$candidates}) {

    # Signed distance from the transcription start to the entry's start.
    my $tx_distance = $starts{$id} - $tx_start;

    # Keep the distance if it is the first one seen for this entry or is
    # strictly closer than the best one so far.
    if (($tx_starts{$id} =~ m/^No-data/) ||
        (abs($tx_distance) < abs($tx_starts{$id}))) {

      # Store in the appropriate location.
      $tx_starts{$id} = $tx_distance;
    }
  }
}

# A failure inside gunzip (missing or corrupt file) only becomes visible when
# the pipe is closed, so the result of close must be checked.
if (!close($gene_fh)) {
  # $! is non-zero only when the close itself failed; otherwise $? holds
  # gunzip's wait status.
  die("Error closing pipe from gunzip for $gene_file: $!") if ($!);
  if (($? >> 8) == 2) {
    # gzip documents exit status 2 as "warning"; the data was still read.
    print(STDERR "gunzip reported a warning for $gene_file.\n");
  } else {
    die("gunzip failed on $gene_file (wait status $?).");
  }
}
print(STDERR "Found $num_genes genes.\n");

# Print one tab-separated line per BED entry, in input order: chromosome,
# start, end, original ID, and the signed distance (entry start minus
# transcription start) to the nearest transcription start site.
foreach my $id (@ids) {

  # Every entry starts out with a "No-data-<chr>" placeholder, so a defined()
  # test can never fire; an entry that still holds the placeholder is one for
  # which no transcription start was found.
  if ($tx_starts{$id} =~ m/^No-data/) {
    print(STDERR "Can't find tx_start for $id.\n");
  }

  # Remove the unique line-number delimiter, working on a copy so the hash
  # key itself is left untouched.
  (my $name = $id) =~ s/-\d+$//;
  printf("%s\t%d\t%d\t%s\t%s\n",
         $chrs{$id}, $starts{$id}, $ends{$id}, $name, $tx_starts{$id});
}
