#! /bin/bash

# Create the input files for the hotspot process: a bed.starch file
# holding the 1bp 5' ends of the tags, the "lib" file derived from it,
# and a .counts file holding the number of lines in the lib file.
#
# NOTE(review): the _UPPERCASE_ values assigned below look like template
# placeholders that are substituted before this script is run -- confirm
# against whatever generates this file.

# NOTE(review): presumably the genome name; the value is not referenced
# anywhere else in this script.
gnom=_GENOME_

# Tags file in bam format (file extension .bam), or starched bed file
# (file extension .bed.starch).  If a file named after the tags file,
# with extension .bed.starch, already exists in the directory specified
# by _OUTDIR_, it is used as is and needs to be in the format
#
#     chr  5'start  5'start+1
#
# That is, the file should be three-column bed (no ID field, etc.)
# containing the 1bp coordinates of the 5' ends of the tags.  Otherwise
# (_TAGS_ is a bam file, or a bed.starch file not in _OUTDIR_), the
# bed.starch file in the above format is generated and put in _OUTDIR_.
# During generation, a tag whose sixth (strand) column is "-" gets its
# 5' end from the end coordinate (end - 1); every other tag uses its
# start coordinate.  When generation is needed, a tags file with any
# other extension is rejected.
#
# If an input tags file is to be used (useinput set to T), the same
# rules apply to it.
tags=_TAGS_
useinput=_USE_INPUT_
tagsInput=_INPUT_TAGS_

# Output directory.  The bed.starch tags file(s) and the hotspot lib
# file are written here.  Hotspot requires a "lib" file, not a bed
# file; the lib file is created by this script from the bed.starch tags
# file.  The name of the lib file is the name of the tags file with
# extension .bam or .bed.starch replaced by lib.filter.txt or
# lib.filter.nodup.txt, the latter being chosen if variable dupok is
# not "T".  The line count of the lib file is written next to it, in a
# file with the additional extension .counts.
libd=_OUTDIR_

# Chromosome start, stop positions.  This file is the second operand of
# "bedops -e -1", so only tags overlapping these regions reach the lib
# file.
chroms=_CHROM_FILE_

# Duplicate tags OK?  (Set to T if yes - for DNaseI data, for instance;
# anything else - for ChIP, for instance - means no.)  When duplicates
# are not OK, repeated adjacent positions are collapsed with uniq.
dupok=_DUPOK_

# Set to T to skip creating the lib file when a non-empty one already
# exists; with any other value the lib file is always recreated.
check=_CHECK_

# Script name, used as the prefix of every message printed below.
thisscr="run_make_lib"
# Print a blank line followed by the script name.
echo
echo $thisscr

# Write the 1bp 5'-end form of a tags file as a starch archive.
#
# Arguments:
#   $1 - tags file to convert; must end in .bam or .bed.starch
#   $2 - path of the bed.starch file to create
# Outputs:
#   Creates $2.  Diagnostics go to stderr, prefixed with $thisscr.
# Returns:
#   0 on success; 1 for an unsupported extension; otherwise the status
#   of the failed conversion pipeline.
#
# The result is first written to a temporary file next to $2 and only
# renamed into place when every pipeline stage succeeded, so an aborted
# run cannot leave a truncated file that the existence checks below
# would later mistake for a finished one.  The body is a subshell so
# that pipefail and the helper variables do not leak into the rest of
# the script.
tags_to_starch() (
    set -o pipefail
    src=$1
    dest=$2
    partial=$dest.tmp.$$

    # Pick the command that turns the source file into bed lines.
    case "$src" in
        *.bam)        reader=(bamToBed -i "$src") ;;
        *.bed.starch) reader=(unstarch "$src") ;;
        *)
            echo "$thisscr: $src must end in .bam or .bed.starch" >&2
            exit 1
            ;;
    esac

    # Column 6 is the strand: for "-" the 5' end is the last base of the
    # tag (end - 1), otherwise it is the start coordinate.
    if "${reader[@]}" \
        | awk 'BEGIN{OFS="\t"}{if($6 == "-") $2=$3-1; print $1, $2, $2+1}' \
        | sort-bed - \
        | starch - \
        > "$partial"
    then
        mv -f -- "$partial" "$dest"
    else
        status=$?
        rm -f -- "$partial"
        echo "$thisscr: failed to create $dest from $src" >&2
        exit "$status"
    fi
)

# Project name: the tags file name without directory and without a
# trailing .bam and/or .bed.starch (literal suffixes, not patterns).
proj=$(basename "$tags")
proj=${proj%.bam}
proj=${proj%.bed.starch}
tagsb=$libd/$proj.bed.starch

if [ ! -e "$tagsb" ]; then
    echo "$thisscr: creating bed file..."
    tags_to_starch "$tags" "$tagsb" || exit 1
fi

# Same treatment for the input tags file, when one is in use.
if [ "$useinput" == "T" ]; then
    projInput=$(basename "$tagsInput")
    projInput=${projInput%.bam}
    projInput=${projInput%.bed.starch}
    tagsInputB=$libd/$projInput.bed.starch
    if [ ! -e "$tagsInputB" ]; then
        echo "$thisscr: creating input tags bed file..."
        tags_to_starch "$tagsInput" "$tagsInputB" || exit 1
    fi
fi

# Choose the lib file name.  Only the exact value "T" allows duplicate
# tags; any other value (including an empty one) selects the
# duplicate-free variant.  All expansions are quoted so that empty
# values or paths containing spaces cannot break the tests.
if [[ "$dupok" == "T" ]]; then
    lib="$libd/$proj.lib.filter.txt"
else
    lib="$libd/$proj.lib.filter.nodup.txt"
fi

# With check set to T, a lib file that is a non-empty regular file is
# reused and the script stops here successfully.
if [[ "$check" == "T" && -f "$lib" && -s "$lib" ]]; then
    echo "$thisscr: $lib exists; skipping"
    exit 0
fi

# Build the lib file (chromosome and 5' position of every tag that
# overlaps a region in $chroms) together with its line count in
# $lib.counts.
echo "$thisscr: creating lib file..."

# When duplicates are not allowed, repeated adjacent positions are
# collapsed with uniq; otherwise the stream is passed through unchanged.
if [ "$dupok" == "T" ]; then
    dupfilter=(cat)
else
    dupfilter=(uniq -)
fi

# Both outputs are written to temporary names and renamed only after
# every pipeline stage succeeded.  Writing $lib directly would leave a
# truncated but non-empty file behind on failure, which a later run with
# check set to T would accept as complete.
lib_partial=$lib.tmp.$$
counts_partial=$lib.counts.tmp.$$

# pipefail is enabled inside a subshell only, so the rest of the script
# keeps its default pipeline semantics.
if (
    set -o pipefail
    unstarch "$tagsb" \
        | bedops -e -1 - "$chroms" \
        | awk '{print $1, $2}' - \
        | "${dupfilter[@]}" \
        | tee "$lib_partial" \
        | wc -l > "$counts_partial"
); then
    # Counts file first: once $lib exists, its counts file is in place.
    mv -f -- "$counts_partial" "$lib.counts" || exit 1
    mv -f -- "$lib_partial" "$lib" || exit 1
else
    rm -f -- "$lib_partial" "$counts_partial"
    echo "$thisscr: failed to create $lib" >&2
    exit 1
fi
