#!/bin/bash

#===============================================================================
# The Broad Institute
# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright 2012, 2013 by the
# Broad Institute/Massachusetts Institute of Technology. All rights reserved.
#
# This software is supplied without any warranty or guaranteed support whatsoever.
# Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
# functionality.
# 
# @author: Michael S. Noble
# @date:   Feb 15, 2013
#===============================================================================

#  Front matter: env setup, command line args, etc {{{

# Extended regex of downloader chatter (URLs, lengths, "saved" notices, blank
# lines, ...) that the default $Capture filter removes from command output.
FilterString="^http|^Length|saved|html|^$|robots|resolv|connect"

# Shell fragment appended by Do() to each command line before it is eval-ed.
# 'grep -E' is used rather than 'egrep': the latter is obsolescent and recent
# GNU grep releases print a warning on stderr every time it is invoked.
Capture="2>&1 | grep -E -vi \"$FilterString\""

Version=0.3.11
This=$(basename -- "$0")                # quoted: survives paths with spaces
DataRun=stddata
AnalysisRun=analyses
GDACRootURL=http://gdac.broadinstitute.org/runs
KindOfRun=$AnalysisRun	                # Default to analysis run
Downloaders="wget curl"                 # candidates, in order of preference
BatchMode=0                             # 1 = never prompt (see -b flag)
YYYYMMDD="[2-9][0-9][0-9][0-9]_[0-1][0-9]_[0-3][0-9]"

# Aliases must be enabled (and defined) before the functions that use them
# are parsed, since bash expands aliases when a function definition is read.
shopt -s expand_aliases extglob
alias ToUpper="tr '[:lower:]' '[:upper:]'"
alias ToLower="tr '[:upper:]' '[:lower:]'"

# }}}

UniqueSuffix()  # {{{
{
   # Print a name, derived from $1, that does not yet exist on disk.
   #
   # This allows multiple inodes to safely co-exist in same top-level
   # dir, by tagging each with unique numeric suffix (starting at 2).
   #
   # Arguments: $1 - desired file/directory name
   # Outputs:   $1 itself when nothing by that name exists, otherwise
   #            "$1.N" for the smallest N >= 2 that is not taken
   #            (the search gives up at N = 999999)

   local wanted=$1
   local n=2

   # Every expansion is quoted so that names containing whitespace or
   # glob characters are tested, and printed, verbatim.
   if [ ! -e "$wanted" ] ; then	# can be file/directory/link/etc
	printf '%s\n' "$wanted"
	return
   fi

   while [ "$n" -lt 999999 ] && [ -e "${wanted}.$n" ] ; do
	n=$((n + 1))
   done
   printf '%s\n' "${wanted}.$n"

}  # }}}

Do()  # {{{
{
    # Execute the given command line with the script's output policy applied.
    #
    # $Echo is empty in normal operation and set to "echo" by the -e flag, in
    # which case the command is only printed.  $Capture is a shell fragment
    # (a filter pipeline by default, a log-file redirection with -l, empty
    # with -e) that is appended to the command.  eval is required so that the
    # redirection and pipe operators stored inside $Capture are parsed as
    # shell syntax rather than passed along as literal arguments.
    eval $Echo "$@" $Capture
}   # }}}

Download()  # {{{
{
    # Forward all arguments to the downloader back end named by $Downloader,
    # which look_for_downloader() sets (to wget_downloader when wget exists).
    # The arguments are re-parsed by eval before the back end sees them.
    eval $Downloader "$@"
}   # }}}

wget_downloader()  # {{{
{
    # wget back end for Download().
    #
    #   wget_downloader -f URL   fetch one document quietly onto stdout
    #   wget_downloader URL      recursively mirror URL beneath ./$WhichRun
    #
    # $Tasks (prepared by look_for_downloader) optionally carries a wget
    # -A/-R pattern list; its embedded quotes are resolved by the eval.

    if [[ $1 == -f ]] ; then
        # quietly send straight to stdout
        Opts="-q -O -"
        shift
    else
        Opts="--progress=dot:mega -l 2 --no-parent -nH --cut-dirs=3 -P $WhichRun -r"
    fi

    eval wget --ignore-case --cache=off $Opts $Tasks $@

}   # }}}

look_for_downloader()  # {{{
{
    # Pick the download back end and prepare its task filter.
    #
    # Globals read:    Downloaders      - candidate programs, in preference order
    #                  TasksToDownload  - whitespace-separated task patterns;
    #                                     a leading ~ marks a pattern as excluded
    # Globals written: Loc              - full path of the program found
    #                  Downloader       - name of the back-end function to call
    #                  Tasks            - wget -A/-R option string, or empty
    # Exits:           4 when no candidate is found in $PATH,
    #                  5 when only curl is available (not supported yet)

    local candidate task
    local -a patterns=()

    # 'type -P' performs a plain $PATH search and prints only the path, so the
    # result does not depend on the wording of type's human-readable output
    # (e.g. "wget is hashed (/usr/bin/wget)" once the command has been run).
    Loc=
    for candidate in $Downloaders ; do
        Loc=$(type -P "$candidate" 2>/dev/null)
        if [ -n "$Loc" ] ; then
            break
        fi
    done

    case "$Loc" in
        */curl)

            echo "curl not supported yet: please find wget on your system"
            echo "and make sure it is in your \$PATH"
            exit 5
            ;;

        */wget)
            Downloader=wget_downloader

            # Split the pattern list on whitespace WITHOUT pathname expansion:
            # a pattern such as Copy* must reach wget untouched instead of
            # being replaced by whatever files happen to match in the cwd.
            read -r -d '' -a patterns <<< "$TasksToDownload"

            Tasks=
            for task in "${patterns[@]}" ; do
                Tasks=",*${task}*${Tasks}"
            done

            if [ -n "$Tasks" ] ; then
                # First see if -tasks was to include or reject (-A /-R) matches:
                # any tilde at all turns the whole list into a reject list
                IncludeExclude=${Tasks//\~/}
                if [ "$Tasks" = "$IncludeExclude" ] ; then
                    Tasks="-A \"${Tasks#,}\""
                else
                    Tasks="-R \"${IncludeExclude#,}\""
                fi
            fi

            ;;

        *)
            echo "Couldn't find one of <$Downloaders> in your \$PATH, aborting"
            exit 4
            ;;
    esac

}   # }}}

pretty_print() # {{{
{
    # Re-flow the whitespace-separated words on stdin into tab-indented rows.
    #
    # Arguments: $1 - number of words per row
    # Outputs:   each word followed by two spaces; a newline plus tab after
    #            every $1-th word of an input line; one final newline.
    #            The word count restarts with every input line.
    #
    # The column count is handed to awk with -v rather than through an
    # exported variable, and words are printed via an explicit "%s" format
    # so that a '%' inside the data is never interpreted as a conversion.
    # A non-positive column count simply disables wrapping.

    awk -v per_row="$1" '
        BEGIN { printf "\t" }
        {
            for (i = 1; i <= NF; i++) {
                printf "%s  ", $i
                if (per_row > 0 && i % per_row == 0) printf "\n\t"
            }
        }
        END { print "" }'
}  # }}}

list_runs()  # {{{
{
    # Print the public runs list published on the GDAC website.  Comment
    # lines and whitespace-only lines are emptied (not deleted), so the
    # line count of the remote file is preserved.
    local blank=$' \t'

    RunList=$GDACRootURL/info/firehose_get_public_runs_list.txt
    Download -f $RunList \
        | sed -e "s/^[${blank}]*#.*\$//" -e "s/^[${blank}][${blank}]*\$//"
}   # }}}

list_run_types()  # {{{
{
    # Print the distinct run types (the text before "__" in each entry of
    # the runs list), laid out four per row.
    local types
    types=$(list_runs | awk -F__ '{print $1}' | sort -u)

    # Deliberately unquoted: joins the sorted names onto a single line
    echo $types | pretty_print 4
}   # }}}

list_tasks()  # {{{
{
    # Print the names of the tasks (pipelines) that have archives in the
    # selected run, one per line, sorted case-insensitively.
    #
    # Note that validate_and_prepare() must be called first; and
    # that extraneous arguments (like disease cohort) are ignored
    #
    # Globals read:    RunDate DataURL WhichRun KindOfRun AnalysisRun DataRun
    #                  CohortsToDownload AllCohorts
    # Globals written: UDate
    # Outputs:         task names on stdout (or a "No tasks" notice);
    #                  a progress message on stderr

    local cohorts cohort listing tasks

    # Fabricate date without underscores, then find all tasks for that run/date
    UDate=${RunDate//_/}

    cohorts=$CohortsToDownload
    if [ "$CohortsToDownload" = "$AllCohorts" ] ; then
        # If subset of disease cohorts has NOT been selected, use OV, BRCA, and
        # PANCAN12 as representative, because none have all data types/analyses.
        # Only the standard runs draw on the full cohort list, so other run
        # types keep scanning their own cohorts.
        case $KindOfRun in
            "$AnalysisRun"|"$DataRun")  cohorts="OV BRCA PANCAN12" ;;
        esac
    fi

    echo "Scanning tasks listed at Broad GDAC, this may take a few moments ..." >&2

    # Keep each directory listing newline-terminated, so that the last line
    # of one listing is never glued onto the first line of the next.
    listing=
    for cohort in $cohorts ; do
        listing="${listing}$(Download -f "$DataURL/$cohort/$UDate")"$'\n'
    done

    # Archive names look like gdac.<site>.<org_COHORT>.<Task>.Level_N...., so
    # the task is the 4th dot-separated field.  Leftover HTML, md5 entries
    # and empty lines are discarded.
    tasks=$(printf '%s\n' "$listing" \
            | sed 's/.*"\(gdac.*.Level_.*\)".*/\1/g' \
            | grep -E -v '<|>|md5|^$' \
            | cut -d. -f4 \
            | sort -u -f)

    if [ -z "$tasks" ] ; then
        echo "No tasks/archives found in $WhichRun for selected disease cohorts"
    else
        echo "$tasks"
    fi
}   # }}}

list_cohorts()  # {{{
{
    # Print the disease cohort names published on the GDAC website, ten per
    # row.  Comment and whitespace-only lines are emptied first, and every
    # occurrence of PANCAN18 is removed from the list.
    local blank=$' \t'
    local url=$GDACRootURL/info/firehose_get_disease_cohorts_list.txt

    Download -f $url \
        | sed -e "s/^[${blank}]*#.*\$//" \
              -e "s/^[${blank}][${blank}]*\$//" \
              -e 's/PANCAN18//g' \
        | pretty_print 10
}   # }}}

finalize()  # {{{
{
    # Post-process the download area: remove the index.html / robots.txt
    # files left behind by the recursive fetch, then report each cohort for
    # which no gdac.* archive was retrieved.  In echo mode ($Echo set) the
    # output directory is not entered.  Exits 6 if it cannot be entered.

    local leftover

    # Clean up anything superfluous, but do it VERY SAFELY: never delete
    # unless we can prove that we are inside the run's output directory
    if [ -z "$Echo" ] ; then
        cd ./$WhichRun
        if [ "$(basename $PWD)" != "$WhichRun" ] ; then
            echo "Could not enter $WhichRun output directory, aborting now ..."
            exit 6
        fi
    fi

    for leftover in index.html robots.txt ; do
        Do "find . -name $leftover -exec \rm -f {} \;"
    done

    for cohort in $CohortsToDownload ; do
        Downloaded=$(find $cohort -name 'gdac.*')
        if [ -n "$Downloaded" ] ; then
            continue
        fi
        printf "\n\nNothing downloaded.  Please check your inputs, or use"
        printf "\n-runs and/or -tasks to see what's available for $RunDate.\n"
    done

}   # }}}

validate_and_prepare()  # {{{
{
    # Validate the positional arguments and set up the globals that drive
    # the download.
    #
    # Arguments: $1     - run type (analyses|analysis, stddata|data,
    #                     awg_pancan8, or any name found in the runs list)
    #            $2     - run date in YYYY_MM_DD form, or the word 'latest'
    #            $3 ... - optional disease cohorts (case-insensitive)
    # Globals written: KindOfRun RunDate AllCohorts CohortsToDownload
    #                  WhichRun DataURL TasksToDownload (whitespace squeezed),
    #                  plus whatever look_for_downloader() sets
    # Exits:     7 on an unknown run type, 3 on a malformed date, 0 after
    #            listing tasks or when the user declines the confirmation;
    #            a missing run type ends in instruct(), which exits 2

    # --- Step 1: run type ---------------------------------------------------
    case $1 in

        $AnalysisRun | analysis)
                KindOfRun=$AnalysisRun
                ;;

        $DataRun | data)
                KindOfRun=$DataRun
                ;;

        awg_pancan8)
                # Fixed cohort list for this run, so the website is not asked
                AllCohorts="BRCA COAD COADREAD GBM KIRC LUSC OV READ UCEC PANCAN8"
                KindOfRun=$1
                ;;

        *)
                # No run type at all: print the short usage text (never returns)
                if [ -z "$1" ] ; then instruct ; fi

                # Anything else must match (as a regex) the published runs list
                echo "Validating run selection against Broad Institute website ..."
                Runs=`list_runs`
                if [ -z "`echo $Runs | egrep \"$1\" 2>/dev/null`" ] ; then
                    printf "\nPlease use -help or select exactly 1 run type from\n"
                    printf "\n`list_run_types`\n"
                    printf "\nand note that 'analysis' and 'data' are accepted as"
                    printf "\nsynonyms for 'analyses' and 'stddata', respectively.\n"
                    exit 7
                fi

                KindOfRun=$1

                # Guess disease type from run name
                # (second underscore-separated field, upper-cased)
                AllCohorts=`echo $1 | cut -d_ -f2 | ToUpper`
                ;;
    esac

    # --- Step 2: run date ---------------------------------------------------
    case $2 in
        latest)
            # Use the last entry for this run type in the runs list; the date
            # is whatever follows the "__" in the first word of that line
            printf "# Mapping ${KindOfRun}__latest to exact version: " >&2
            RunDate=`list_runs | grep $KindOfRun | tail -1 | cut -d" " -f1` 
            RunDate=`echo $RunDate | awk -F__ '{print $2}'`
            echo "${KindOfRun}__${RunDate}" >&2
            ;;
        *)
            # sed prints $2 only if the whole string matches the date pattern
            RunDate=`echo $2 | sed -n "/^${YYYYMMDD}$/p"`
            if [ -z "$RunDate" ] ; then
                echo "Please supply a valid date in YYYY_MM_DD format."
                exit 3
            fi
            ;;
    esac

    # --- Step 3: cohorts (everything after run type and date) ---------------
    shift 2

    if [ -z "$AllCohorts" ] ; then
        AllCohorts=`list_cohorts`
    fi

    CohortsToDownload=
    for c in "$@" ; do
        c=`echo $c | ToUpper`
        CohortsToDownload="$CohortsToDownload $c"
    done

    # No cohorts named on the command line means "all of them"
    if [ -z "$CohortsToDownload" ] ; then
        CohortsToDownload="$AllCohorts"
    fi

    WhichRun=${KindOfRun}__${RunDate}
    DataURL=$GDACRootURL/$WhichRun/data

    # Run again now that TasksToDownload is known, so that the downloader's
    # task filter reflects any -tasks patterns
    look_for_downloader

    # --- Step 4: "-tasks" with no patterns means "list the tasks" -----------
    # The command line parser seeds TasksToDownload with a single space when
    # -tasks is seen, so it is non-empty here even without any pattern; the
    # unquoted echo below reduces such a value to the empty string.
    if [ -n "$TasksToDownload" ] ; then
        TasksToDownload=`echo $TasksToDownload | tr -s ' '`
        if [ -z "$TasksToDownload" ] ; then
            # Do() not used here to avoid capturing stderr to stdout if
            # caller wishes to save 'firehose_get -tasks' to a variable
            eval $Echo list_tasks $@
            exit 0
        fi
    fi

    # --- Step 5: summarize the request and ask for confirmation -------------
    printf "You've asked to download archives for the following "
    if [ -n "$TasksToDownload" ] ; then
        printf "tasks\n\n"
        printf "     $TasksToDownload\n\n"
        printf "run against the "
    fi
    printf "disease cohorts\n\n"
    printf "     $CohortsToDownload\n\n"
    printf "from the $WhichRun Firehose run. "
   
    # Batch mode (-b): proceed without prompting
    if (($BatchMode)) ; then return ; fi

    printf "If this is correct,\n"
    read -p "shall we continue with download? (y|yes|n|no) [no] " answer
    case "${answer}" in
        y|yes) ;;
        *)
            # Anything but a lower-case y/yes (including just Enter) declines
            printf "Ok, we'll try again some other time.\n"
            exit 0
            ;;
    esac
}   # }}}

instruct()  # {{{
{
    # Print the usage message and exit with status 2 (never returns).
    #
    # Arguments: $1 - when empty only the short synopsis is shown; any
    #                 non-empty value (e.g. -long) adds the flag reference
    # The run type and cohort lists embedded in the text are fetched live
    # via list_run_types() and list_cohorts().

    printf '%s\n' \
        "$This : retrieve open-access results of Broad Institute TCGA GDAC runs" \
        "Version: $Version (Author: Michael S. Noble)" \
        "" \
        "Usage: $This [flags]  RunType  Date  [disease_cohort, ... ]" \
        "" \
        "Two arguments are required; the first must be one of" \
        ""
    list_run_types
    printf '%s\n' \
        "" \
        "while the second must EITHER be a date (in YYYY_MM_DD form) of an" \
        "existing GDAC run of the given type OR 'latest'; use the -runs flag" \
        "to discern what RunType+Date combinations are available.  An optional" \
        "3rd, 4th etc argument may be specified to prune the retrieval, given" \
        "as a subset of these case-insensitive TCGA disease cohort names:" \
        ""
    list_cohorts
    printf '%s\n' \
        "" \
        "Note that as a convenience 'analysis' and 'data' are accepted as" \
        "synonyms for the 'analyses' and 'stddata' run types" \
        ""

    # Short form stops here
    if [ -z "$1" ] ; then
        printf '%s\n' \
            "Type $This --help for more information and options." \
            ""
        exit 2
    fi

    printf '%s\n' \
        "Flags:" \
        "" \
        "  -b | -batch         do not prompt: assume YES answer to all queries" \
        "  -c | -cohorts       list available disease cohorts" \
        "  -e | -echo          show commands that would be run, but do nothing" \
        "  -h | -help | --help this message" \
        "  -l | -log           write output to log file, instead of stdout" \
        "  -p | -platforms     list data platforms available in Firehose runs" \
        "                      (not implemented yet)" \
        "  -r | -runs          list available Firehose runs" \
        "  -t | -tasks <list>  further prune the set of archives retrieved, by" \
        "                      INCLUDING only the tasks (pipelines) whose" \
        "                      names match the given space-delimited list of" \
        "                      patterns; matching is performed with glob-style" \
        "                      wildcards, and is case-insensItive; prepending" \
        "                      a tilde (i.e. ~) to a task name will cause it" \
        "                      to be EXCLUDED from download; when no pattern" \
        "                      list is given $This will display all tasks in" \
        "                      the selected run." \
        "" \
        "                      NOTE: not all tasks will execute for all disease" \
        "                            cohorts; what tasks are run depends upon the" \
        "                            data available for that disease cohort" \
        "  -v                  display the version of $This" \
        "  -x                  debugging: turn on bash set -x (warning: very verbose)" \
        "" \
        "Broad GDAC website:   http://gdac.broadinstitute.org" \
        "Broad GDAC email  :   gdac@broadinstitute.org" \
        ""
    exit 2
}  # }}}

# Command line processing {{{

# A downloader is needed right away: the informational flags below
# (-cohorts, -runs, -help) fetch their lists from the GDAC website.
look_for_downloader

# Consume leading flags; the first argument that is not a recognized flag
# (normally the run type) ends the loop and is left in place for Main.
while true ; do
    case $1 in 

        -b|-batch)  BatchMode=1 ;;

        -c|-cohorts) 

            Do list_cohorts
            exit 0
            ;;

        -e|-echo)

            Echo=echo
            echo "Echo mode: commands that would be performed are echoed to"
            echo "stdout, but Broad GDAC archives will not be retrieved"
            echo
            Capture=
            ;;

        -h|-help|--help)
        
            instruct -long
            ;;

        -l|-log)

            # Ignored in echo mode (when -e precedes it): nothing to log
            if [ -z "$Echo" ] ; then
                Logfile=./`UniqueSuffix ${This}-${LOGNAME}.log`
                echo Logging output to $Logfile ...
                Capture=">> $Logfile 2>&1"
            fi
            ;;

        -p|-platforms)
            echo "-p | -platforms option not implemented yet"
            exit 0
            ;;

        -r|-runs)

            Do list_runs
            exit 0
            ;;

        -t|-tasks)

            shift

            if [ $# -eq 0 ] ; then 
                printf "Nothing specified for -tasks, ignoring ...\n\n"
                continue
            fi

            # A lone space marks "-tasks was given"; validate_and_prepare()
            # treats a value with no patterns as a request to list tasks.
            TasksToDownload=" "

            # Collect patterns up to the next flag or run type.  The loop is
            # bounded by $#: once the arguments are exhausted $1 is empty,
            # which would match the catch-all arm forever because shift
            # cannot remove anything from an empty argument list.
            while [ $# -gt 0 ] ; do
                case $1 in
                    -*)             break ;;
                    stddata|data)   break ;;
                    analys[ie]s)    break ;;
                    awg*)           break ;;    # incomplete, but ok for now
                    *)              TasksToDownload="$TasksToDownload $1"
                                    shift
                                    ;;
                esac
            done
            continue
            ;;

        -v|-version|--version)

            echo $Version
            exit 0
            ;;

        -x|-verbose)

            set -x
            ;;

        *)  break ;;

    esac
    shift
done
# }}}

# Main {{{

# Whatever the flag parser left over: run type, date and optional cohorts
validate_and_prepare "$@"

printf "\nAttempting to retrieve data for Broad GDAC run $WhichRun ...\n"

# One retrieval per disease cohort, each rooted at that cohort's directory
for cohort in $CohortsToDownload ; do
    Do Download $DataURL/$cohort/
done

# Outside echo mode a successful retrieval must have created ./$WhichRun
if [ -z "$Echo" ] ; then
    if [ ! -d ./$WhichRun ] ; then
        echo "Uh oh: there seems to have been an error, aborting now ..."
        exit 5
    fi
fi

printf "\n\nNow performing post-processing on retrieved files ...\n"

finalize

# }}}
