#!/bin/bash

#===============================================================================
# The Broad Institute
# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright 2012, 2013 by the
# Broad Institute/Massachusetts Institute of Technology. All rights reserved.
#
# This software is supplied without any warranty or guaranteed support whatsoever.
# Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
# functionality.
# 
# @author: Michael S. Noble
# @date:   August 28, 2015
#===============================================================================

#  Front matter: env setup, command line args, etc {{{

# Extended regex (matched case-insensitively) of downloader output lines that
# the default $Capture filter below discards
FilterString="^http|^Length|saved|html|^$|robots|resolv|connect"
# Shell fragment that Do() appends to every command it evals.  By default it
# folds stderr into stdout and drops lines matching $FilterString; the -echo
# flag empties it and the -log flag replaces it with a log-file redirection.
Capture="2>&1 | egrep -vi \"$FilterString\""
Version=0.4.6
# Nonzero (set by -batch) makes validate_and_prepare() skip its y/n prompt
BatchMode=0
This=`basename $0`
# Names of the two built-in run types
DataRun=stddata
AnalysisRun=analyses
# Candidate download tools, probed in this order by downloader_determine()
Downloaders="wget curl"
KindOfRun=$AnalysisRun	                # Default to analysis run
RANDOM=$$                               # Initialize random number generator
GDACRootURL=https://gdac.broadinstitute.org/runs
# Pattern used by validate_and_prepare() to check run dates; note that the
# fields are separated by underscores (YYYY_MM_DD)
YYYYMMDD="[2-9][0-9][0-9][0-9]_[0-1][0-9]_[0-3][0-9]"
# Common prefix of every temp file name handed out by the Tmpfile alias; the
# EXIT trap below removes everything in /tmp that starts with it
UniqueID=${This}-${LOGNAME}-$$

# expand_aliases: bash does not expand aliases in scripts unless asked to
shopt -s expand_aliases extglob
# allexport: every variable assigned from here on is exported to children;
# errexit/pipefail: abort on the first unguarded failing command or pipeline
set -o allexport -o errexit -o pipefail
alias ToUpper="tr '[:lower:]' '[:upper:]'"
alias ToLower="tr '[:upper:]' '[:lower:]'"
# Prints (but does not create) a fresh temp file name; \${RANDOM} is escaped
# so that it is evaluated at each use rather than once, here
alias Tmpfile="echo /tmp/${UniqueID}-\${RANDOM}"
trap '\rm -f /tmp/${UniqueID}*' EXIT    # Do not leave tmp files lying around

# }}}

UniqueSuffix()  # {{{
{
   # Print a path, derived from $1, that does not exist yet.
   #
   # This allows multiple inodes to safely co-exist in same top-level
   # dir, by tagging each with unique numeric suffix (starting at 2).
   #
   # Arguments: $1 - desired path (file/directory/link/etc)
   # Outputs:   $1 itself when nothing exists at that path, otherwise
   #            "$1.N" for the smallest N >= 2 that is not taken (the
   #            search gives up at 999999 and prints that suffix)
   #
   # Every expansion is quoted so that paths containing whitespace are
   # tested as a single word, and the counter is local so that callers'
   # variables are left alone.

   local path=$1
   local n=2

   if [ ! -e "$path" ] ; then
      printf '%s\n' "$path"
      return
   fi

   while [ "$n" -lt 999999 ] && [ -e "${path}.${n}" ] ; do
      n=$((n + 1))
   done
   printf '%s\n' "${path}.${n}"

}  # }}}

Do()  # {{{
{
    # Run the given command line through eval, prefixed with $Echo and
    # suffixed with $Capture.
    #
    # $Echo is empty by default and set to "echo" by the -echo flag, in
    # which case the command is printed instead of being executed.
    # $Capture is a shell fragment (output filter or log redirection) that
    # only takes effect because eval re-parses the assembled string.
    #
    # Because of that re-parse, arguments needing shell quoting must carry
    # their own escapes (see the find commands passed in by finalize()).
    eval $Echo "$@" $Capture
}   # }}}

Download()  # {{{
{
    # Invoke the downloader function named by $Downloader (chosen in
    # downloader_determine) with the arguments given here.
    #
    # errexit is switched off around the call so that a failing download
    # reaches the downloader's own status handling instead of aborting the
    # script on the spot.  Note that errexit is switched back ON
    # unconditionally, and that this function's exit status is that of the
    # final 'set', not of the downloader.
    set +o errexit
    eval $Downloader "$@"
    set -o errexit
}   # }}}

downloader_url_not_found_wget()  # {{{
{
    # Probe URL $1 with 'wget --spider' and report whether the combined
    # stdout/stderr of that probe mentions 404.
    #
    # Returns: 0 when "404" appears in the probe output, 1 otherwise
    if [ -z "$(wget --spider $1 2>&1 | grep 404)" ] ; then
        return 1
    fi
    return 0
}   # }}}

downloader_wget()  # {{{
{
    # Fetch one or more URLs with wget and translate its exit status.
    #
    # Arguments: -f   (optional, first) fetch quietly to stdout instead of
    #                 mirroring into the $WhichRun directory
    #            rest URL(s) handed to wget
    # Globals:   WhichRun (read: mirror target), Tasks (read: extra -A/-R
    #            option built by downloader_parse_tasks), Opts (written)
    # Returns:   0 on success, and also when wget status 8 turns out to be
    #            a 404 (no data for that cohort); any other failure prints
    #            a message and EXITS the (sub)shell with wget's status.
    #
    # The wget command line is assembled as a string and run through eval,
    # so any option value containing glob characters must carry its own
    # quotes: an unquoted pattern would be expanded against the current
    # directory before wget ever saw it.

    local status cohort

    case $1 in
        -f) Opts="-q -O -" ; shift ;;       # quietly send straight to stdout
         *) Opts="--progress=dot:mega -l 2 --no-parent "
            Opts="$Opts -nH --cut-dirs=3 -P $WhichRun -r"
            ;;
    esac

    # Stop wget from trying to download index.html or robots.txt files.
    # The reject pattern is single-quoted inside the string (as the patterns
    # in $Tasks are double-quoted) so that it survives word expansion here
    # and is unquoted only by eval.
    Opts="$Opts -R 'index.html*' -e robots=off"

    eval wget --ignore-case --cache=off $Opts $Tasks $@
    status=$?

    if [ $status -eq 0 ] ; then return ; fi

    echo " "
    case $status in
        6)
            echo "Could not access protected run; please see the -a|-auth"
            echo "command line flag as described in the -help output."
            ;;

        8)  if downloader_url_not_found_wget $@ ; then
                # This is a forgivable server error condition: runs are
                # not guaranteed to contain data/analyses for all cohorts
                cohort=`basename "$1"`
                echo "No $cohort data / results available in $WhichRun run"
                return
            fi
            echo "Unrecoverable wget Error $status: please see wget docs for details"
            ;;
        *)
            echo "wget Error $status: please see wget docs for details"
            ;;
    esac

    echo " "
    exit $status

}   # }}}

downloader_auth_wget()  # {{{
{
    # Arrange wget credentials by pointing WGETRC at a wgetrc-syntax file.
    #
    # Arguments: $1 - one of
    #                   user:password  explicit credentials (the password
    #                                  is everything after the FIRST colon,
    #                                  so it may itself contain colons)
    #                   ""             use $FHGETRC when set, else prompt
    #                   anything else  path to an existing wgetrc file
    # Globals:   FHGETRC, UniqueID (read); WGETRC (written and exported)
    # Exits:     9 when the resulting WGETRC is not a regular file, or the
    #            temporary credentials file cannot be created
    #
    # The user name and password are kept in local variables so that they
    # do not linger in (or get exported from) the global scope.

    local user= passwd= tmpfile=

    while true ; do

        case $1 in

        *:*)
            # user:passwd explicitly given, always overrides everything
            user=${1%%:*}
            passwd=${1#*:}
            break
            ;;

        "")
            #  No user:passwd given, so look for defaults or prompt
            if [ -n "$FHGETRC" ] ; then
                # FIXME: supports only wgetrc syntax, needs netrc support too;
                #        latter would be parsed for generating WGETRC below
                WGETRC=$FHGETRC
                break
            fi

            # NOTE: -batch not n/a here (b/c it's not a simple YES/NO question)
            # -r keeps backslashes in the typed values intact
            read -r -p "Username: " user
            read -r -s -p "Password: " passwd
            printf "\n\n"
            if [ "$user" = "" ] || [ "$passwd" = "" ] ; then
                echo "-auth Error: username/password cannot be empty"
            else
                break
            fi
            ;;

        *)
            # Path to file
            WGETRC=$1
            break
            ;;

        esac
    done

    # For security, write to WGETRC file (which will get cleaned up at exit).
    # mktemp creates the file atomically, readable by its owner only, under
    # a name that cannot be predicted; the ${UniqueID} prefix keeps it within
    # reach of the script's EXIT trap, which removes /tmp/${UniqueID}*.
    if [ -z "$WGETRC" ] ; then
        tmpfile=$(mktemp "/tmp/${UniqueID}-XXXXXX") || {
            echo "-auth Error: could not create temporary wgetrc file"
            exit 9
        }
        {
            printf '%s\n' "http_user = $user"
            printf '%s\n' "http_password = $passwd"
            printf '%s\n' "dot_bytes = 100m"
        } > "$tmpfile"
        WGETRC=$tmpfile
    fi

    if [ ! -f "$WGETRC" ] ; then
        echo "-auth Error: $WGETRC does not exist or is not a regular file"
        exit 9
    fi

    export WGETRC

}   # }}}

downloader_determine()  # {{{
{
    # Pick the download tool: walk $Downloaders in order and stop at the
    # first name that 'type' can resolve, then map the resolved location to
    # a downloader_* function name stored in $Downloader.
    #
    # Exits: 5 when only curl is available (not supported yet)
    #        4 when none of the candidates is found

    # pipefail is suspended so that a failing 'type' inside the pipeline
    # below does not count as an error
    set +o pipefail

    for Downloader in $Downloaders ; do
        Loc=$(type $Downloader 2>/dev/null | awk '{print $NF}')
        [ -z "$Loc" ] || break
    done

    if [[ $Loc == */curl ]] ; then

        echo "curl not supported yet: please find wget on your system"
        echo "and make sure it is in your \$PATH"
        exit 5

    elif [[ $Loc == */wget ]] ; then

        Downloader=downloader_wget

        # Never use default user WGETRC: prefer FHGETRC for that.  Note
        # that this means downloader_auth CANNOT be run before this func.
        export WGETRC=

    else

        echo "Couldn't find one of <$Downloaders> in your \$PATH, aborting"
        exit 4

    fi

    set -o pipefail

}   # }}}

downloader_auth()  # {{{
{
    # Forward the credential argument(s) to the authentication helper that
    # belongs to the selected downloader ($Downloader).  Only wget-style
    # downloaders are known; anything else exits with status 8.
    if [[ $Downloader != *wget ]] ; then
        exit 8                              # should never make it here
    fi

    downloader_auth_wget $@

}  # }}}

downloader_parse_tasks()  # {{{
{
    # Translate the whitespace-separated task patterns in $TasksToDownload
    # into the downloader option stored in $Tasks.
    #
    # For wget the result is either empty (no patterns), or
    #       -A "*p2*,*p1*"    accept list, when no pattern carries a tilde
    #       -R "*p2*,*p1*"    reject list, when ANY pattern carries a tilde
    #                         (all tildes are stripped from the list)
    # with the patterns in reverse order of appearance.  The embedded double
    # quotes are consumed later, when the command line is run through eval.
    #
    # Globals: Downloader, TasksToDownload (read); Tasks (written)
    # Exits:   8 for a downloader other than wget
    #
    # The patterns are user-supplied globs, so they are split into an array
    # with 'read' (which performs no pathname expansion) rather than with an
    # unquoted expansion that would match them against the current directory.

    local -a patterns
    local pattern stripped

    case $Downloader in
        *wget)

            Tasks=
            # -d '' reads through to end of input, which makes read return
            # non-zero even though the array has been filled: hence '|| true'
            read -r -d '' -a patterns <<< "$TasksToDownload" || true
            for pattern in "${patterns[@]}" ; do
                Tasks=",*${pattern}*${Tasks}"
            done

            if [ -n "$Tasks" ] ; then
                # First see if -tasks was to include or reject (-A /-R) matches
                stripped=${Tasks//\~/}
                if [ "$Tasks" = "$stripped" ] ; then
                    Tasks="-A \"${Tasks#,}\""
                else
                    Tasks="-R \"${stripped#,}\""
                fi
            fi
            ;;

        *) exit 8 ;;                        # should never make it here
    esac

}  # }}}

pretty_print() # {{{
{
    # Reflow the whitespace-separated words read from stdin into rows that
    # start with a tab, with every word followed by two spaces.
    #
    # Arguments: $1 - number of words per row (must be a positive integer)
    # Outputs:   the reflowed text on stdout; the word counter restarts on
    #            every input line
    #
    # The row width reaches awk through its environment (for this one
    # command only), and each word is printed through an explicit "%s"
    # format so that a '%' in the data is printed as-is.

    N=$1 awk 'BEGIN { printf "\t" }
        {
            for (i = 1; i <= NF; i++) {
                printf "%s  ", $i
                if (!(i % ENVIRON["N"])) printf "\n\t"
            }
        }
        END { print "" }'
}  # }}}

list_runs()  # {{{
{
    # Print the public runs list fetched from $GDACRootURL, with comment
    # lines (optional blanks, then '#') and whitespace-only lines emptied.
    # Those lines are blanked, not removed, so the output keeps one line
    # per input line.
    RunList=$GDACRootURL/info/firehose_get_public_runs_list.txt
    Download -f $RunList | \
        sed -e $'s/^[ \t]*#.*$//' -e $'s/^[ \t][ \t]*$//'
}   # }}}

list_run_types()  # {{{
{
    # Print the distinct run types (the part of each run name that precedes
    # the first "__"), sorted, four to a row.
    echo $(list_runs | sed 's/__.*$//' | sort -u) | pretty_print 4
}   # }}}

list_tasks()  # {{{
{
    # Print the names of the tasks that have archives in the selected run,
    # one per line, or a notice when none are found.
    #
    # Note that validate_and_prepare() must be called first; and
    # that extraneous arguments (like disease cohort) are ignored
    #
    # Globals: RunDate, DataURL, AllCohorts, WhichRun (read);
    #          CohortsToDownload, UDate, List, Tasks (written)

    # Fabricate date without underscores, then find all tasks for that run/date
    UDate=$(echo $RunDate | tr -d _)

    # If subset of disease cohorts has NOT been selected, use OV and
    # THCA as representative, because none have all data types/analyses
    [ "$CohortsToDownload" != "$AllCohorts" ] || CohortsToDownload="OV THCA"

    echo "Scanning tasks listed at Broad GDAC, this may take a few moments ..." >&2

    # Collect the directory listing of every cohort in one scratch file
    List=`Tmpfile`
    for cohort in $CohortsToDownload ; do
        echo "$(Download -f $DataURL/$cohort/$UDate)" >> $List
    done

    # The task name is the 4th dot-separated field of each archive name
    Tasks=$(sed -n 's/^.*href="\(gdac.*.Level_.*gz\)".*$/\1/p' $List |
            egrep -v "<|>|md5" | cut -d. -f4 | sort -u -f)

    if [ -n "$Tasks" ] ; then
        echo "$Tasks"
    else
        echo "No tasks/archives found in $WhichRun for selected disease cohorts"
    fi
}   # }}}

list_cohorts()  # {{{
{
    # Print the disease cohort names fetched from $GDACRootURL, eight to a
    # row.  Comment lines and whitespace-only lines are emptied first, and
    # every occurrence of PANCAN18 is deleted.
    Download -f $GDACRootURL/info/firehose_get_disease_cohorts_list.txt | \
        sed -e $'s/^[ \t]*#.*$//' \
            -e $'s/^[ \t][ \t]*$//' \
            -e 's/PANCAN18//g' | \
        pretty_print 8
}   # }}}

nothing_downloaded()  # {{{
{
    # Tell the user that nothing was retrieved for $1 (a cohort or run
    # name) and point to the flags that show what is available.
    #
    # The name is passed as a printf argument, not spliced into the format
    # string, so that any '%' or backslash in it is printed literally.
    printf "\n\nNothing downloaded for %s.  Please see --help, check your" "$1"
    printf "\ninputs, and use -runs and/or -tasks to see what's available.\n"
}   # }}}

no_such_run()  # {{{
{
    # Report that the given run name is unknown, list the valid run types,
    # and exit with status 7.
    #
    # Arguments: the offending run name; several arguments are joined with
    #            single spaces
    #
    # The run name and the run-type listing are passed to printf as data,
    # not as part of the format string, so that they are printed verbatim.
    printf "\nError: <%s> does not refer to an existing Firehose run\n" "$*"
    printf "\nPlease use -help or select exactly 1 run type from\n"
    printf "\n%s\n" "`list_run_types`"
    printf "\nand note that 'analysis' and 'data' are accepted as"
    printf "\nsynonyms for 'analyses' and 'stddata', respectively.\n"
    exit 7
}   # }}}

finalize()  # {{{
{
    # Post-process a finished download: enter the $WhichRun output
    # directory, delete the index.html / robots.txt files left behind by the
    # downloader, and warn about every cohort for which no gdac.* file
    # arrived.
    #
    # Globals: Echo, WhichRun, CohortsToDownload (read)
    # Exits:   6 when the output directory cannot be entered
    # Note:    leaves errexit and pipefail switched OFF, and (outside echo
    #          mode) the working directory changed to ./$WhichRun

    # Clean up anything superfluous, but do it VERY SAFELY.  The cd is
    # tested as part of the 'if' condition: run bare under errexit, a
    # failing cd would abort the script before the message below could be
    # printed.
    if [ -z "$Echo" ] ; then
        if ! cd "./$WhichRun" || [ "$(basename "$PWD")" != "$WhichRun" ] ; then
            echo "Could not enter $WhichRun output directory, aborting now ..."
            exit 6
        fi
    fi

    # From here on failures are tolerated: cleanup is best-effort
    set +o errexit +o pipefail
    Do "find . -name index.html -exec \rm -f {} \;"
    Do "find . -name robots.txt -exec \rm -f {} \;"

    for cohort in $CohortsToDownload ; do
        Downloaded=`find $cohort -name 'gdac.*' 2>/dev/null`
        if [ -z "$Downloaded" ] ; then
            nothing_downloaded $cohort
        fi
    done

}   # }}}

validate_and_prepare()  # {{{
{
    # Validate the positional command line (run type, run date, optional
    # cohort names), derive the globals that drive the download, and ask
    # the user for confirmation.
    #
    # Arguments: $1    run type (see the case arms below)
    #            $2    run date in YYYY_MM_DD form, or 'latest'
    #            $3... disease cohorts (case-insensitive); none means all
    # Globals written: KindOfRun, AllCohorts, Runs, RunDate, WhichRun,
    #            DataURL, CohortsToDownload, TasksToDownload (and Tasks,
    #            through downloader_parse_tasks)
    # Exits:     3 on a malformed date; 0 after listing tasks or when the
    #            user declines; instruct() and no_such_run() exit themselves

    # First:  validate run type
    case $1 in

        $AnalysisRun | analysis)
                KindOfRun=$AnalysisRun
                ;;

        $DataRun | data)
                KindOfRun=$DataRun
                ;;

        awg_pancan8)
                # This run type comes with a fixed cohort list
                AllCohorts="BRCA COAD COADREAD GBM KIRC LUSC OV READ UCEC PANCAN8"
                KindOfRun=$1
                ;;

        *)
                # No run type at all: show the short usage text (which exits)
                if [ -z "$1" ] ; then instruct ; fi

                KindOfRun=$1
                # Guess disease type from run name
                AllCohorts=`echo $1 | cut -d_ -f2 | ToUpper`

                ;;
    esac

    Runs=`list_runs`

    # Second: validate run date
    case $2 in
        latest)
            # Take the last run-list line mentioning this run type, keep its
            # first space-delimited field, and use what follows the "__"
            printf "# Mapping ${KindOfRun}__latest to exact version: " >&2
            RunDate=`echo "$Runs" | grep $KindOfRun | tail -1 | cut -d" " -f1` 
            RunDate=`echo $RunDate | awk -F__ '{print $2}'`
            echo "${KindOfRun}__${RunDate}" >&2
            ;;
        *)
            # sed prints the date back only when it matches $YYYYMMDD entirely
            RunDate=`echo $2 | sed -n "/^${YYYYMMDD}$/p"`
            if [ -z "$RunDate" ] ; then
                echo "Please supply a valid date in YYYY_MM_DD format."
                exit 3
            fi
            ;;
    esac

    # Drop run type and date: whatever remains names disease cohorts
    shift 2
    WhichRun=${KindOfRun}__${RunDate}
    DataURL=$GDACRootURL/$WhichRun/data

    # Third: check that the type/date combination appears in the run list
    echo "Validating run selection against Broad Institute website ..."
    if [ -z "`echo $Runs | egrep \"$WhichRun\" 2>/dev/null`" ] ; then
        no_such_run $WhichRun
    fi

    if [ -z "$AllCohorts" ] ; then
        AllCohorts=`list_cohorts`
    fi

    # Upper-case every cohort given on the command line
    CohortsToDownload=
    for c in "$@" ; do
        c=`echo $c | ToUpper`
        CohortsToDownload="$CohortsToDownload $c"
    done

    if [ -z "$CohortsToDownload" ] ; then
        CohortsToDownload="$AllCohorts"
    fi

    downloader_parse_tasks

    # The command line parser seeds TasksToDownload with a single blank, so
    # a value that collapses to nothing here means that -tasks/-only was
    # given without any pattern: list the tasks of the run and stop
    if [ -n "$TasksToDownload" ] ; then
        TasksToDownload=`echo $TasksToDownload | tr -s ' '`
        if [ -z "$TasksToDownload" ] ; then
            # Do() not used here to avoid capturing stderr to stdout if
            # caller wishes to save 'firehose_get -tasks' to a variable
            eval $Echo list_tasks $@
            exit 0
        fi
    fi

    # Summarize the request ...
    printf "You've asked to download archives for the following "
    if [ -n "$TasksToDownload" ] ; then
        printf "tasks\n\n"
        printf "     $TasksToDownload\n\n"
        printf "run against the "
    fi
    printf "disease cohorts\n\n"
    printf "     $CohortsToDownload\n\n"
    printf "from the $WhichRun Firehose run. "
   
    # ... and, unless -batch was given, ask before going ahead
    if (($BatchMode)) ; then return ; fi

    printf "If this is correct,\n"
    read -p "shall we continue with download? (y|yes|n|no) [no] " answer
    case "${answer}" in
        y|yes) ;;
        *)
            printf "Ok, we'll try again some other time.\n"
            exit 0
            ;;
    esac
}   # }}}

instruct()  # {{{
{
    # Print the usage text and exit with status 2.
    #
    # Arguments: $1 - when empty, only the short usage summary is printed;
    #                 any non-empty value (the command line parser passes
    #                 -long) adds the description of every flag
    #
    # The run types and disease cohorts shown are fetched live, through
    # list_run_types and list_cohorts.

    echo "$This : retrieve open-access results of Broad Institute TCGA GDAC runs"
    echo "Version: $Version (Author: Michael S. Noble)"
    echo
    echo "Usage: $This [flags]  RunType  Date  [disease_cohort, ... ]"
    echo
    echo "Two arguments are required; the first must be one of"
    echo
    list_run_types
    echo
    echo "while the second must EITHER be a date (in YYYY_MM_DD form) of an"
    echo "existing GDAC run of the given type OR 'latest'; use the -runs flag"
    echo "to discern what RunType+Date combinations are available.  An optional"
    echo "3rd, 4th etc argument may be specified to prune the retrieval, given"
    echo "as a subset of these case-insensitive TCGA disease cohort names:"
    echo
    list_cohorts
    echo
    echo "(taken from https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm)"
    echo
    echo "Note that as a convenience 'analysis' and 'data' are accepted as"
    echo "synonyms for the 'analyses' and 'stddata' run types"
    echo

    # Short form stops here
    if [ -z "$1" ] ; then
        echo "Type $This --help for more information and options."
        echo
        exit 2
    fi

    echo "Flags:"
    echo
    echo "  -a | -auth [cred]   authorize the retrieval of password-protected"
    echo "                      results; the optional cred[entials] parameter"
    echo "                      must be one of"
    echo
    echo "                              1) a username:password string"
    echo "                              2) /a/path/to/a/wgetrc/file"
    echo "                              3) the empty string"
    echo 
    echo "                      If no credentials are supplied (empty string),"
    echo "                      then FHGETRC will be used if it is set in the"
    echo "                      environment and points to a regular file (which"
    echo "                      must be in WGETRC-conformant syntax); otherwise"
    echo "                      a username:password prompt will be issued.  If"
    echo "                      both \$FHGETRC is set in the environment AND a"
    echo "                      username:password parameter is specified here,"
    echo "                      then \$FHGETRC will be ignored"
    echo "  -b | -batch         do not prompt: assume YES to all YES/NO queries"
    echo "  -c | -cohorts       list available disease cohorts"
    echo "  -e | -echo          show commands that would be run, but do nothing"
    echo "  -h | -help | --help this message"
    echo "  -l | -log           write output to log file, instead of stdout"
    echo "  -o | -only <list>   further prune the set of archives retrieved, by"
    echo "                      INCLUDING ONLY results of pipelines whose names"
    echo "                      match any of the given space-delimited list"
    echo "                      of patterns; matching is performed with glob-style"
    echo "                      wildcards, and is case-insensitive; prepending"
    echo "                      a tilde (i.e. ~) to a task name will cause it"
    echo "                      to be EXCLUDED from download; when no pattern"
    echo "                      list is given $This will display all tasks in"
    echo "                      the selected run."
    echo
    echo "                      NOTE: not all tasks will execute for all disease"
    echo "                            cohorts; what tasks are run depends upon the"
    echo "                            data available for that disease cohort"
    echo "  -p | -platforms     list data platforms available in Firehose runs"
    echo "                      (not implemented yet)"
    echo "  -r | -runs          list available Firehose runs"
    echo "  -t | -tasks <list>  same as -o|-only flag (kept for back-compatibility)"
    echo "  -v                  display the version of $This"
    echo "  -x                  debugging: turn on bash set -x (warning: very verbose)"
    echo 
    echo "Broad GDAC website:   http://gdac.broadinstitute.org"
    echo "Broad GDAC email  :   gdac@broadinstitute.org"
    echo
    exit 2
}  # }}}

# Command line processing {{{

# Must run before any -auth handling: it selects $Downloader and clears WGETRC
downloader_determine

# Consume leading flags; the first argument that is not a recognized flag
# ends the loop and is left in place for validate_and_prepare()
while true ; do
    case $1 in 

        -a|-auth)

            # Only a following argument that contains a colon is consumed
            # as credentials; otherwise downloader_auth gets no argument.
            # NOTE(review): the help text also offers a wgetrc path as
            # credentials, but no path is ever forwarded from here - confirm
            DLAUTH=
            case $2 in
                *:*) DLAUTH=$2 ; shift ;;
                  *)  ;;                    # no-op: do not shift arg list
            esac

            downloader_auth $DLAUTH
            ;;

        -b|-batch)  BatchMode=1 ;;

        -c|-cohorts) 

            Do list_cohorts
            exit 0
            ;;

        -e|-echo)

            # With Echo set, Do() prints commands instead of running them;
            # Capture is emptied so the printed commands are not filtered
            Echo=echo
            echo "Echo mode: commands that would be performed are echoed to"
            echo "stdout, but Broad GDAC archives will not be retrieved"
            echo
            Capture=
            ;;

        -h|-help|--help)
        
            instruct -long
            ;;

        -l|-log)

            # Has no effect when -echo was seen earlier on the command line
            if [ -z "$Echo" ] ; then
                Logfile=./`UniqueSuffix ${This}-${LOGNAME}.log`
                echo Logging output to $Logfile ...
                Capture=">> $Logfile 2>&1"
            fi
            ;;

        -p|-platforms)
            echo "-p | -platforms option not implemented yet"
            exit 0
            ;;

        -r|-runs)

            Do list_runs
            exit 0
            ;;

        -t|-tasks|-o|-only)

            shift

            # Fewer than two arguments left cannot hold a run type and date
            if [ $# -lt 2 ] ; then
                printf "Incomplete input for -tasks, ignoring ...\n\n"
                continue
            fi

            # Seeded with a blank so that "flag given, no patterns" can be
            # told apart from "flag not given" (see validate_and_prepare)
            TasksToDownload=" "
            # Gather patterns up to the next flag or run type word
            while true ; do
                case $1 in
                    -*)             break ;;
                    stddata|data)   break ;;
                    analys[ie]s)    break ;;
                    awg*)           break ;;    # incomplete, but ok for now
                    *)
                            # Ran out of arguments without meeting a run type
                            if [ -z "$1" ] ; then
                                no_such_run "`echo $TasksToDownload | cut -d' ' -f2-99`"
                            fi
                            TasksToDownload="$TasksToDownload $1"
                            shift
                            ;;
                esac
            done
            # Arguments were shifted above: skip the shift that ends the loop
            continue
            ;;

        -v|-version|--version)

            echo $Version
            exit 0
            ;;

        -x|-verbose)

            set -x
            ;;

        *)  break ;;

    esac
    shift
done
# }}}

# Main {{{

# Resolve run type, date, cohorts and tasks from what is left of the
# command line (this may prompt, list tasks, or exit on invalid input)
validate_and_prepare "$@"

printf "\nAttempting to retrieve data for Broad GDAC run $WhichRun ...\n"

# One download per disease cohort
for cohort in $CohortsToDownload
do
    Do Download $DataURL/$cohort/
done

# Outside echo mode, a missing run directory means that nothing arrived
if [ -z "$Echo" ] ; then
    if [ ! -d ./$WhichRun ] ; then
        nothing_downloaded $WhichRun && exit 9
    fi
fi

printf "\n\nNow performing post-processing on retrieved files ...\n"

finalize

# }}}
