#!/bin/bash

### legal

# copyright (c) 2016, the Bioinformatics Core Facility of the Max Planck Institute for Biology of Ageing, Cologne
# <bioinformatics at age dot mpg dot de>

### about

# ensref - build and index genome databases from Ensembl
#
# base folder: 
# - /<base directory>/<organism name>/<release number>/
# base files:
# - BUILD_XX_RELEASE_YY = build and release version info, empty file
# - original.gtf = original reference gtf file
# - original.*.gtf = varying reference gtf file
# - original.*.fa = dna sequences fasta files (e.g. toplevel, ...)
# - cuffcompare.gtf = original.gtf annotated with tophat/cuffcompare
# index files:
# - <dna reference level>_<aligner>, e.g.
# - toplevel_tophat2/index.fa = tophat2 index base name
# - toplevel_tophat2_cuffcompare/index.fa = tophat2/cuffcompare index base name
# log files:
# - log/*.log = stdout and stderr log of index build scripts
# - log/*.sh = index build scripts

### functions

# List the entries of Ensembl's 'current_fasta' FTP directory, one name per
# line, on stdout (curl draws its progress bar on stderr).
# alternative: ncftpls -l ftp://ftp.ensembl.org/pub/current_fasta/
get_organism () {
  local listing_url="ftp://ftp.ensembl.org/pub/current_fasta/"
  curl --progress-bar --list-only "$listing_url"
}

# Print the release ids found in Ensembl's FTP root, one per line, in
# ascending numeric order (so the last line is the newest release).
# Entries named 'release-<id>' are reduced to '<id>'.
get_releases () {
  # sort -n: a lexicographic sort would place '100' before '81'
  curl -#l ftp://ftp.ensembl.org/pub/ | grep release | sed 's|release-||' | sort -n
}

# Print '<build>_<release>' derived from the gtf file names in Ensembl's
# 'current_gtf' directory of an organism.
# Arguments: $1 - organism directory name, e.g. 'caenorhabditis_elegans'
# Outputs:   a status line followed by '<build>_<release>' on stdout, where
#            build/release are the 2nd/3rd dot-separated fields of the first
#            listed '*gtf.gz' file name
get_current () {
  # keep all working variables local; 'gtf' in particular is also used as a
  # loop variable by the index section of this script
  local listing first build release
  echo "* getting latest release"
  listing=$(curl -#l "ftp://ftp.ensembl.org/pub/current_gtf/$1/" | grep gtf.gz | uniq)
  # several gtf variants may be listed; the first one carries the fields
  first=${listing%%$'\n'*}
  build=$(printf '%s\n' "$first" | cut -f2 -d.)
  release=$(printf '%s\n' "$first" | cut -f3 -d.)
  echo "${build}_${release}"
}

### variables and options

# settings that may only come from the command line start out unset
unset LIST
unset RELEASE
unset ORGANISM
# script name as shown in the usage text
BIN=$(basename "$0")
# defaults, can be overridden with --base and --ver-star
BASE="/beegfs/common/genomes"
VER_STAR="2.5.2b"
USAGE="USAGE:
  $BIN [OPTIONS]
OPTIONS:
  -h, --help                : show this help and exit
  -l, --list                : return available organisms/release-ids
  -b, --base, --prefix PATH : select the base directory, (default: $BASE)
  -o, --organism ORGANISM   : define organism, e.g. 'caenorhabditis_elegans'
  -r, --release RELEASE     : define release version, e.g. '81'
                              (default: automatic detection of current version)
  --ver-star VERSION        : version of STAR (default $VER_STAR)
  -n, --no-slurm            : use bash to run scripts, not sbatch from slurm
                              (not yet integrated)
  -c, --cpus NUMBER         : define number of CPUs (for STAR, default: 12)
                              (not yet integrated)
"

#######################################
# Parse the command line into the global settings.
# Globals:   LIST, BASE, ORGANISM, RELEASE, VER_STAR, CPUS (written),
#            SLURM (unset by -n), USAGE (read by -h)
# Arguments: the script's command line
# Outputs:   usage text, warnings and error messages on stdout
# Returns:   exits 0 after --help; exits 65 on an unsupported option or when
#            an option that needs a value is given without one
#######################################
parse_options () {
  local opt
  while [[ $# -gt 0 ]]
  do
    opt=$1

    # options that consume a value must be followed by one
    case $opt in
      -b|--base|--prefix|-o|--organism|-r|--release|--ver-star|--version-star|-c|--cpus)
        if [[ $# -lt 2 ]]
        then
          echo "error: option '$opt' requires a value. See '$(basename "$0") --help'."
          exit 65
        fi
        ;;
    esac

    case $opt in
      -h|--help)
        echo "$USAGE"
        exit
        ;;
      -l|--list)
        LIST=true
        ;;
      -b|--base|--prefix)
        BASE=$2
        shift
        ;;
      -o|--organism)
        ORGANISM=$2
        shift
        ;;
      -r|--release)
        RELEASE=$2
        shift
        ;;
      # '--ver-star' is the documented spelling; '--version-star' is kept
      # because it is the spelling this script accepted so far
      --ver-star|--version-star)
        VER_STAR=$2
        shift
        ;;
      -n|--no-slurm)
        unset SLURM
        echo "* warning: option '-n' not yet integrated"
        ;;
      -c|--cpus)
        CPUS=$2
        shift
        echo "* warning: option '-c' not yet integrated"
        ;;
      *)
        echo "error: unsupported option '$opt'. See '$(basename "$0") --help'."
        exit 65
        ;;
    esac
    shift
  done
}

parse_options "$@"

### check base folder

# the base directory has to exist and be both enterable and writable
if ! [[ -d $BASE && -x $BASE && -w $BASE ]]; then
  printf '%s\n' "error: base directory '$BASE' not found/accessable/writeable"
  exit 65
fi

### load software environment
# adjust this for your environment

module purge
echo "* loading software environment"
module load slurm
module load bowtie
module load tophat
module load bwa
module load star/${VER_STAR}
module load hisat
module load cufflinks
# Verify the requested STAR version: the module is loaded once more inside a
# command substitution (a subshell, so the current environment is untouched)
# and any output it produces is treated as a failure.
VER_STAR_ERROR=$(module load "star/$VER_STAR" 2>&1)
if [[ -n $VER_STAR_ERROR ]]
then
  echo "error: STAR module version '$VER_STAR' not available"
  exit 65
fi

### check organism

echo "* fetching organism list"
ORGANISMS=($(get_organism))
echo "* fetching release list"
RELEASES=($(get_releases))
# The current release is the numerically largest id, independent of the order
# of the list (a lexicographic order would rank '99' above '100').
# CURRENT stays empty when no release could be fetched.
CURRENT=$(printf '%s\n' "${RELEASES[@]}" | sort -n | tail -n 1)

# list mode (-l): show what is available in columns, then stop
if [[ -n $LIST ]]; then
  printf '\n%s\n' "* available organisms:"
  printf '%s\n' "${ORGANISMS[@]}" | column
  printf '\n%s\n' "* available releases:"
  printf '%s\n' "${RELEASES[@]}" | column
  printf '\n'
  exit
fi

# The organism must equal one of the listed names. The comparison is a fixed
# string, whole line match, so a fragment such as 'homo' is rejected and
# characters like '.' are not interpreted as a regular expression.
if [[ -z $ORGANISM ]] || ! printf '%s\n' "${ORGANISMS[@]}" | grep -Fxq -- "$ORGANISM"
then
  echo "* error: organism '$ORGANISM' not available"
  exit 65
fi

### check release

# no release requested: fall back to the current one
if [[ -z $RELEASE ]]
then
  RELEASE=$CURRENT
fi

# The release must equal one of the listed ids (fixed string, whole line
# match), so that e.g. '8' is not accepted just because '81' exists.
if [[ -z $RELEASE ]] || ! printf '%s\n' "${RELEASES[@]}" | grep -Fxq -- "$RELEASE"
then
  echo "Error: release '$RELEASE' not available"
  exit 65
fi

# remote source directories and local target directory for this build
URL_GTF="ftp://ftp.ensembl.org/pub/release-$RELEASE/gtf/$ORGANISM/"
URL_DNA="ftp://ftp.ensembl.org/pub/release-$RELEASE/fasta/$ORGANISM/dna/"
TARGET=$BASE/$ORGANISM/$RELEASE

### notify

# summary of the effective settings, followed by an empty line
printf '%s\n' \
  "* settings:" \
  "  ORG : $ORGANISM" \
  "  REL : $RELEASE (current: $CURRENT)" \
  "  URL : $URL_GTF" \
  "  DIR : $TARGET" \
  ""

### generate target directory

if [[ -d $TARGET ]]
then
  echo "* target directory '$TARGET' exists"
fi
# quoted, so that a base directory containing spaces works
mkdir -p "$TARGET"
if [[ ! -x $TARGET || ! -w $TARGET ]]
then
  echo "error: target directory '$TARGET' not accessable/writeable"
  exit 65
fi
# everything below works with paths relative to the target directory, so
# continuing after a failed cd would write into the wrong place
if ! cd "$TARGET"
then
  echo "error: cannot change into target directory '$TARGET'"
  exit 65
fi

### download and extract

# Download file $2 from directory URL $1 into the current directory and
# gunzip it (gunzip runs through srun). Returns non-zero if either step fails;
# gunzip is not attempted after a failed download.
fetch_and_unzip () {
  curl -#O "${1%/}/$2" && srun -p dontuseme gunzip "$2"
}

echo "* fetching file lists"
FILES_GTF=($(curl -#l "$URL_GTF" | grep "\.gtf\.gz$"))
FILES_DNA=($(curl -#l "$URL_DNA" | grep "\.dna\."))
if [[ ${#FILES_GTF[@]} -gt 0 ]]
then
  # build name = second dot-separated field of the first gtf file name
  BUILD=${FILES_GTF[0]#*.}
  BUILD=${BUILD%%.*}
  touch "BUILD_${BUILD}_RELEASE_${RELEASE}"
else
  # without a listing the build name is unknown; do not create a marker
  echo "* warning: no gtf files listed at '$URL_GTF'"
fi

echo "* downloading and extracting"

echo "  README (gtf)"
curl -# "${URL_GTF%/}/README" > README_gtf

for FILE in "${FILES_GTF[@]}"
do
  FILE_UNZIP=${FILE%%.gz}
  # drop the first three dot-separated fields of the file name
  FILE_REBASED=original.${FILE_UNZIP#*.*.*.}
  echo "  ${FILE_UNZIP} : "
  [[ -f $FILE_REBASED ]] && echo "exists" && continue
  if ! fetch_and_unzip "$URL_GTF" "$FILE"
  then
    echo "  warning: download/extraction of '$FILE' failed"
    continue
  fi
  mv "$FILE_UNZIP" "$FILE_REBASED"
done

echo "  README (fasta)"
curl -# "${URL_DNA%/}/README" > README_fa

mkdir -p chromosomes
for FILE in "${FILES_DNA[@]}"
do
  FILE_UNZIP=${FILE%%.gz}
  # per-chromosome files keep only the part after 'chromosome.'
  FILE_CHROM=chromosomes/${FILE_UNZIP##*chromosome.}
  FILE_REBASED=original.${FILE_UNZIP#*.*.*.}
  echo "  ${FILE_UNZIP} : "
  [[ -f ${FILE_REBASED} || -f ${FILE_CHROM} ]] && echo "exists" && continue
  [[ $FILE =~ nonchromosomal && -f chromosomes/nonchromosomal.fa ]] && echo "exists" && continue
  if ! fetch_and_unzip "$URL_DNA" "$FILE"
  then
    echo "  warning: download/extraction of '$FILE' failed"
    continue
  fi
  if [[ $FILE =~ chromosome ]]
  then
    mv "$FILE_UNZIP" "$FILE_CHROM"
  elif [[ $FILE =~ nonchromosomal ]]
  then
    mv "$FILE_UNZIP" chromosomes/nonchromosomal.fa
  else
    mv "$FILE_UNZIP" "$FILE_REBASED"
  fi
done

### align

# original.toplevel.fa original.primary_assembly.fa
#######################################
# Write an index build script to log/<name>.sh, make it executable and submit
# it with sbatch; the job's output is written to log/<name>.log.
# Globals:   ORGANISM, RELEASE (read, used for the slurm job name)
# Arguments: $1 - index name, e.g. 'toplevel_bowtie2'
#            $2 - text of the build script
#            $3... - optional extra sbatch options
# Outputs:   whatever sbatch prints (e.g. 'Submitted batch job <id>')
# Returns:   exit status of sbatch
#######################################
submit_index_job () {
  local name=$1
  local script=$2
  shift 2
  printf '%s\n' "$script" > "log/${name}.sh"
  chmod u+x "log/${name}.sh"
  sbatch -p dontuseme -o "log/${name}.log" \
    -J "ensref_${ORGANISM}_${RELEASE}_${name}" "$@" "log/${name}.sh"
}

#######################################
# Submit one index build job per aligner for every available reference fasta
# file in the current directory. An index whose folder already exists is
# skipped. The tophat2 jobs use the bowtie2 index and are therefore made
# dependent on the bowtie2 job when that job was submitted in this run.
# Globals:   ORGANISM, RELEASE, VER_STAR (read)
# Arguments: none
# Outputs:   progress messages and sbatch output on stdout
#######################################
build_indices () {
  local gtf fa label_fa name cmd submitted
  local -a after_bowtie2

  mkdir -p log

  for gtf in original.gtf
  do
    # if adding more gtfs, PREpend label to $name
    # e.g.
    # label_gtf=${gtf#original.}
    # label_gtf=${label_gtf%.gtf}
    # [[ -z $label_gtf ]] && label_gtf="default"

    #for fa in original.*.fa
    # restrict to toplevel/primary_assembly
    # 'alt' is not explained
    for fa in original.toplevel.fa original.primary_assembly.fa
    do
      label_fa=${fa#original.}
      label_fa=${label_fa%.fa}

      # not every download provides both fasta files; nothing can be indexed
      # from a missing one
      if [[ ! -e $fa ]]
      then
        echo "* indexing $label_fa: skipped ('$fa' not found)"
        continue
      fi

      # dependency options for the tophat2 jobs; reset per fasta file so that
      # a job id from a previous iteration is never reused
      after_bowtie2=()

      #######
      ### bowtie2
      #######

      name=${label_fa}_bowtie2
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
which bowtie2-build-s
mkdir $name
cd $name
ln -s ../$fa index.fa
bowtie2-build-s index.fa index.fa
echo done
"
        submitted=$(submit_index_job "$name" "$cmd")
        if [[ -n $submitted ]]
        then
          echo "$submitted"
          # the job id is the last word of sbatch's message
          after_bowtie2=(-d "afterok:${submitted##* }")
        fi
      fi

      #######
      ### bwa
      #######

      name=${label_fa}_bwa
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
which bwa
mkdir $name
cd $name
ln -s ../$fa index.fa
bwa index -a bwtsw -p index.fa index.fa
echo done
"
        submit_index_job "$name" "$cmd"
      fi

      #######
      ### star
      #######

      name=${label_fa}_star_$VER_STAR
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
which STARshort
mkdir $name
cd $name
ln -s ../$fa index.fa
ln -s ../$gtf index.gtf
STARshort --runMode genomeGenerate --genomeDir ./ --genomeFastaFiles index.fa \
  --runThreadN 12 --sjdbGTFfile index.gtf --sjdbOverhang 100 \
  --limitGenomeGenerateRAM 124544990592
echo done
"
        submit_index_job "$name" "$cmd" -N 1 -n 12 --mem=150GB
      fi

      # NOTE: a commented-out 'hisat' (version 1) section used to live here;
      # it was disabled already and has been dropped.

      #######
      ### hisat2
      #######

      name=${label_fa}_hisat2
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
module load hisat
which hisat2
mkdir $name
cd $name
ln -s ../$fa index.fa
hisat2-build index.fa index.fa
echo done
"
        submit_index_job "$name" "$cmd"
      fi

      #######
      ### tophat2_cuffcompare
      #######

      # NOTE:
      # cuffcompare error 'Warning: could not parse ID or Parent from GFF line'
      # can be ignored/bypassed with dropping 'gene' features from the gtf file
      # e.g.
      # awk -F '\t' '{if ($3 != "gene") print $0}' $gtf > ${gtf}.tmp
      # see the question
      # https://www.biostars.org/p/126423/
      # and solution
      # https://groups.google.com/forum/#!msg/tuxedo-tools-users/FTKA4qozJIc/p47AwnCXxvwJ

      name=${label_fa}_tophat2_cuffcompare
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
which cuffcompare
which tophat2
mkdir $name
cuffcompare -V -CG -s chromosomes -r $gtf $gtf 2>&1
mv cuffcmp.combined.gtf cuffcompare.gtf
tar -jcvf cuffcompare.results.tar.bz2 cuffcmp.* --remove-files
tophat2 -o ${name}.tmp -G cuffcompare.gtf --transcriptome-index ${name}/index \
  ${label_fa}_bowtie2/index.fa 2>&1
rm -r ${name}.tmp
cd $name
for f in *
do
  [[ \$f =~ ^index\.fa ]] && continue
  mv \$f \${f/#index/index.fa}
done
echo done
"
        submit_index_job "$name" "$cmd" "${after_bowtie2[@]}"
      fi

      #######
      ### tophat2
      #######

      name=${label_fa}_tophat2
      if [[ -d $name ]]
      then
        echo "* indexing $name: skipped (remove folder to rebuild)"
      else
        echo "* indexing $name"
        cmd="#!/bin/bash
date
which bowtie2
which tophat2
tophat2 -G $gtf -o $name --transcriptome-index $name/index ${label_fa}_bowtie2/index.fa 2>&1
cd $name
for f in *
do
  [[ \$f =~ ^index\.fa ]] && continue
  [[ -d \$f ]] && continue
  mv \$f \${f/#index/index.fa}
done
echo done
"
        submit_index_job "$name" "$cmd" "${after_bowtie2[@]}"
      fi

    done
  done
}

build_indices

### done

# all downloads are finished and all index jobs have been handed to slurm
printf '%s\n' "* done"