This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Commit

Merge pull request #645 from SciLifeLab/dev
Preparing release 2.2.0
maxulysse authored Sep 21, 2018
2 parents 0c0b89c + 2038a57 commit e504b0e
Showing 42 changed files with 1,120 additions and 777 deletions.
3 changes: 3 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -31,5 +31,8 @@ A clear and concise description of what you expected to happen.
**Container (please complete the following information):**
- tag: [e.g. 1.0.0]

**Sarek (please complete the following information):**
- version: [e.g. 2.1.0]

**Additional context**
Add any other context about the problem here.
55 changes: 33 additions & 22 deletions .github/RELEASE_CHECKLIST.md
@@ -1,23 +1,34 @@
# Release checklist
This checklist is for our own reference

1. Check that everything is up to date and ready to go
- Travis tests are passing
- Manual tests on Bianca are passing
2. Increase version numbers
3. Update version numbers in code: `configuration/base.config`
4. Build and get the containers
- `./scripts/do_all.sh --push --tag <VERSION>`
- `./scripts/do_all.sh --pull --tag <VERSION>`
5. Test against sample data.
- Check for any command line errors
- Check version numbers are printed correctly
- `./scripts/test.sh -p docker --tag <VERSION>`
- `./scripts/test.sh -p singularity --tag <VERSION>`
- `./scripts/test.sh -p singularityPath --tag <VERSION>`
6. Commit and push version updates
7. Make a [release](https://github.com/SciLifeLab/Sarek/releases) on GitHub
8. Choose an appropriate codename for the release
9. Tweet that new version is released
10. Commit and push. Continue making more awesome :metal:
11. Have fika :cake:

> This checklist is for our own reference, to help us prepare a new release
1. Check that everything is ready to go

- [PRs](https://github.com/SciLifeLab/Sarek/pulls) are merged
- [Travis tests](https://travis-ci.org/SciLifeLab/Sarek/branches) are passing on `dev`

2. Increase version number following [semantic versioning](http://semver.org/spec/v2.0.0.html)
3. Choose an appropriate codename for the release
- e.g. peaks in [Sarek National Park](https://en.wikipedia.org/wiki/Sarek_National_Park#Topography)
4. Build docker containers.

- `./scripts/do_all.sh --tag <VERSION>`

5. Test against sample data.

- `./scripts/test.sh -p docker --tag <VERSION>`
- Check for any command line errors

6. Use script to update version in files:

- `./scripts/do_release.sh -r "<VERSION>" -c "<CODENAME>"`

7. Push latest updates
8. Make a PR against `dev`
9. Merge said PR
10. Make a [release](https://github.com/SciLifeLab/Sarek/releases) on GitHub
11. Update [bio.tools](https://bio.tools/Sarek) with the new release details
12. Tweet that a new version is released
13. Add a new `Unreleased` section in `CHANGELOG.md` for the `dev` version
14. Commit and push. Continue making more awesome :metal:
15. Have fika :cake:
12 changes: 2 additions & 10 deletions .travis.yml
@@ -11,19 +11,11 @@ env:
global:
- NXF_VER=0.31.0 SGT_VER=2.5.1
matrix:
- CE=singularity TEST=TOOLS
- CE=singularity TEST=MANTA
- CE=docker TEST=MANTA
- CE=docker TEST=TOOLS
- CE=docker TEST=SOMATIC
- CE=docker TEST=ANNOTATEVEP
- CE=singularity TEST=ANNOTATESNPEFF
- CE=singularity TEST=STEP
- CE=singularity TEST=GERMLINE
- CE=singularity TEST=DIR
- CE=docker TEST=ANNOTATESNPEFF
- CE=docker TEST=STEP
- CE=docker TEST=GERMLINE
- CE=docker TEST=DIR


install:
# Install Nextflow (and Singularity if needed)
322 changes: 184 additions & 138 deletions CHANGELOG.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Dockerfile
@@ -6,5 +6,5 @@ LABEL \
maintainer="Maxime Garcia <[email protected]>, Szilveszter Juhos <[email protected]>"

COPY environment.yml /
RUN conda env update -n root -f /environment.yml && conda clean -a
ENV PATH /opt/conda/bin:$PATH
RUN conda env create -f /environment.yml && conda clean -a
ENV PATH /opt/conda/envs/sarek-2.2.0/bin:$PATH
19 changes: 10 additions & 9 deletions README.md
@@ -1,6 +1,6 @@
# [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://opensource.scilifelab.se/projects/sarek/)
# [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://sarek.scilifelab.se/)

#### An open-source analysis pipeline to detect germline or somatic variants from whole genome sequencing
#### An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing

[![Nextflow version][nextflow-badge]][nextflow-link]
[![Travis build status][travis-badge]][travis-link]
@@ -21,7 +21,7 @@ Previously known as the Cancer Analysis Workflow (CAW),
Sarek is a workflow designed to run analyses on WGS data from regular samples or tumour / normal pairs, including relapse samples if required.

It's built using [Nextflow][nextflow-link], a domain specific language for workflow building.
Software dependencies are handled using [Docker](https://www.docker.com) or [Singularity](http://singularity.lbl.gov) - container technologies that provide excellent reproducibility and ease of use.
Software dependencies are handled using [Docker](https://www.docker.com) or [Singularity](https://www.sylabs.io/singularity/) - container technologies that provide excellent reproducibility and ease of use.
Singularity has been designed specifically for high-performance computing environments.
This means that although Sarek has been primarily designed for use with the Swedish [UPPMAX HPC systems](https://www.uppmax.uu.se), it should be able to run on any system that supports these two tools.

@@ -82,12 +82,13 @@ The Sarek pipeline comes with documentation in the `docs/` directory:
06. [Configuration and profiles documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONFIG.md)
07. [Intervals documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/INTERVALS.md)
08. [Running the pipeline](https://github.com/SciLifeLab/Sarek/blob/master/docs/USAGE.md)
09. [Examples](https://github.com/SciLifeLab/Sarek/blob/master/docs/USE_CASES.md)
10. [TSV file documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/TSV.md)
11. [Processes documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/PROCESS.md)
12. [Documentation about containers](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONTAINERS.md)
13. [More information about ASCAT](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md)
14. [Output documentation structure](https://github.com/SciLifeLab/Sarek/blob/master/docs/OUTPUT.md)
09. [Command line parameters](https://github.com/SciLifeLab/Sarek/blob/master/docs/PARAMETERS.md)
10. [Examples](https://github.com/SciLifeLab/Sarek/blob/master/docs/USE_CASES.md)
11. [Input files documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/INPUT.md)
12. [Processes documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/PROCESS.md)
13. [Documentation about containers](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONTAINERS.md)
14. [More information about ASCAT](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md)
15. [Output documentation structure](https://github.com/SciLifeLab/Sarek/blob/master/docs/OUTPUT.md)

## Contributions & Support

2 changes: 1 addition & 1 deletion Sarek-data
Submodule Sarek-data updated 2 files
+7 −18 README.md
+2 −0 testdata/target.bed
2 changes: 1 addition & 1 deletion Singularity
@@ -7,7 +7,7 @@ Bootstrap:docker
VERSION 2.1.0

%environment
PATH=/opt/conda/envs/sarek-2.1.0/bin:$PATH
PATH=/opt/conda/envs/sarek-2.2.0/bin:$PATH
export PATH

%files
12 changes: 7 additions & 5 deletions annotate.nf
@@ -26,9 +26,13 @@ kate: syntax groovy; space-indent on; indent-width 2;
https://github.com/SciLifeLab/Sarek/README.md
--------------------------------------------------------------------------------
Processes overview
- RunBcftoolsStats - Run BCFTools stats on vcf before annotation
- RunBcftoolsStats - Run BCFTools stats on vcf files
- RunVcftools - Run VCFTools on vcf files
- RunSnpeff - Run snpEff for annotation of vcf files
- RunVEP - Run VEP for annotation of vcf files
- CompressVCF - Compress and index vcf files using tabix
- GetVersionSnpeff - Get version of tools
- GetVersionVEP - Get version of tools
================================================================================
= C O N F I G U R A T I O N =
================================================================================
@@ -89,8 +93,6 @@ vcfNotToAnnotate.close()

// as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any



(vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4)

vcfForVep = vcfForVep.map {
@@ -214,7 +216,7 @@ process RunVEP {
finalannotator = annotator == "snpeff" ? 'merge' : 'vep'
genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome
"""
vep --dir /opt/vep/.vep/ \
/opt/vep/src/ensembl-vep/vep --dir /opt/vep/.vep/ \
-i ${vcf} \
-o ${vcf.simpleName}_VEP.ann.vcf \
--assembly ${genome} \
@@ -265,7 +267,7 @@ if (params.verbose) vcfCompressedoutput = vcfCompressedoutput.view {
"Index : ${it[3].fileName}"
}

process GetVersionSnpEFF {
process GetVersionSnpeff {
publishDir directoryMap.version, mode: 'link'
output: file("v_*.txt")
when: 'snpeff' in tools || 'merge' in tools
85 changes: 85 additions & 0 deletions bin/concatenateVCFs.sh
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# this script concatenates all VCFs that are in the local directory: the
# purpose is to make a single VCF from all the VCFs that were created from different intervals

usage() { echo "Usage: $0 -i <genome_index_file> -o <output_file_without_gz_extension> [-t <target.bed>] [-c <cpus>]" 1>&2; exit 1; }

while getopts "i:c:o:t:" p; do
case "${p}" in
i)
genomeIndex=${OPTARG}
;;
c)
cpus=${OPTARG}
;;
o)
outputFile=${OPTARG}
;;
t)
targetBED=${OPTARG}
;;
*)
usage
;;
esac
done
shift $((OPTIND-1))

if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

set -euo pipefail

# first make a header from one of the VCF intervals
# get rid of interval information only from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
# header in the VCF as it is optional (FreeBayes does not save it, for example)
CONTIGS=($(cut -f1 ${genomeIndex}))

# concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz

set +u

# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
echo "target is $targetBED"
if [ ! -z ${targetBED+x} ]; then
echo "Selecting subset..."
bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
tabix ${outputFile}.gz
else
# simply rename the raw calls as WGS results
for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
fi
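
Two of the shell idioms in the script above do the heavy lifting: `compgen -G` tests whether a glob matches anything (so absent contigs can be skipped silently), `ls -v` sorts numerically so `chr1_99_` precedes `chr1_100_`, and the `sed` one-liner counts header lines without reading the whole file. A small self-contained sketch of these idioms, with illustrative file names not taken from the pipeline:

```shell
#!/usr/bin/env bash
set -euo pipefail

# Scratch directory with interval-named VCF shards, as the pipeline would produce.
workdir=$(mktemp -d)
cd "$workdir"
touch chr1_99_calls.vcf chr1_100_calls.vcf chr1_1000_calls.vcf
printf '##fileformat=VCFv4.2\n#CHROM\tPOS\nchr1\t100\n' > chr1_99_calls.vcf

# compgen -G succeeds only when the glob matches at least one file, so a
# missing contig is skipped without an "ls: cannot access" error.
if compgen -G "chr2_*.vcf" > /dev/null; then echo "chr2 present"; else echo "chr2 absent"; fi

# ls -v sorts by numeric value: 99 < 100 < 1000, unlike plain lexical ls.
ls -v chr1_*.vcf

# Print header lines and quit at the first record line; counting them gives
# the header length without reading the entire file.
sed -n '/^[^#]/q;p' chr1_99_calls.vcf | wc -l
```

With plain `ls`, `chr1_100_calls.vcf` would sort before `chr1_99_calls.vcf` and records would be emitted out of genomic order, which is why the script insists on `ls -v`.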

2 changes: 1 addition & 1 deletion buildContainers.nf
@@ -98,7 +98,7 @@ process PullSingularityContainers {

script:
"""
singularity pull --name ${container}-${params.tag}.img docker://${params.repository}/${container}:${params.tag}
singularity build ${container}-${params.tag}.simg docker://${params.repository}/${container}:${params.tag}
"""
}

3 changes: 1 addition & 2 deletions buildReferences.nf
@@ -26,7 +26,6 @@ kate: syntax groovy; space-indent on; indent-width 2;
https://github.com/SciLifeLab/Sarek/README.md
--------------------------------------------------------------------------------
Processes overview
- ProcessReference - Download all references if needed
- DecompressFile - Extract files if needed
- BuildBWAindexes - Build indexes for BWA
- BuildReferenceIndex - Build index for FASTA refs
@@ -144,7 +143,7 @@ process BuildReferenceIndex {
}

if (params.verbose) ch_referenceIndex.view {
"Reference index : ${it.fileName}"
"Reference index : ${it.fileName}"
}

process BuildSAMToolsIndex {
3 changes: 2 additions & 1 deletion conf/base.config
@@ -33,9 +33,10 @@ params {
step = 'mapping' // Default step is mapping
strelkaBP = false // Don't use Manta's candidate indels as input to Strelka
tag = 'latest' // Default tag is latest, to be overwritten by --tag <version>
targetBED = false // no targets by default
test = false // Not testing by default
verbose = false // Enable for more verbose information
version = '2.1.0' // Workflow version
version = '2.2.0' // Workflow version
}

process {
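
The new `targetBED` parameter above defaults to `false` (whole-genome mode), so enabling targeted WES/panel analysis is a matter of overriding it alongside the container tag. A hypothetical user-config sketch — the file name and BED path are illustrative, not part of the release:

```groovy
// custom.config -- hypothetical override file, passed with `nextflow run ... -c custom.config`
params {
  tag       = '2.2.0'        // use the released container tag instead of 'latest'
  targetBED = 'targets.bed'  // BED of capture targets; false (the default) means WGS
}
```

With `targetBED` set, the `concatenateVCFs.sh` step added in this release subsets the concatenated calls to the target regions via `bcftools isec --targets-file`.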
27 changes: 0 additions & 27 deletions conf/containers.config
@@ -38,36 +38,9 @@ process {
withName:GetVersionASCAT {
container = "${params.repository}/r-base:${params.tag}"
}
withName:GetVersionBamQC {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionBCFtools {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionBWAsamtools {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionFastQC {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionFreeBayes {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionGATK {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionManta {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionSnpeff {
container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"}
}
withName:GetVersionStrelka {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionVCFtools {
container = "${params.repository}/sarek:${params.tag}"
}
withName:GetVersionVEP {
container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"}
}
