8.1. Introduction

The goal is to analyse 1 SARS-CoV-2 sequencing dataset, in order to 1) infer the full sequence of the virus, and 2) Detect the clade (alpha, beta, etc.). To do so we will start from a sample that has been sequenced on an Illumina sequencer, and we will run the following steps:

  1. Mapping the reads on a reference genome

  2. SNP Calling

  3. Building consensus sequence

  4. Detect clade

You should build a workflow that looks like:

../_images/wf.png

8.2. Input data

Input data can be downloaded here .

It consists of :

8.3. Detailed steps

8.3.1. Read mapping

  • Read mapping will be performed using bwa mem .

  • Command lines:

    • Indexing the reference genome:
      bwa index reference.fa
      
    • Mapping the reads:
      bwa mem -t 1 reference.fa reads1.fq reads2.fq > tmp.sam
      
    • Converting sam file to bam file:
      samtools sort -o sample.bam tmp.sam
      samtools index sample.bam
      

8.3.2. SNP Calling

  • SNP Calling will be performed with iVAR .

  • Command lines:
    samtools mpileup -A -d 600000 --reference reference.fasta -B -Q 0 sample.bam | ivar variants -r reference.fa -p sample.variants -q 0 -t 0.02
    

8.3.3. Consensus sequence

  • Consensus sequence will be inferred also using iVAR

  • Command lines :
    samtools mpileup -d 600000 -A -Q 0 -F 0 sample.bam | ivar consensus -q 20 -t 0 -m 5 -n N -p sample_consensus
    

8.3.4. Detecting clade (NextClade)

  • Clade detection is performed by Nextclade .

  • You will also need a tree file and a config file as input files.

  • The command line looks like::
    nextclade -i 'sample_consensus.fa' -j 1 -t 'NextClade_annotations_ivar.tsv' -o 'NextClade_annotations_ivar.json'
    

8.3.5. Detecting clade (Pangolin)

  • Clade detection is performed by Pangolin .

  • Command lines :
    PATH=/opt/conda/envs/pangolin/bin/:\$PATH
    pangolin 'sample_consensus.fa' -t 20 --outfile Pangolin_lineage_report.csv
    

8.3.5.1. List of tools needed

  • samtools (evolbioinfo/samtools:v1.11);

  • iVAR (evolbioinfo/ivar:v1.3.1);

  • Nextclade (neherlab/nextclade:0.14.4);

  • Pangolin (evolbioinfo/pangolin:v3.1.4);

  • bwa (evolbioinfo/bwa:v0.7.17).

8.3.5.2. Proposition of implementation

8.3.5.2.1. Nextflow

nextflow run main.nf --reference "fasta/nCov_ampliseq_up.fasta" --fastq "fastq_files" --resultdir "results"

main.nf

params.reference = "fasta/nCov_ampliseq_up.fasta"
params.fastq = "fastq_files/"
params.resultdir = "results"

referenceFile = file(params.reference)
data = Channel.fromFilePairs(params.fastq+'/*_{1,2}.fq.gz', flat:true)
resultdir=file(params.resultdir)


process indexRef {
    input:
    file ref from referenceFile

    output:
    tuple val(ref.name), file("${ref.baseName}.*") into index

    script:
    """
    bwa index ${ref}
    """
    }

process mapping {
    publishDir "${resultdir}/bams/", mode:'copy'

    input: 
    tuple val(name), file(f1), file(f2) from data
    tuple val(refName), file(ref) from index.first()

    output:
    tuple val(name), file("*.bam"), file("*.bai") into bam
    
    script:
    """
    bwa mem -t ${task.cpus} ${refName} ${f1} ${f2} > tmp.sam  
    samtools sort -o ${name}.bam tmp.sam
    samtools index ${name}.bam
    """
    }

process consensus {
    publishDir "${resultdir}/consensus/", mode: 'copy'
    
    input:
    tuple val(name), file(bam), file(bai) from bam

    output:
    file "${name}.fa" into consensus_channel

    script:
    """
    samtools mpileup -d 600000 -A -Q 0 -F 0 ${bam} | ivar consensus -q 20 -t 0 -m 5 -n N -p ${name}
    """
}

consensus_channel.collectFile(name:"all_consensus.fasta").into{consensus_nextclade;consensus_pangolin}

process nextclade {
    publishDir "${resultdir}/", mode: 'copy'

    input:
    file fa from consensus_nextclade

    output:
    file "*_ivar.*" into nextclade_channel

    script:
    """
    nextclade -i ${fa} -j ${task.cpus} -t 'NextClade_annotations_ivar.tsv' -o 'NextClade_annotations_ivar.json'
    """
}



process pangolin {
    publishDir "${resultdir}/", mode:'copy'

    input:
    file fa from consensus_pangolin

    output:
    file "*.csv" into pangolin_channel

    script:
    """
    PATH=/opt/conda/envs/pangolin/bin/:\$PATH
    pangolin ${fa} -t ${task.cpus} --outfile Pangolin_lineage_report.csv
    """
}

nextflow.config

singularity {
	enabled = true
	autoMounts = true
	runOptions = '--home $HOME:/home/$USER --bind /pasteur'
}

process {
	executor='local'
	cpus=1
       	memory="3G"
	publishDir="results/"	
	withName: 'indexRef'  {
		container = "evolbioinfo/bwa:v0.7.17"
		cpus=1
	    memory="3G"
	}
	withName: 'mapping' { 
		container = "evolbioinfo/bwa:v0.7.17"
		cpus=1
    	memory="3G"
	}
	withName: 'consensus' {
		container = "evolbioinfo/ivar:v1.3.1"
		cpus=1
	    memory="3G"
	}
	withName: 'nextclade' {
		container = "neherlab/nextclade:0.14.4"
		cpus=1
	    memory="3G"
	}
	withName: 'pangolin' {
		container = "evolbioinfo/pangolin:v3.1.4"
		cpus=1
	    memory="3G"
	}
}

report {
    enabled = true
    file = 'reports/report.html'
}

trace {
    enabled = true
    file = 'reports/trace.txt'
}

timeline {
    enabled = true
    file = 'reports/timeline.html'
}

dag {
    enabled = true
    file = 'reports/dag.dot'
}

nextflow.config (pour maestro)

singularity {
	enabled = true
	autoMounts = true
	runOptions = '--home $HOME:/home/$USER --bind /pasteur'
}

executor {
    name = 'slurm'
    queueSize = 2000
}

process {
	executor='slurm'
	queue = "common,dedicated"
    clusterOptions = "--qos=fast"

    scratch=false
    maxRetries=30
    errorStrategy='retry'

	publishDir="results/"	
	withName: 'indexRef'  {
		container = "evolbioinfo/bwa:v0.7.17"
		cpus=1
	    memory="3G"
	}
	withName: 'mapping' { 
		container = "evolbioinfo/bwa:v0.7.17"
		cpus=10
    	memory="3G"
	}
	withName: 'consensus' {
		container = "evolbioinfo/ivar:v1.3.1"
		cpus=1
	    memory="3G"
	}
	withName: 'nextclade' {
		container = "neherlab/nextclade:0.14.4"
		cpus=10
	    memory="3G"
	}
	withName: 'pangolin' {
		container = "evolbioinfo/pangolin:v3.1.4"
		cpus=10
	    memory="3G"
	}
}

report {
    enabled = true
    file = 'reports/report.html'
}

trace {
    enabled = true
    file = 'reports/trace.txt'
}

timeline {
    enabled = true
    file = 'reports/timeline.html'
}

dag {
    enabled = true
    file = 'reports/dag.dot'
}

8.3.5.2.2. Snakemake

snakemake --snakefile Snakefile -j 1 --use-singularity

Snakefile

samples = ['A', 'B', 'C', 'D']



# wget https://github.com/nextstrain/nextclade/raw/master/data/sars-cov-2/tree.json
# wget https://raw.githubusercontent.com/nextstrain/nextclade/master/data/sars-cov-2/qc.json

rule all:
    input:         
        "nexclade.json",
        expand("{sample}.variants.tsv", sample=samples),
        "pangolin.json"

rule index:
    input: "covid.fa"
    output: "covid.fa.bwt"
    container: "docker://evolbioinfo/bwa:v0.7.17"
    shell:
        """
        bwa index {input}
        """


rule mapping:
    input:
        f1= "{sample}_1.fq.gz",
        f2= "{sample}_2.fq.gz",
        ref= "covid.fa",
        indexf= "covid.fa.bwt"
    output: "{sample}.sam"
    container: "docker://evolbioinfo/bwa:v0.7.17"
    shell:
        """
        bwa mem -t 1 {input.ref} {input.f1} {input.f2} > {output}
        """

rule sort:
    input: "{sample}.sam"
    output:
        bam="{sample}.bam",
        bai="{sample}.bam.bai"
    container: "docker://evolbioinfo/samtools:v1.11"
    shell:
        """
        samtools sort -o {output.bam} {input}
        samtools index {output.bam}
        """

rule pileup:
    input:
        bam="{sample}.bam",
        bai="{sample}.bam.bai",
        ref="covid.fa"
    output: "{sample}.variants.tsv"
    container: "docker://evolbioinfo/ivar:v1.3.1"
    shell:
        """
        samtools mpileup -A -d 600000 --reference {input.ref} -B -Q  0 {input.bam} |  ivar variants -r {input.ref} -p {wildcards.sample}.variants -q 0 -t 0.02 
        """

rule consensus:
    input:
        bam="{sample}.bam",
        bai="{sample}.bam.bai",
    output: 
        fasta="{sample}_consensus.fa",
        qual="{sample}_consensus.qual.txt",
    container: "docker://evolbioinfo/ivar:v1.3.1"
    shell:
        """
        samtools mpileup -d 600000 -A -Q 0 -F 0 {input.bam} | ivar consensus -q 20 -t 0 -m 5 -n N -p {wildcards.sample}_consensus

        """

rule sequences:
    input: expand("{sample}_consensus.fa", sample=samples)
    output: "sequences.fa"
    shell:
        """
        cat {input} > {output}
        """

rule nextclade:
    input: 
        consensus="sequences.fa",
        ref="covid.fa",
        qc="qc.json"
    output: 
        "nexclade.json"
    container: "docker://neherlab/nextclade:0.14.4"
    shell:
        """
        nextclade --input-fasta {input.consensus} -j 1  --input-root-seq={input.ref} --verbose -o {output}
        """

rule pangolin:
    input: 
        consensus="sequences.fa",
        ref="covid.fa",
        genemap="genemap.gff",
    output: 
        "pangolin.json"
    container: "docker://evolbioinfo/pangolin:v3.1.4"
    shell:
        """
        PATH=/opt/conda/envs/pangolin/bin/:\$PATH
        pangolin  {input.consensus}  -t 20 --outfile Pangolin_lineage_report.csv
        """