8.1. Introduction¶
The goal is to analyse 1 SARS-CoV-2 sequencing dataset, in order to 1) infer the full sequence of the virus, and 2) Detect the clade (alpha, beta, etc.). To do so we will start from a sample that has been sequenced on an Illumina sequencer, and we will run the following steps:
Mapping the reads on a reference genome
SNP Calling
Building consensus sequence
Detect clade
You should build a workflow that looks like:

8.2. Input data¶
Input data can be downloaded here .
It consists of :
A compressed fastq file containing all the reads;
The reference genome to map the reads against (https://www.ncbi.nlm.nih.gov/nuccore/MN908947)
8.3. Detailed steps¶
8.3.1. Read mapping¶
Read mapping will be performed using bwa mem .
Command lines:
- Indexing the reference genome:
bwa index reference.fa
- Mapping the reads:
bwa mem -t 1 reference.fa reads1.fq reads2.fq > tmp.sam
- Converting sam file to bam file:
samtools sort -o sample.bam tmp.sam samtools index sample.bam
8.3.2. SNP Calling¶
SNP Calling will be performed with iVAR .
- Command lines:
samtools mpileup -A -d 600000 --reference reference.fasta -B -Q 0 sample.bam | ivar variants -r reference.fa -p sample.variants -q 0 -t 0.02
8.3.3. Consensus sequence¶
Consensus sequence will be inferred also using iVAR
- Command lines :
samtools mpileup -d 600000 -A -Q 0 -F 0 sample.bam | ivar consensus -q 20 -t 0 -m 5 -n N -p sample_consensus
8.3.4. Detecting clade (NextClade)¶
Clade detection is performed by Nextclade .
You will also need a tree file and a config file as input files.
- The command line looks like::
nextclade -i 'sample_consensus.fa' -j 1 -t 'NextClade_annotations_ivar.tsv' -o 'NextClade_annotations_ivar.json'
8.3.5. Detecting clade (Pangolin)¶
Clade detection is performed by Pangolin .
- Command lines :
PATH=/opt/conda/envs/pangolin/bin/:\$PATH pangolin 'sample_consensus.fa' -t 20 --outfile Pangolin_lineage_report.csv
8.3.5.1. List of tools needed¶
8.3.5.2. Proposition of implementation¶
8.3.5.2.1. Nextflow¶
nextflow run main.nf --reference "fasta/nCov_ampliseq_up.fasta" --fastq "fastq_files" --resultdir "results"
main.nf
params.reference = "fasta/nCov_ampliseq_up.fasta"
params.fastq = "fastq_files/"
params.resultdir = "results"
referenceFile = file(params.reference)
data = Channel.fromFilePairs(params.fastq+'/*_{1,2}.fq.gz', flat:true)
resultdir=file(params.resultdir)
process indexRef {
input:
file ref from referenceFile
output:
tuple val(ref.name), file("${ref.baseName}.*") into index
script:
"""
bwa index ${ref}
"""
}
process mapping {
publishDir "${resultdir}/bams/", mode:'copy'
input:
tuple val(name), file(f1), file(f2) from data
tuple val(refName), file(ref) from index.first()
output:
tuple val(name), file("*.bam"), file("*.bai") into bam
script:
"""
bwa mem -t ${task.cpus} ${refName} ${f1} ${f2} > tmp.sam
samtools sort -o ${name}.bam tmp.sam
samtools index ${name}.bam
"""
}
process consensus {
publishDir "${resultdir}/consensus/", mode: 'copy'
input:
tuple val(name), file(bam), file(bai) from bam
output:
file "${name}.fa" into consensus_channel
script:
"""
samtools mpileup -d 600000 -A -Q 0 -F 0 ${bam} | ivar consensus -q 20 -t 0 -m 5 -n N -p ${name}
"""
}
consensus_channel.collectFile(name:"all_consensus.fasta").into{consensus_nextclade;consensus_pangolin}
process nextclade {
publishDir "${resultdir}/", mode: 'copy'
input:
file fa from consensus_nextclade
output:
file "*_ivar.*" into nextclade_channel
script:
"""
nextclade -i ${fa} -j ${task.cpus} -t 'NextClade_annotations_ivar.tsv' -o 'NextClade_annotations_ivar.json'
"""
}
process pangolin {
publishDir "${resultdir}/", mode:'copy'
input:
file fa from consensus_pangolin
output:
file "*.csv" into pangolin_channel
script:
"""
PATH=/opt/conda/envs/pangolin/bin/:\$PATH
pangolin ${fa} -t ${task.cpus} --outfile Pangolin_lineage_report.csv
"""
}
nextflow.config
singularity {
enabled = true
autoMounts = true
runOptions = '--home $HOME:/home/$USER --bind /pasteur'
}
process {
executor='local'
cpus=1
memory="3G"
publishDir="results/"
withName: 'indexRef' {
container = "evolbioinfo/bwa:v0.7.17"
cpus=1
memory="3G"
}
withName: 'mapping' {
container = "evolbioinfo/bwa:v0.7.17"
cpus=1
memory="3G"
}
withName: 'consensus' {
container = "evolbioinfo/ivar:v1.3.1"
cpus=1
memory="3G"
}
withName: 'nextclade' {
container = "neherlab/nextclade:0.14.4"
cpus=1
memory="3G"
}
withName: 'pangolin' {
container = "evolbioinfo/pangolin:v3.1.4"
cpus=1
memory="3G"
}
}
report {
enabled = true
file = 'reports/report.html'
}
trace {
enabled = true
file = 'reports/trace.txt'
}
timeline {
enabled = true
file = 'reports/timeline.html'
}
dag {
enabled = true
file = 'reports/dag.dot'
}
nextflow.config (pour maestro)
singularity {
enabled = true
autoMounts = true
runOptions = '--home $HOME:/home/$USER --bind /pasteur'
}
executor {
name = 'slurm'
queueSize = 2000
}
process {
executor='slurm'
queue = "common,dedicated"
clusterOptions = "--qos=fast"
scratch=false
maxRetries=30
errorStrategy='retry'
publishDir="results/"
withName: 'indexRef' {
container = "evolbioinfo/bwa:v0.7.17"
cpus=1
memory="3G"
}
withName: 'mapping' {
container = "evolbioinfo/bwa:v0.7.17"
cpus=10
memory="3G"
}
withName: 'consensus' {
container = "evolbioinfo/ivar:v1.3.1"
cpus=1
memory="3G"
}
withName: 'nextclade' {
container = "neherlab/nextclade:0.14.4"
cpus=10
memory="3G"
}
withName: 'pangolin' {
container = "evolbioinfo/pangolin:v3.1.4"
cpus=10
memory="3G"
}
}
report {
enabled = true
file = 'reports/report.html'
}
trace {
enabled = true
file = 'reports/trace.txt'
}
timeline {
enabled = true
file = 'reports/timeline.html'
}
dag {
enabled = true
file = 'reports/dag.dot'
}
8.3.5.2.2. Snakemake¶
snakemake --snakefile Snakefile -j 1 --use-singularity
Snakefile
samples = ['A', 'B', 'C', 'D']
# wget https://github.com/nextstrain/nextclade/raw/master/data/sars-cov-2/tree.json
# wget https://raw.githubusercontent.com/nextstrain/nextclade/master/data/sars-cov-2/qc.json
rule all:
input:
"nexclade.json",
expand("{sample}.variants.tsv", sample=samples),
"pangolin.json"
rule index:
input: "covid.fa"
output: "covid.fa.bwt"
container: "docker://evolbioinfo/bwa:v0.7.17"
shell:
"""
bwa index {input}
"""
rule mapping:
input:
f1= "{sample}_1.fq.gz",
f2= "{sample}_2.fq.gz",
ref= "covid.fa",
indexf= "covid.fa.bwt"
output: "{sample}.sam"
container: "docker://evolbioinfo/bwa:v0.7.17"
shell:
"""
bwa mem -t 1 {input.ref} {input.f1} {input.f2} > {output}
"""
rule sort:
input: "{sample}.sam"
output:
bam="{sample}.bam",
bai="{sample}.bam.bai"
container: "docker://evolbioinfo/samtools:v1.11"
shell:
"""
samtools sort -o {output.bam} {input}
samtools index {output.bam}
"""
rule pileup:
input:
bam="{sample}.bam",
bai="{sample}.bam.bai",
ref="covid.fa"
output: "{sample}.variants.tsv"
container: "docker://evolbioinfo/ivar:v1.3.1"
shell:
"""
samtools mpileup -A -d 600000 --reference {input.ref} -B -Q 0 {input.bam} | ivar variants -r {input.ref} -p {wildcards.sample}.variants -q 0 -t 0.02
"""
rule consensus:
input:
bam="{sample}.bam",
bai="{sample}.bam.bai",
output:
fasta="{sample}_consensus.fa",
qual="{sample}_consensus.qual.txt",
container: "docker://evolbioinfo/ivar:v1.3.1"
shell:
"""
samtools mpileup -d 600000 -A -Q 0 -F 0 {input.bam} | ivar consensus -q 20 -t 0 -m 5 -n N -p {wildcards.sample}_consensus
"""
rule sequences:
input: expand("{sample}_consensus.fa", sample=samples)
output: "sequences.fa"
shell:
"""
cat {input} > {output}
"""
rule nextclade:
input:
consensus="sequences.fa",
ref="covid.fa",
qc="qc.json"
output:
"nexclade.json"
container: "docker://neherlab/nextclade:0.14.4"
shell:
"""
nextclade --input-fasta {input.consensus} -j 1 --input-root-seq={input.ref} --verbose -o {output}
"""
rule pangolin:
input:
consensus="sequences.fa",
ref="covid.fa",
genemap="genemap.gff",
output:
"pangolin.json"
container: "docker://evolbioinfo/pangolin:v3.1.4"
shell:
"""
PATH=/opt/conda/envs/pangolin/bin/:\$PATH
pangolin {input.consensus} -t 20 --outfile Pangolin_lineage_report.csv
"""