1 Fastq-dump - loading FASTQ files from SRA database
2 Trimmomatic - read preprocessing \(\bullet\) generate different read length
1. trim adapters
2. sliding window trimming
3. remove reads shorter than 100 nt
4. trim reads to 100, 75, 50 and 36 nt
read_length=(100 75 50 36)
for length in ${read_length[@]}; do
for SRR in `ls -1 *_1.fastq.gz | sed 's/\_1.fastq.gz//'`; do # go through all read pairs
echo trimmomatic PE $SRR\_1.fastq.gz $SRR\_2.fastq.gz \ # pass paired forward and reverse reads
$length\_$SRR\_R1_paired.fastq.gz $length\_$SRR\_R1_unpaired.fastq.gz \ # define 4 output files
$length\_$SRR\_R2_paired.fastq.gz $length\_$SRR\_R2_unpaired.fastq.gz \
ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:8:true \ # cut adapters
SLIDINGWINDOW:4:20 \ # sliding window trimming
MINLEN:100 \ # remove reads < 100 nt
CROP:$length >> /processed_reads/endodermal_cell/cmd_file_$length
done
bash /processed_reads/endodermal_cell/cmd_file_$length
done4 STAR - read mapping
read_length=(100 75 50 36)
gtf=gencode.v41.annotation.gtf
genome=GRCh38.p13.genome.fa
gd=genome_indices
# GENERATING GENOME INDEXES for all read length
for length in ${read_length[@]}; do
STAR \
--runThreadN 6 \
--runMode genomeGenerate \ # generate genome files
--genomeDir $gd/$length \ # path to previously created directory with genome index files
--genomeFastaFiles $genome \ # FASTA files with genome reference sequences
--sjdbGTFfile $gtf \ # path to file with annotated transcripts
--sjdbOverhang $length-1 # specify max possible overhang for the reads
done
# READ MAPPING
for length in ${read_length[@]}; do
for SRR in `ls -1 $length\_*_R1_paired.fastq.gz | sed 's/\_R1_paired.fastq.gz//'`; do
STAR \
--runThreadN 6 \
--genomeDir $gd/$length \
--readFilesCommand zcat \ # unzip gzipped files
--readFilesIn $SRR\_R1_paired.fastq.gz $SRR\_R2_paired.fastq.gz \ # path to forward and reverse read
--outFileNamePrefix star_mapping/U2AF2/$SRR\_ \ # change file prefixes
--outFilterMultimapNmax 1 \ # max number of multiple alignments allowed for a read
--outFilterMismatchNmax 999 \ # max number of mismatches per pair --> switched off
--outFilterMismatchNoverLmax 0.04 \ # max number of mismatches per pair relative to read length
--outSAMtype BAM SortedByCoordinate # set output type to BAM sorted by coordinate
done
done5 Samtools - subsampling bam files \(\bullet\) generate different read depths
# shrink read depth to 30M, 25M, 20M, 15M, 10M, 2.5M, 1M reads
# ~ 30M reads
samtools view -b -s 0.96323430963552176957701491760975332532564543923003 U2AF2_100_CT_1.bam > read_depth/U2AF2_CT_1_30M_reads.bam6 MAJIQ
6.1
majiq build - LSV detection 🔎
Configuration file
To run majiq build, a configuration file is needed that
contains the bam file paths and information about read length,
strandness, the reference genome and the grouping of the
experiments.
The config file was created for 100, 75, 50 and 36 nt read length and for each of the 1 - 30 million reads read depth.
[info]
readlen=100
bamdirs=star_mapping/U2AF2/100/read_length
genome=GRCh38
strandness=reverse
[experiments]
U2AF2_100_KD=U2AF2_100_KD_1,U2AF2_100_KD_2
U2AF2_100_CT=U2AF2_100_CT_1,U2AF2_100_CT_2
LSV detection for different read length:
# FILES AND DIRS
GFF3="gencode.v41.annotation.gff3"
CONFIGFILE="majiq/build"
OUTDIR="majiq/build"
genes=(U2AF2 UPF1)
read_length=(100 75 50 36)
for gene in ${genes[@]}; do
for length in ${read_length[@]}; do
majiq build $GFF3 \
-c $CONFIGFILE/$gene/read_length/$length/${gene}_${length}_config_file_.txt \
-j 6 \
-o $OUTDIR/$gene/read_length/$length
done
doneLSV detection for different read depth:
# FILES AND DIRS
GFF3="gencode.v41.annotation.gff3"
CONFIGFILE="majiq/build"
OUTDIR="majiq/build"
reads=(30M 25M 20M 15M 10M 5M 2.5M 1M)
for reads in ${reads[@]}; do
majiq build $GFF3 \
-c $CONFIGFILE/U2AF2/read_depth/$reads/U2AF2_${reads}_reads_config_file.txt \
-j 6 \
-o $OUTDIR/U2AF2/read_depth/$reads
done 6.2
majiq deltapsi - LSV quantification \(\Delta\Psi\)
# FILES AND DIRS
BUILDDIR="majiq/build"
OUTDIR="majiq/dpsi"
genes=(U2AF2 UPF1)
read_length=(100 75 50 36)
for gene in ${genes[@]}; do
for length in ${read_length[@]}; do
majiq deltapsi \
-grp1 \
"${BUILDDIR}/$gene/read_length/$length/${gene}_${length}_KD_1.majiq" \
"${BUILDDIR}/$gene/read_length/$length/${gene}_${length}_KD_2.majiq" \
-grp2 \
"${BUILDDIR}/$gene/read_length/$length/${gene}_${length}_CT_1.majiq" \
"${BUILDDIR}/$gene/read_length/$length/${gene}_${length}_CT_2.majiq" \
--names $gene\_$length\_KD $gene\_$length\_CT \
-j 6 \
--output-type voila \
-o $OUTDIR/$gene/read_length/$length/
done
done 6.3
voila tsv - generate TSV files
# FILES AND DIRS
BUILDDIR="majiq/build/"
DPSIDIR="majiq/dpsi/"
OUTDIR="majiq/voila/"
genes=(U2AF2 UPF1)
readLength=(100 75 50 36)
# MAJIQ Voila
# U2AF2 and UPF1
for GENE in ${genes[@]}; do
for LENGTH in ${readLength[@]}; do
voila tsv \
-f "${OUTDIR}/${GENE}/${LENGTH}/${GENE}_${LENGTH}_KD_vs_CT_voila.tsv" \
--show-all \
--threshold 0.05 \
--changing-between-group-dpsi 0.05 \
--non-changing-between-group-dpsi 0.05 \
-j 6 \
"${BUILDDIR}/${GENE}/${LENGTH}/splicegraph.sql" \
"${DPSIDIR}/${GENE}/${LENGTH}/${GENE}_${LENGTH}_KD-${GENE}_${LENGTH}_CT.deltapsi.voila"
done
done 6.4
voila modulize - LSV classification
#FILES AND DIRS
BUILDDIR="majiq/build/read_depth"
DPSIDIR="majiq/dpsi/read_depth"
OUTDIR="majiq/modulize/read_depth"
read_depth=(30M 25M 20M 15M 10M 5M 2_5M 1M)
# voila modulize
for reads in ${read_depth[@]}; do
voila modulize \
--changing-between-group-dpsi 0.05 \
--non-changing-between-group-dpsi 0.05 \
--changing-between-group-dpsi-secondary 0.025 \
--show-all \
-j 6 \
-d "${OUTDIR}/${reads}" \
"${BUILDDIR}/${reads}/splicegraph.sql" \
"${DPSIDIR}/${reads}/KD_${reads}_reads-CT_${reads}_reads.deltapsi.voila"
done