#!/bin/bash
## Job Name
#SBATCH --job-name=ron-rosM
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=15-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/030521-ronrosM

# Paired-end Bismark pipeline against the roslin_M genome:
#   genome preparation -> alignment -> deduplication -> methylation
#   extraction -> reports -> sorted/indexed BAMs -> merged CpG coverage
#   -> 5x/10x bedgraph and count tables.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/cg/"
genome_folder="/gscratch/srlab/sr320/data/Cgig-genome/roslin_M/"

# File-name suffixes of the trimmed read pairs
# (e.g. zr3644_11_R2.fastp-trim.20201206.fq.gz).
r1_suffix="_R1.fastp-trim.20201202.fq.gz"
r2_suffix="_R2.fastp-trim.20201202.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Build the bisulfite-converted genome index.
"${bismark_dir}/bismark_genome_preparation" \
  --verbose \
  --parallel 28 \
  --path_to_aligner "${bowtie2_dir}" \
  "${genome_folder}"

# Align each read pair; the sample name is the R1 file name minus its suffix.
r1_files=("${reads_dir}"*"${r1_suffix}")
(( ${#r1_files[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

for r1 in "${r1_files[@]}"; do
  sample=${r1##*/}
  sample=${sample%"${r1_suffix}"}
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    -genome "${genome_folder}" \
    -p 8 \
    -score_min L,0,-0.6 \
    --non_directional \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}"
done

# Deduplicate every alignment BAM. The list is captured first so the
# *.deduplicated.bam files created here are not picked up again.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

for bam in "${aligned_bams[@]}"; do
  "${bismark_dir}/deduplicate_bismark" \
    --bam \
    --paired \
    "${bam}"
done

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 28 \
  --buffer_size 75% \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Run multiqc
/gscratch/srlab/programs/anaconda3/bin/multiqc .

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done

# Merge CpG evidence from both strands; the output prefix is the coverage
# file name minus its Bismark suffix.
cov_suffix="_bismark_bt2_pe.deduplicated.bismark.cov.gz"
for cov in *deduplicated.bismark.cov.gz; do
  "${bismark_dir}/coverage2cytosine" \
    --genome_folder "${genome_folder}" \
    -o "$(basename "${cov}" "${cov_suffix}")" \
    --merge_CpG \
    --zero_based \
    "${cov}"
done

# From each merged coverage file write, in a single pass:
#   <stem>_5x.bedgraph / <stem>_10x.bedgraph : columns 1-4
#   <stem>_5x.tab      / <stem>_10x.tab      : columns 1-6 (raw counts for glms)
# keeping only rows where column 5 + column 6 reaches the threshold.
for f in *merged_CpG_evidence.cov; do
  stem=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
  awk -F $'\t' -v stem="${stem}" '
    BEGIN {
      OFS = FS
      bg5  = stem "_5x.bedgraph"
      bg10 = stem "_10x.bedgraph"
      tab5  = stem "_5x.tab"
      tab10 = stem "_10x.tab"
      # Create all four outputs even when no row passes a filter.
      printf "" > bg5
      printf "" > bg10
      printf "" > tab5
      printf "" > tab10
    }
    {
      depth = $5 + $6
      if (depth >= 5) {
        print $1, $2, $3, $4 > bg5
        print $1, $2, $3, $4, $5, $6 > tab5
      }
      if (depth >= 10) {
        print $1, $2, $3, $4 > bg10
        print $1, $2, $3, $4, $5, $6 > tab10
      }
    }
  ' "${f}"
done
Tag Archives: Bismark
job-name=hw-bsnlP
#!/bin/bash
## Job Name
#SBATCH --job-name=hw-bsnlP
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=10-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/021921-hw-bsnP

# Paired-end Bismark pipeline against the oyster_v9 genome, with up to six
# alignments running concurrently:
#   alignment -> deduplication -> methylation extraction -> reports
#   -> sorted/indexed BAMs.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/cg/"
genome_folder="/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/"

# File-name suffixes of the trimmed read pairs
# (e.g. zr3644_11_R2.fastp-trim.20201206.fq.gz).
r1_suffix="_R1.fastp-trim.20201206.fq.gz"
r2_suffix="_R2.fastp-trim.20201206.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Genome preparation is disabled for this run:
#${bismark_dir}/bismark_genome_preparation \
#--verbose \
#--parallel 28 \
#--path_to_aligner ${bowtie2_dir} \
#${genome_folder}

# Collect sample names (R1 file name minus its suffix).
samples=()
for r1 in "${reads_dir}"*"${r1_suffix}"; do
  sample=${r1##*/}
  samples+=("${sample%"${r1_suffix}"}")
done
(( ${#samples[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

# Align up to 6 samples at a time. "-n 1" was dropped because GNU xargs
# treats it as mutually exclusive with -I and ignores it with a warning.
printf '%s\0' "${samples[@]}" \
  | xargs -0 -P 6 -I{} "${bismark_dir}/bismark" \
      --path_to_bowtie "${bowtie2_dir}" \
      -genome "${genome_folder}" \
      -p 4 \
      --non_directional \
      -1 "${reads_dir}{}${r1_suffix}" \
      -2 "${reads_dir}{}${r2_suffix}"

# Deduplicate every alignment BAM. The list is captured first so the
# *.deduplicated.bam files created here are not picked up again.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

for bam in "${aligned_bams[@]}"; do
  "${bismark_dir}/deduplicate_bismark" \
    --bam \
    --paired \
    "${bam}"
done

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  --buffer_size 75% \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done
job-name=hw-bs
#!/bin/bash
## Job Name
#SBATCH --job-name=hw-bs
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=20-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/021321-hw-bs

# Paired-end Bismark pipeline against the oyster_v9 genome:
#   alignment -> deduplication -> methylation extraction -> reports
#   -> sorted/indexed BAMs.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/cg/"
genome_folder="/gscratch/srlab/sr320/data/Cgig-genome/Crassostrea_gigas.oyster_v9.dna_sm.toplevel/"

# File-name suffixes of the trimmed read pairs
# (e.g. zr3644_11_R2.fastp-trim.20201206.fq.gz).
r1_suffix="_R1.fastp-trim.20201206.fq.gz"
r2_suffix="_R2.fastp-trim.20201206.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Genome preparation is disabled for this run:
#${bismark_dir}/bismark_genome_preparation \
#--verbose \
#--parallel 28 \
#--path_to_aligner ${bowtie2_dir} \
#${genome_folder}

# Align each read pair; the sample name is the R1 file name minus its suffix.
r1_files=("${reads_dir}"*"${r1_suffix}")
(( ${#r1_files[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

for r1 in "${r1_files[@]}"; do
  sample=${r1##*/}
  sample=${sample%"${r1_suffix}"}
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    -genome "${genome_folder}" \
    -p 4 \
    -score_min L,0,-0.6 \
    --non_directional \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}"
done

# Deduplicate every alignment BAM. The list is captured first so the
# *.deduplicated.bam files created here are not picked up again.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

for bam in "${aligned_bams[@]}"; do
  "${bismark_dir}/deduplicate_bismark" \
    --bam \
    --paired \
    "${bam}"
done

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  --buffer_size 75% \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done

# The CpG-merge and depth-filter stages below are disabled for this run.
#
# find *deduplicated.bismark.cov.gz \
# | xargs basename -s _R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz \
# | xargs -I{} ${bismark_dir}/coverage2cytosine \
# --genome_folder ${genome_folder} \
# -o {} \
# --merge_CpG \
# --zero_based \
# {}_R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz
#
#
# #creating bedgraphs post merge
#
# for f in *merged_CpG_evidence.cov
# do
# STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
# cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \
# > "${STEM}"_10x.bedgraph
# done
#
#
#
# for f in *merged_CpG_evidence.cov
# do
# STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
# cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \
# > "${STEM}"_5x.bedgraph
# done
#
#
# #creating tab files with raw count for glms
#
# for f in *merged_CpG_evidence.cov
# do
# STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
# cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4, $5, $6}}' \
# > "${STEM}"_10x.tab
# done
#
#
# for f in *merged_CpG_evidence.cov
# do
# STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
# cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4, $5, $6}}' \
# > "${STEM}"_5x.tab
# done
[code]#!/bin/bash ## Job Name #SBATCH...
#!/bin/bash
## Job Name
#SBATCH --job-name=re-duck
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes (We only get 1, so this is fixed)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=30-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/1217/

# Paired-end Bismark pipeline against the geoduck v01 genome:
#   genome preparation -> alignment -> deduplication -> methylation
#   extraction -> reports -> sorted/indexed BAMs -> merged CpG coverage.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/strigg/data/Pgenr/FASTQS/"
genome_folder="/gscratch/srlab/sr320/data/geoduck/v01/"

# File-name suffixes of the trimmed read pairs.
r1_suffix="_R1_001_val_1.fq.gz"
r2_suffix="_R2_001_val_2.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Build the bisulfite-converted genome index.
"${bismark_dir}/bismark_genome_preparation" \
  --verbose \
  --parallel 28 \
  --path_to_aligner "${bowtie2_dir}" \
  "${genome_folder}"

# Align each read pair; the sample name is the R1 file name minus its suffix.
# The read and genome locations come from the variables above rather than
# from repeated literal copies of the same paths.
r1_files=("${reads_dir}"*"${r1_suffix}")
(( ${#r1_files[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

for r1 in "${r1_files[@]}"; do
  sample=${r1##*/}
  sample=${sample%"${r1_suffix}"}
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    -genome "${genome_folder}" \
    -p 4 \
    -score_min L,0,-0.6 \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}"
done

# Deduplicate every alignment BAM. The list is captured first so the
# *.deduplicated.bam files created here are not picked up again.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

for bam in "${aligned_bams[@]}"; do
  "${bismark_dir}/deduplicate_bismark" \
    --bam \
    --paired \
    "${bam}"
done

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  --buffer_size 75% \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done

# Merge CpG evidence from both strands; the output prefix is the coverage
# file name minus its Bismark suffix.
cov_suffix="_R1_001_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz"
for cov in *deduplicated.bismark.cov.gz; do
  "${bismark_dir}/coverage2cytosine" \
    --genome_folder "${genome_folder}" \
    -o "$(basename "${cov}" "${cov_suffix}")" \
    --merge_CpG \
    --zero_based \
    "${cov}"
done
[code]#!/bin/bash ## Job Name #SBATCH...
#!/bin/bash
## Job Name
#SBATCH --job-name=bm
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes (We only get 1, so this is fixed)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=06-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/0923/

# Paired-end Bismark pipeline:
#   alignment -> deduplication -> methylation extraction -> reports
#   -> sorted/indexed BAMs.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/caligus/"
# NOTE(review): reads come from the caligus directory but are aligned to a
# geoduck genome path; kept exactly as in the original run - confirm intent.
genome_folder="/gscratch/srlab/sr320/data/geoduck/v074"

# File-name suffixes of the trimmed read pairs.
r1_suffix="_L001_R1_001_val_1_val_1.fq.gz"
r2_suffix="_L001_R2_001_val_2_val_2.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Align each read pair; the sample name is the R1 file name minus its suffix.
r1_files=("${reads_dir}"*"${r1_suffix}")
(( ${#r1_files[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

for r1 in "${r1_files[@]}"; do
  sample=${r1##*/}
  sample=${sample%"${r1_suffix}"}
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    -genome "${genome_folder}" \
    -p 4 \
    -score_min L,0,-0.6 \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}"
done

# Deduplicate every alignment BAM. The list is captured first so the
# *.deduplicated.bam files created here are not picked up again.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

for bam in "${aligned_bams[@]}"; do
  "${bismark_dir}/deduplicate_bismark" \
    --bam \
    --paired \
    "${bam}"
done

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  --buffer_size 75% \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done
[code]#!/bin/bash ## Job Name -...
#!/bin/bash
## Job Name - can be changed
#SBATCH --job-name=bs-geo
## Allocation Definition - confirm correctness
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes (often you will only use 1)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=30-00:00:00
## Memory per node
#SBATCH --mem=100G
## email notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
## (no space is allowed after "="; --chdir matches the newer scripts and
## replaces the older --workdir spelling)
#SBATCH --chdir=/gscratch/scrubbed/sr320/0719/

##########################
# This is a script written to assess bisulfite sequencing reads
# using Bismark. The user needs to supply the following:
# 1. A single directory location containing BSseq reads.
# 2. BSseq reads need to be gzipped FastQ and end with .fq.gz
# 3. A bisulfite-converted genome, produced with Bowtie2.
# 4. Indicate if deduplication should be performed
#    (whole genome or reduced genome sequencing)
#
# Set these values below

### USER NEEDS TO SET VARIABLES FOR THE FOLLOWING:
# Set --chdir= path in SBATCH header above.
#
# Full path to directory with sequencing reads
reads_dir="/gscratch/srlab/strigg/data/Pgenr/FASTQS"

# Full path to bisulfite-converted genome directory
genome_dir="/gscratch/srlab/sr320/data/geoduck/v074"

# Enter y (for yes) or n (for no) between the quotes
# (y/yes/n/no are accepted in any letter case).
# Yes - Whole genome bisulfite sequencing, MBD.
# No - Reduced genome bisulfite sequencing (e.g. RRBS)
deduplicate="No"

# Run Bismark on desired number of reads/pairs subset
# The default value is 0, which will run Bismark on all reads/pairs
subset="-u 0"

####################################################
# DO NOT EDIT BELOW THIS LINE
####################################################

# Exit script if a command fails; unmatched globs expand to nothing.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Evaluate user-edited variables to make sure they have been filled
[[ -n "${deduplicate}" ]] \
  || die "The deduplicate variable is not defined. Please edit the SBATCH script and add y or n to deduplicate variable."
[[ -n "${genome_dir}" ]] \
  || die "The bisulfite genome directory path has not been set. Please edit the SBATCH script."
[[ -n "${reads_dir}" ]] \
  || die "The reads directory path has not been set. Please edit the SBATCH script."

# Normalise the deduplicate answer. Previously only the exact string "y"
# enabled deduplication, so "YES"/"Yes" silently skipped it.
case "${deduplicate,,}" in
  y | yes) deduplicate="y" ;;
  n | no) deduplicate="n" ;;
  *) die "The deduplicate variable must be y or n (got '${deduplicate}')." ;;
esac

# Split the subset option string into separate arguments.
read -r -a subset_args <<< "${subset}"

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0_dev"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
threads="28"
reads_list="input_fastqs.txt"

# Create list of input FastQ files for easier confirmation.
# The list is truncated first so a re-run does not append duplicates.
: > "${reads_list}"
for fastq in "${reads_dir}"/*.fq.gz; do
  printf '%s\n' "${fastq##*/}" >> "${reads_list}"
done

# Check for paired-end: any file name containing "_R2_" means paired-end.
# paired=0 -> paired-end, paired=1 -> single-end.
if grep -q "_R2_" "${reads_list}"; then
  paired=0
else
  paired=1
fi

## Save FastQ files to arrays
R1_array=("${reads_dir}"/*_R1_*.fq.gz)
(( ${#R1_array[@]} > 0 )) || die "No *_R1_*.fq.gz files found in ${reads_dir}"

## Send comma-delimited list of R1 FastQ to variable
R1=$(IFS=,; printf '%s' "${R1_array[*]}")

# Evaluate if paired-end FastQs
# Run Bismark as paired-end/single-end based on evaluation
if [[ ${paired} -eq 0 ]]; then
  ## Save FastQ files to arrays
  R2_array=("${reads_dir}"/*_R2_*.fq.gz)

  # Every R1 file needs a corresponding R2 file.
  if (( ${#R1_array[@]} != ${#R2_array[@]} )); then
    echo "Missing at least one FastQ pair from paired-end FastQ set." >&2
    echo "Please verify input FastQs all have an R1 and corresponding R2 file." >&2
    exit 1
  fi

  ## Send comma-delimited list of R2 FastQ to variable
  R2=$(IFS=,; printf '%s' "${R2_array[*]}")

  # Run bismark using bisulfite-converted genome
  # Generates a set of BAM files as outputs
  # Records stderr to a file for easy viewing of Bismark summary info
  "${bismark_dir}/bismark" \
    --path_to_bowtie2 "${bowtie2_dir}" \
    --genome "${genome_dir}" \
    --samtools_path="${samtools}" \
    --non_directional \
    --score_min L,0,-0.6 \
    "${subset_args[@]}" \
    -p "${threads}" \
    -1 "${R1}" \
    -2 "${R2}" \
    2> bismark_summary.txt
else
  # Run Bismark single-end
  "${bismark_dir}/bismark" \
    --path_to_bowtie2 "${bowtie2_dir}" \
    --genome "${genome_dir}" \
    --samtools_path="${samtools}" \
    --non_directional \
    "${subset_args[@]}" \
    -p "${threads}" \
    "${R1}" \
    2> bismark_summary.txt
fi

# BAM files produced by the alignment step, captured before any derived
# BAMs are written to the same directory.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

# Determine if deduplication is necessary
# Then, determine if paired-end or single-end
if [[ "${deduplicate}" == "y" ]]; then
  # Sort Bismark BAM files by read names instead of chromosomes
  for bam in "${aligned_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      -n "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Deduplication, in the mode matching the library layout
  if [[ ${paired} -eq 0 ]]; then
    dedup_mode="--paired"
  else
    dedup_mode="--single"
  fi
  for bam in *sorted.bam; do
    "${bismark_dir}/deduplicate_bismark" \
      "${dedup_mode}" \
      --samtools_path="${samtools}" \
      "${bam}"
  done

  dedup_bams=(*deduplicated.bam)
  (( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

  # Methylation extraction
  # Extracts methylation info from deduplicated BAM files produced by Bismark
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names and to use the "scaffolds" setting.
  "${bismark_dir}/bismark_methylation_extractor" \
    --bedGraph \
    --cytosine_report \
    --genome_folder "${genome_dir}" \
    --gzip \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore "${threads}" \
    --buffer_size 75% \
    --samtools_path="${samtools}" \
    "${dedup_bams[@]}"

  # Sort deduplicated BAM files
  for bam in "${dedup_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  for sorted_bam in *deduplicated.sorted.bam; do
    "${samtools}" index -@ "${threads}" "${sorted_bam}"
  done
else
  # Methylation extraction
  # Extracts methylation info from BAM files produced by Bismark
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names and to use the "scaffolds" setting.
  "${bismark_dir}/bismark_methylation_extractor" \
    --bedGraph \
    --cytosine_report \
    --genome_folder "${genome_dir}" \
    --gzip \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore "${threads}" \
    --buffer_size 75% \
    --samtools_path="${samtools}" \
    "${aligned_bams[@]}"

  # Sort BAM files. The input BAM is now passed explicitly; the earlier
  # command named only the output, leaving samtools to read stdin.
  for bam in "${aligned_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  for sorted_bam in *sorted.bam; do
    "${samtools}" index -@ "${threads}" "${sorted_bam}"
  done
fi

# Bismark processing report
# Generates HTML reports from previously created files
"${bismark_dir}/bismark2report"

# Bismark summary report
# Generates HTML summary reports from previously created files
"${bismark_dir}/bismark2summary"
[code][sr320@mox2 2019]$ cat 0626_1311.sh #!/bin/bash...
[sr320@mox2 2019]$ cat 0626_1311.sh
#!/bin/bash
## Job Name - can be changed
#SBATCH --job-name=bs-dedup-ch
## Allocation Definition - confirm correctness
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes (often you will only use 1)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=30-00:00:00
## Memory per node
#SBATCH --mem=100G
## email notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
## (no space is allowed after "="; --chdir matches the newer scripts and
## replaces the older --workdir spelling)
#SBATCH --chdir=/gscratch/scrubbed/sr320/0626_1311/

##########################
# This is a script written to assess bisulfite sequencing reads
# using Bismark. The user needs to supply the following:
# 1. A single directory location containing BSseq reads.
# 2. BSseq reads need to be gzipped FastQ and end with .fq.gz
# 3. A bisulfite-converted genome, produced with Bowtie2.
# 4. Indicate if deduplication should be performed
#    (whole genome or reduced genome sequencing)
#
# Set these values below

### USER NEEDS TO SET VARIABLES FOR THE FOLLOWING:
# Set --chdir= path in SBATCH header above.
#
# Full path to directory with sequencing reads
reads_dir="/gscratch/srlab/strigg/data/Pgenr/FASTQS"

# Full path to bisulfite-converted genome directory
genome_dir="/gscratch/srlab/sr320/data/geoduck/v074"

# Enter y (for yes) or n (for no) between the quotes
# (y/yes/n/no are accepted in any letter case).
# Yes - Whole genome bisulfite sequencing, MBD.
# No - Reduced genome bisulfite sequencing (e.g. RRBS)
deduplicate="YES"

# Run Bismark on desired number of reads/pairs subset
# The default value is 0, which will run Bismark on all reads/pairs
subset="-u 100000"

####################################################
# DO NOT EDIT BELOW THIS LINE
####################################################

# Exit script if a command fails; unmatched globs expand to nothing.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Evaluate user-edited variables to make sure they have been filled
[[ -n "${deduplicate}" ]] \
  || die "The deduplicate variable is not defined. Please edit the SBATCH script and add y or n to deduplicate variable."
[[ -n "${genome_dir}" ]] \
  || die "The bisulfite genome directory path has not been set. Please edit the SBATCH script."
[[ -n "${reads_dir}" ]] \
  || die "The reads directory path has not been set. Please edit the SBATCH script."

# Normalise the deduplicate answer. Previously only the exact string "y"
# enabled deduplication, so the "YES" set above silently skipped it.
case "${deduplicate,,}" in
  y | yes) deduplicate="y" ;;
  n | no) deduplicate="n" ;;
  *) die "The deduplicate variable must be y or n (got '${deduplicate}')." ;;
esac

# Split the subset option string into separate arguments.
read -r -a subset_args <<< "${subset}"

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0_dev"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
threads="28"
reads_list="input_fastqs.txt"

# Create list of input FastQ files for easier confirmation.
# The list is truncated first so a re-run does not append duplicates.
: > "${reads_list}"
for fastq in "${reads_dir}"/*.fq.gz; do
  printf '%s\n' "${fastq##*/}" >> "${reads_list}"
done

# Check for paired-end: any file name containing "_R2_" means paired-end.
# paired=0 -> paired-end, paired=1 -> single-end.
if grep -q "_R2_" "${reads_list}"; then
  paired=0
else
  paired=1
fi

## Save FastQ files to arrays
R1_array=("${reads_dir}"/*_R1_*.fq.gz)
(( ${#R1_array[@]} > 0 )) || die "No *_R1_*.fq.gz files found in ${reads_dir}"

## Send comma-delimited list of R1 FastQ to variable
R1=$(IFS=,; printf '%s' "${R1_array[*]}")

# Evaluate if paired-end FastQs
# Run Bismark as paired-end/single-end based on evaluation
if [[ ${paired} -eq 0 ]]; then
  ## Save FastQ files to arrays
  R2_array=("${reads_dir}"/*_R2_*.fq.gz)

  # Every R1 file needs a corresponding R2 file.
  if (( ${#R1_array[@]} != ${#R2_array[@]} )); then
    echo "Missing at least one FastQ pair from paired-end FastQ set." >&2
    echo "Please verify input FastQs all have an R1 and corresponding R2 file." >&2
    exit 1
  fi

  ## Send comma-delimited list of R2 FastQ to variable
  R2=$(IFS=,; printf '%s' "${R2_array[*]}")

  # Run bismark using bisulfite-converted genome
  # Generates a set of BAM files as outputs
  # Records stderr to a file for easy viewing of Bismark summary info
  "${bismark_dir}/bismark" \
    --path_to_bowtie2 "${bowtie2_dir}" \
    --genome "${genome_dir}" \
    --samtools_path="${samtools}" \
    --non_directional \
    --score_min L,0,-0.6 \
    "${subset_args[@]}" \
    -p "${threads}" \
    -1 "${R1}" \
    -2 "${R2}" \
    2> bismark_summary.txt
else
  # Run Bismark single-end
  "${bismark_dir}/bismark" \
    --path_to_bowtie2 "${bowtie2_dir}" \
    --genome "${genome_dir}" \
    --samtools_path="${samtools}" \
    --non_directional \
    "${subset_args[@]}" \
    -p "${threads}" \
    "${R1}" \
    2> bismark_summary.txt
fi

# BAM files produced by the alignment step, captured before any derived
# BAMs are written to the same directory.
aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

# Determine if deduplication is necessary
# Then, determine if paired-end or single-end
if [[ "${deduplicate}" == "y" ]]; then
  # Sort Bismark BAM files by read names instead of chromosomes
  for bam in "${aligned_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      -n "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Deduplication, in the mode matching the library layout
  if [[ ${paired} -eq 0 ]]; then
    dedup_mode="--paired"
  else
    dedup_mode="--single"
  fi
  for bam in *sorted.bam; do
    "${bismark_dir}/deduplicate_bismark" \
      "${dedup_mode}" \
      --samtools_path="${samtools}" \
      "${bam}"
  done

  dedup_bams=(*deduplicated.bam)
  (( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

  # Methylation extraction
  # Extracts methylation info from deduplicated BAM files produced by Bismark
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names and to use the "scaffolds" setting.
  "${bismark_dir}/bismark_methylation_extractor" \
    --bedGraph \
    --cytosine_report \
    --genome_folder "${genome_dir}" \
    --gzip \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore "${threads}" \
    --buffer_size 75% \
    --samtools_path="${samtools}" \
    "${dedup_bams[@]}"

  # Sort deduplicated BAM files
  for bam in "${dedup_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  for sorted_bam in *deduplicated.sorted.bam; do
    "${samtools}" index -@ "${threads}" "${sorted_bam}"
  done
else
  # Methylation extraction
  # Extracts methylation info from BAM files produced by Bismark
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names and to use the "scaffolds" setting.
  "${bismark_dir}/bismark_methylation_extractor" \
    --bedGraph \
    --cytosine_report \
    --genome_folder "${genome_dir}" \
    --gzip \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore "${threads}" \
    --buffer_size 75% \
    --samtools_path="${samtools}" \
    "${aligned_bams[@]}"

  # Sort BAM files. The input BAM is now passed explicitly; the earlier
  # command named only the output, leaving samtools to read stdin.
  for bam in "${aligned_bams[@]}"; do
    "${samtools}" sort \
      --threads "${threads}" \
      "${bam}" \
      -o "${bam%.bam}.sorted.bam"
  done

  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  for sorted_bam in *sorted.bam; do
    "${samtools}" index -@ "${threads}" "${sorted_bam}"
  done
fi

# Bismark processing report
# Generates HTML reports from previously created files
"${bismark_dir}/bismark2report"

# Bismark summary report
# Generates HTML summary reports from previously created files
"${bismark_dir}/bismark2summary"
[sr320@mox2 2019]$
[code] #!/bin/bash ## Job Name...
#!/bin/bash
## Job Name
#SBATCH --job-name=oly-mbd
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes (We only get 1, so this is fixed)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=00-100:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
## (--chdir matches the newer scripts and replaces the older --workdir)
#SBATCH --chdir=/gscratch/srlab/sr320/analyses/2019/0327

# Single-end Bismark pipeline against the olurida v081 genome:
#   alignment -> deduplication -> methylation extraction -> reports
#   -> sorted/indexed BAMs.
# Every output is written to the job's working directory (--chdir above).

# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/olurida-bs/"
genome_folder="/gscratch/srlab/sr320/data/olurida-genomes/v081"

# File-name suffix of the trimmed single-end reads.
reads_suffix="_s456_trimmed.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Align each read file (single-end: one FastQ per sample, no -1/-2).
read_files=("${reads_dir}"*"${reads_suffix}")
(( ${#read_files[@]} > 0 )) \
  || die "No *${reads_suffix} files found in ${reads_dir}"

for reads in "${read_files[@]}"; do
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    -genome "${genome_folder}" \
    -p 14 \
    --non_directional \
    "${reads}"
done

aligned_bams=(*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

# Deduplicate in single-end mode. The alignments above are single-end, so
# the paired-end flag (-p) used previously did not match the data.
# NOTE(review): confirm against the deduplicate_bismark docs for 0.21.0.
"${bismark_dir}/deduplicate_bismark" \
  --bam --single \
  "${aligned_bams[@]}"

dedup_bams=(*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in *.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done
[code][sr320@mox1 jobs]$ cat 1024_1200.sh #!/bin/bash...
[sr320@mox1 jobs]$ cat 1024_1200.sh
#!/bin/bash
## Job Name
#SBATCH --job-name=oakl
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes (We only get 1, so this is fixed)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=00-100:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
## (--chdir matches the newer scripts and replaces the older --workdir)
#SBATCH --chdir=/gscratch/srlab/sr320/analyses/1024

# Paired-end Bismark pipeline against the Cvirg genome:
#   alignment -> deduplication -> methylation extraction -> reports
#   -> sorted/indexed BAMs.
# BAM files are read from and written to ${analysis_dir}, which is also the
# job's working directory (--chdir above).

# Directories and programs (each path was previously repeated inline)
bismark_dir="/gscratch/srlab/programs/Bismark-0.19.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.1.0"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/oakl/"
genome_folder="/gscratch/srlab/sr320/data/Cvirg-genome"
analysis_dir="/gscratch/srlab/sr320/analyses/1024"

# File-name suffixes of the trimmed read pairs.
r1_suffix="_s1_R1_val_1.fq.gz"
r2_suffix="_s1_R2_val_2.fq.gz"

# CPU threads handed to samtools sort/index.
threads=28

source /gscratch/srlab/programs/scripts/paths.sh

# Abort at the first failing step so later stages never run on missing or
# partial inputs; unmatched globs expand to nothing instead of themselves.
set -eo pipefail
shopt -s nullglob

die() { printf '%s\n' "$*" >&2; exit 1; }

# Align each read pair. Discovery now matches the full R1 suffix; the
# previous "*_1.fq.gz" glob could pick up files whose suffix was then not
# stripped, yielding a sample name with no matching R1/R2 files.
r1_files=("${reads_dir}"*"${r1_suffix}")
(( ${#r1_files[@]} > 0 )) || die "No *${r1_suffix} files found in ${reads_dir}"

for r1 in "${r1_files[@]}"; do
  sample=${r1##*/}
  sample=${sample%"${r1_suffix}"}
  "${bismark_dir}/bismark" \
    --path_to_bowtie "${bowtie2_dir}" \
    --score_min L,0,-1.2 \
    -genome "${genome_folder}" \
    -p 28 \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}"
done

aligned_bams=("${analysis_dir}"/*.bam)
(( ${#aligned_bams[@]} > 0 )) || die "Bismark produced no BAM files"

"${bismark_dir}/deduplicate_bismark" \
  --bam -p \
  "${aligned_bams[@]}"

dedup_bams=("${analysis_dir}"/*deduplicated.bam)
(( ${#dedup_bams[@]} > 0 )) || die "No deduplicated BAM files found"

"${bismark_dir}/bismark_methylation_extractor" \
  --bedGraph --counts --scaffolds \
  --multicore 14 \
  "${dedup_bams[@]}"

# Bismark processing report
"${bismark_dir}/bismark2report"

# Bismark summary report
"${bismark_dir}/bismark2summary"

# Sort files for methylkit and IGV
for bam in "${dedup_bams[@]}"; do
  "${samtools}" sort --threads "${threads}" "${bam}" \
    -o "${bam%.bam}.sorted.bam"
done

# Index sorted files for IGV ("-@" sets the number of CPU threads).
for sorted_bam in "${analysis_dir}"/*.sorted.bam; do
  "${samtools}" index -@ "${threads}" "${sorted_bam}"
done