[code]

[sr320@mox2 2019]$ cat 0626_1311.sh
#!/bin/bash
## Job Name - can be changed
#SBATCH --job-name=bs-dedup-ch
## Allocation Definition - confirm correctness
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes (often you will only use 1)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=30-00:00:00
## Memory per node
#SBATCH --mem=100G
## email notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
## FIX: removed the space after "=" -- "--workdir= /path" leaves the
## workdir empty and the job runs in the submission directory instead.
#SBATCH --workdir=/gscratch/scrubbed/sr320/0626_1311/
# Exit script if a command fails
# set -e

##########################
# This is a script written to assess bisulfite sequencing reads
# using Bismark. The user needs to supply the following:
# 1. A single directory location containing BSseq reads.
# 2. BSseq reads need to be gzipped FastQ and end with .fq.gz
# 3. A bisulfite-converted genome, produced with Bowtie2.
# 4. Indicate if deduplication should be performed (whole genome or reduced genome sequencing)
#
# Set these values below



### USER NEEDS TO SET VARIABLES FOR THE FOLLOWING:
# Set --workdir= path in SBATCH header above.
#
# Full path to directory with sequencing reads
reads_dir="/gscratch/srlab/strigg/data/Pgenr/FASTQS"

# Full path to bisulfite-converted genome directory
genome_dir="/gscratch/srlab/sr320/data/geoduck/v074"

# Enter y (for yes) or n (for no) between the quotes.
# Yes - Whole genome bisulfite sequencing, MBD.
# No - Reduced genome bisulfite sequencing (e.g. RRBS)
# FIX: was "YES", but the check further down compares against the literal
# "y", so deduplication would silently never run.
deduplicate="y"

# Run Bismark on desired number of reads/pairs subset.
# Leave empty ("") to run Bismark on all reads/pairs; otherwise supply
# Bismark's -u flag plus a count, e.g. "-u 100000".
subset="-u 100000"

####################################################
# DO NOT EDIT BELOW THIS LINE
####################################################




# Evaluate user-edited variables to make sure they have been filled.
# Expansions are quoted so empty/unset values cannot break the tests.
[ -z "${deduplicate}" ] \
&& { echo "The deduplicate variable is not defined. Please edit the SBATCH script and add y or n to deduplicate variable."; exit 1; }

[ -z "${genome_dir}" ] \
&& { echo "The bisulfite genome directory path has not been set. Please edit the SBATCH script."; exit 1; }

[ -z "${reads_dir}" ] \
&& { echo "The reads directory path has not been set. Please edit the SBATCH script."; exit 1; }



# Directories and programs
wd=$(pwd)
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0_dev"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
threads="28"
reads_list="input_fastqs.txt"

## Concatenated FastQ Files
R1=""
R2=""

# Initialize arrays
R1_array=()
R2_array=()

# Create list of input FastQ files for easier confirmation.
for fastq in "${reads_dir}"/*.fq.gz
do
  echo "${fastq##*/}" >> "${reads_list}"
done

# Check for paired-end reads.
# grep -q exits 0 when an "_R2_" file is present (paired-end);
# a non-zero exit status means single-end reads.
# set +e/set -e prevents the non-zero grep exit from exiting the script.
set +e
grep -q "_R2_" "${reads_list}"
paired=$?
set -e

# Confirm even number of FastQ files (0 = even, 1 = odd)
num_files=$(wc -l < "${reads_list}")
fastq_even_odd=$(( num_files % 2 ))


## Save FastQ files to arrays
R1_array=("${reads_dir}"/*_R1_*.fq.gz)
## Send comma-delimited list of R1 FastQ to variable.
## Join the array with IFS instead of "echo | tr" -- more robust and
## avoids an extra subshell pipeline.
R1=$(IFS=','; echo "${R1_array[*]}")

# Evaluate if paired-end FastQs
# Run Bismark as paired-end/single-end based on evaluation
if [[ ${paired} -eq 0 ]]; then
  # Evaluate if FastQs have corresponding partner (i.e. R1 and R2 files)
  # Evaluated on even/odd number of files.
  if [[ ${fastq_even_odd} -ne 0 ]]; then
    echo "Missing at least one FastQ pair from paired-end FastQ set."
    echo "Please verify input FastQs all have an R1 and corresponding R2 file."
    exit 1
  fi
  ## Save FastQ files to arrays
  R2_array=("${reads_dir}"/*_R2_*.fq.gz)
  ## Send comma-delimited list of R2 FastQ to variable
  R2=$(IFS=','; echo "${R2_array[*]}")
  # Run bismark using bisulfite-converted genome
  # Generates a set of BAM files as outputs
  # Records stderr to a file for easy viewing of Bismark summary info
  # NOTE: ${subset} is deliberately unquoted so "-u 100000" splits into
  # two arguments (and an empty value disappears entirely).
  ${bismark_dir}/bismark \
  --path_to_bowtie2 ${bowtie2_dir} \
  --genome ${genome_dir} \
  --samtools_path=${samtools} \
  --non_directional \
  --score_min L,0,-0.6 \
  ${subset} \
  -p ${threads} \
  -1 ${R1} \
  -2 ${R2} \
  2> bismark_summary.txt
else
  # Run Bismark single-end
  ${bismark_dir}/bismark \
  --path_to_bowtie2 ${bowtie2_dir} \
  --genome ${genome_dir} \
  --samtools_path=${samtools} \
  --non_directional \
  ${subset} \
  -p ${threads} \
  ${R1} \
  2> bismark_summary.txt
fi



# Determine if deduplication is necessary
# Then, determine if paired-end or single-end
if [ "${deduplicate}" == "y" ]; then
  # Sort Bismark BAM files by read names instead of chromosomes
  # (deduplicate_bismark expects read-name order)
  find *.bam \
  | xargs basename -s .bam \
  | xargs -I bam_basename \
  ${samtools} sort \
  --threads ${threads} \
  -n bam_basename.bam \
  -o bam_basename.sorted.bam
  if [ "${paired}" -eq 0 ]; then
    # Deduplication of paired-end BAMs
    find *sorted.bam \
    | xargs basename -s .bam \
    | xargs -I bam_basename \
    ${bismark_dir}/deduplicate_bismark \
    --paired \
    --samtools_path=${samtools} \
    bam_basename.bam
  else
    # Deduplication of single-end BAMs
    find *sorted.bam \
    | xargs basename -s .bam \
    | xargs -I bam_basename \
    ${bismark_dir}/deduplicate_bismark \
    --single \
    --samtools_path=${samtools} \
    bam_basename.bam
  fi
  # Methylation extraction
  # Extracts methylation info from deduplicated BAM files produced by Bismark.
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names, and to use the "scaffolds" setting.
  # FIX: added the missing line-continuation backslash after "--gzip";
  # without it the extractor command was truncated and the shell then
  # tried to execute "--counts" as a stand-alone command.
  ${bismark_dir}/bismark_methylation_extractor \
  --bedGraph \
  --cytosine_report \
  --genome_folder ${genome_dir} \
  --gzip \
  --counts \
  --scaffolds \
  --remove_spaces \
  --multicore ${threads} \
  --buffer_size 75% \
  --samtools_path=${samtools} \
  *deduplicated.bam
  # Sort deduplicated BAM files (coordinate order, for IGV)
  find *deduplicated.bam \
  | xargs basename -s .bam \
  | xargs -I bam_basename \
  ${samtools} sort \
  --threads ${threads} \
  bam_basename.bam \
  -o bam_basename.sorted.bam
  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  find *deduplicated.sorted.bam \
  | xargs -I sorted_bam \
  ${samtools} index \
  -@ ${threads} \
  sorted_bam
else
  # Methylation extraction
  # Extracts methylation info from BAM files produced by Bismark.
  # Options to create a bedgraph file, a cytosine coverage report, counts,
  # remove spaces from names, and to use the "scaffolds" setting.
  ${bismark_dir}/bismark_methylation_extractor \
  --bedGraph \
  --cytosine_report \
  --genome_folder ${genome_dir} \
  --gzip \
  --counts \
  --scaffolds \
  --remove_spaces \
  --multicore ${threads} \
  --buffer_size 75% \
  --samtools_path=${samtools} \
  *.bam

  # Sort BAM files (coordinate order, for IGV)
  # FIX: added the missing input file argument (bam_basename.bam) -- the
  # original passed only -o, so samtools sort had no input to read.
  find *.bam \
  | xargs basename -s .bam \
  | xargs -I bam_basename \
  ${samtools} sort \
  --threads ${threads} \
  bam_basename.bam \
  -o bam_basename.sorted.bam
  # Index sorted files for IGV
  # The "-@ ${threads}" below specifies number of CPU threads to use.
  find *sorted.bam \
  | xargs -I sorted_bam \
  ${samtools} index \
  -@ ${threads} \
  sorted_bam
fi


# Bismark processing report
# Generates HTML reports from previously created files
${bismark_dir}/bismark2report

# Bismark summary report
# Generates HTML summary reports from previously created files
# FIX: removed a shell-prompt paste artifact ("[sr320@mox2 2019]$") that
# was fused onto the end of this command in the original paste.
${bismark_dir}/bismark2summary

#bismark, #sbatch

Shelly’s Notebook: Fri. Jun. 28, 2019 Salmon RRBS library prep

Test out digest and size selection

RRBS DIGEST

Prepared the following reaction 4x with Sample 3 gDNA (418ng/ul):

  • 2uL of DNA
  • 0.5uL MspI
  • 1uL CutSmart
  • 6uL H2O
  • incubate at 37C 1 hour
  • add 0.5uL Taq a1
  • incubate at 65C for 30min
  • let cool to RT then hold on ice

SIZE SELECTION

PART 1:

  • add 6uL beads to each sample (0.6X DNA:Bead ratio), vortex
  • incubate at RT 10 min
  • magnetic stand for 5 min
  • remove and save supernatent
      • one sample only had 14uL, not 16uL so that means it only had 8uL of digest reaction to start so ratio was actually 0.75X; not 0.6X

PART 2:

  • add:
    • 10uL beads (0.75X initial to 2X final)
    • 12uL beads (0.6X initial to 1.8X)
    • 14uL beads (0.6X initial to 2X final)
  • vortex samples and incubate at RT 10 min
  • magnetic stand 5 min
  • remove supernatent
  • Wash beads (from part 1 and part 2) with 500uL 80% EtOH
  • Repeat wash 1X
  • Let beads dry 2-5min at RT (until EtOH smell gone)
  • Resuspend beads in 12uL elution buffer (preheated to 50C)
  • Incubate at RT for 10 min
  • Bind beads 5 min
  • Transfer supernatent to clean tube
  • measure concentration of part 2 s/n
    • 0.75X to 2X = 3.74ng/ul = 45ng yield
    • 0.6X to 1.8X = 8.8ng/ul = 105ng yield
    • 0.6X to 2X = 10ng/ul = 120ng yield
    • STD2 = 9.98ng/ul

Load gel:

Lanes:

  1. Ladder (o’generuler 100bp-1kb ladder)
  2. 0.75X to 2X elution from part 1 (large frags excluded)
  3. 0.75X to 2X elution from part 2 (size selected frags)
  4. 0.75X to 2X supernatent from part 2 (small frags excluded)
  5. 0.6X to 1.8X elution from part 2 (size selected frags)
  6. 0.6X to 2X elution from part 2 (size selected frags)
  7. 0.6X to 1.8X supernatent from part 2 (small frags excluded)
  8. 0.6X to 1.8X elution from part 1 (large frags excluded)

uc?export=view&id=1ZS6w8qQBRBFX6NnwBv2d7z8ZqUPqbVF_

CONCLUSIONS:

  • 1ug of DNA yields enough for the library prep after size selection
  • Don’t need to digest overnight; maybe digest for 2 hours with MspI and 1 hour for Taq a1; also will be using more enzyme in real RRBS run (30U and 60U, respectively). I only used 10U here
  • DNA:Bead ratios for size selection:
    • 0.75X to 2X gives range closest to 100-300bp
    • 0.6X keeps too many fragments > 300
    • 1.8X final ratio removes too many fragments around 100bp
    • If I do the lower selection 2x, I’ll have a better chance of removing small fragments

PLAN:

RRBS DIGEST

DNA: aliquot 1ug of DNA and add nanopure water up to 20uL so all DNA is 1ug in 20uL final vol.

Prepare the following reaction mix:

1 rxn 21 rxns reagent
20uL DNA
1.5uL 31.5uL MspI
3uL 63uL CutSmart
5.5uL 115.5uL H2O

Add 10uL reaction mix to each tube of 20uL of DNA for 30uL total rxn vol.

37C 2 hour

Prepare the following reaction mix:

1 rxn 21 rxns reagent
2.5uL 52.5uL Taq a1
1uL 21uL CutSmart
6.5uL 136.5uL H2O

add 10uL reaction mix to each tube (now 40uL total rxn vol.)

65C for 1 hour

let cool to RT then hold on ice

SIZE SELECTION

PART 1:

  • add 30uL beads to each sample (0.75X DNA:Bead ratio), vortex
  • incubate at RT 10 min
  • magnetic stand for 5 min
  • keep supernatent

PART 2:

  • add 50uL beads (0.75X initial to 2X final)
  • vortex samples and incubate at RT 10 min
  • magnetic stand 5 min
  • remove supernatent
  • Wash beads (from part 1 and part 2) with 500uL 80% EtOH
  • Repeat wash 1X
  • Let beads dry 2-5min at RT (until EtOH smell gone)
  • Resuspend beads in 10uL elution buffer (preheated to 60C)
  • Incubate at RT for 10 min
  • Bind beads 5 min
  • Transfer supernatent to clean tube

second purification step

  • add 20uL beads to each tube
  • vortex samples and incubate at RT 10 min
  • magnetic stand 5 min
  • remove supernatent
  • Wash beads (from part 1 and part 2) with 500uL 80% EtOH
  • Repeat wash 1X
  • Let beads dry 2-5min at RT (until EtOH smell gone)
  • Resuspend beads in 21uL elution buffer (preheated to 60C)
  • measure concentrations

from shellytrigg https://ift.tt/2XiwEDF
via IFTTT

Sam’s Notebook: Data Received – C.virginica Mantle MBD-BSseq from ZymoResearch

We received the BSseq data from ZymoResearch today. Samples were submitted on 20190326. The samples constituted 24 Crassostrea virginica mantle samples which were prepared using the MethylMiner Kit (Invitrogen) on 20190319.

FastQ data (each file beginning with zr2576) for each sample was provided in three FastQ files per sample (presumably the FastQ files were split by file size limits set by ZymoResearch). I will likely concatenate the corresponding R1 and R2 files for each sample so that we simply have a single R1 and single R2 file for each sample. That will be detailed in a subsequent notebook entry when it happens.

E.g. current FastQs:

  • zr2576_14_s1_R1.fastq.gz
  • zr2576_14_s2_R1.fastq.gz
  • zr2576_14_s3_R1.fastq.gz

Data was downloaded from ZymoResearch FTP server using FileZilla.

MD5 checksums were verified after downloading:

 md5sum --check zr2576_md5_checksums.txt  

Files were rsync‘d to owl/nightingales/C_virginica.

File info was partially added to the nightingales Google Sheet. Only partially due to the difficult nature of detailing FastQ data split across three files for each sample read set. This is the reason I will likely perform the FastQ concatenation – greatly simplifies data management.

The master sample sheet (Google Sheet) for the project was updated to reflect sequencing library names for each sample:

from Sam’s Notebook https://ift.tt/2LiNT0a
via IFTTT

Shelly’s Notebook: Tues. Jun. 25, 2019 Salmon RRBS prep plans

Summary of RRBS in the lit:

paper enzyme dig. time fragment size depth cov DMR window # DMRs
Rob MspI + Taqa1 ? 60-180 42M 6M CpGs @ 6X 500kb of diff. exp genes ~10K sites
Mog MspI + Taqa1 ? 60-180 34M didn’t say 1Mb of diff exp genes ~5.5K sites
Mac MspI o/n 100-300 38M 600K CpGs; 112K regions in all libs @ 10X 100bp region ~100 genes
Web MspI 12 hr 70-220 64M 21M CpGs; 335K @ 10X in all libs within a gene or +/- 1.5kb of TSS for promoters ~1.9K sites
Lee MspI + Taqa1 2 hr + 2 hr 80-160 5M 3.3% of CpGs @ 10X NA NA

Next steps

  1. Digest 2ug DNA
    • doing 2ug because I don’t know what the yield is going to be like after gel size-selection
    • get 2ug DNA in 35uL H2O
    • MSPI digest reaction mix:
      • 31.5uL MSPI (21 * 1.5uL MSP1/sample)
      • 105uL Buffer (21 * 5uL cutsmart/sample)
      • 178.5uL Water (21 * 8.5uL H2O/sample)
      • 315 = final vol.
      • add 15uL master mix/sample
    • incubate @ 37C 12 hrs
    • heat inactivate @ 80C 20 min
    • hold @ 4C
    • Taq-aI digest reaction mix:
      • 63 uL Taq-a1 (21 * 3uL Taq-a1/sample)
      • 21 uL Buffer (21*1uL cutsmart/sample)
      • 126 uL Water (21 * 6uL H2O/sample)
      • 210 = final vol.
    • add 10uL master mix/sample
    • incubate @ 65C 2 hrs
    • heat inactivate @ 80C 20 min
    • hold @ 4C
  2. Gel purify
    • excise fragments between 80-280bp (seems like 100-300bp would be fine)
      • there may be a small size range that we want to exclude, but it’s not clear what range and if its even possible. *see frag. size range info below
    • use Millipore columns
    • measure concentration with Qubit HS
  3. Begin zymo pico methyl kit

Notes on fragment size selection after Taq-aI/MspI digest

  • Lee paper selected 150-197bp and 207-230bp (and excluded 198-206bp) after adapter ligation
    • paper clarifies this range corresponds to 80bp-160bp fragments (before adapter ligation); suggesting adapters add 70 bp to the fragments.
    • the reason for the small size exclusion is because in silico analysis showed fragments within this size contained repetitive sequences that won’t map.
  • Mog. paper selected 150-250bp and 250-350bp after adapter ligation
    • I emailed Mog. to ask why two separate ranges when they seem overlapping and confirm whether or not there was a fragment size exclusion
    • if the adapters used were the same length as in Lee et al, Mog. fragments pre-adapter ligation would be 80-280bp. This is pretty close to what Mac size-selected (100-300bp).

from shellytrigg https://ift.tt/2ZUHs7P
via IFTTT

Sam’s Notebook: Data Wrangling – FastA Subsetting of Pgenerosa_v070.fa Using samtools faidx

Steven asked to subset the Pgenerosa_v070.fa (2.1GB) in this GitHub Issue #705. In that issue, it was determined that a significant portion of the sequencing data that was assembled by Phase Genomics clustered in “scaffolds” 1 – 18. As such, Steven asked to subset just those 18 scaffolds.

This was done by using the samtools faidx program.

Process is documented in the following Jupyter Notebook (GitHub):

Yaamini’s Notebook: DML Analysis Part 37

Gene descriptions and enrichment

With my finalized DML, I decided to describe the genes they are in. Steven suggested I start by making tables annotating the genes DML overlap with. Once I realized this would just involve some (hopefully) quick R manipulation, I created a grand plan:

IMG_0378

Figure 1. The grand plan.

You’ll notice that the grand plan includes DMR…but I’m dropping the DMR. After thinking about it more and discussing the DMR at the Eastern Oyster Project Meeting, I think they’re just artificial and don’t make sense when I have loci-level resolution.

To do this, I first wanted to create a master DML annotation list. Then, I could subset the master list based on overlaps with putative promoters and gene overlaps. I could further subset the annotated DML that overlap with genes into exon and intron overlaps. For all gene, exon, and intron overlaps, I can separate hypermethylated and hypomethylated loci. Separately, I can create a list of DML annotations with transposable elements. Since there are only 57 transposable element overlaps, I don’t think I need to create subsets. Once I found some time to actually work in the midst of summer TAing at Friday Harbor, this R Markdown file and got cranking.

Master DML-gene annotation

I wanted to create a master list with the following columns:

  • chr
  • start
  • end
  • p-value
  • q-value
  • meth.diff
  • dist-to-closest-gene
  • gene-chr
  • gene-start
  • gene-end
  • gene-ID
  • mRNA-start
  • mRNA-end
  • genbank
  • uniprot-ID
  • e-value
  • product

DML-closestBed

I started with my DML csv file and merged that with the information from closestBed. To get this merge to work, I needed to switch the positions of my DML and genes so closestBed would report the closest gene to each DML, and not the closest DML to each gene in this Jupyter notebook:

 ! {bedtoolsDirectory}closestBed \ -a {DMLlist} \ -b {geneList} \ -t all \ -D ref \ > 2019-05-29-Flanking-Analysis/2019-05-29-Genes-Closest-NoOverlap-DMLs.txt #Although the file name says "NoOverlaps," overlaps are represented by "0" in distance column.  

When I imported the dataset into R, I had to subtract 1 from the start position and add 1 to the end position so it lined up with the DML list. I merged the two files by DML start and end positions.

DML-closestBed-geneList

The next file I wanted to add to my master dataset was the gene list. I easily imported the gff3 with read.delim, but my challenge came from sorting through the last column with gene description information. I usually use Excel Text-to-Columns for this, but I decided I needed a reproducible bash-wizardry method.

First, I cut the column with the gene descriptions and replaced ; with \tab using awk and gsub:

 cut -f9 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_gene_sorted_yrv.gff3 \ | awk '{gsub(";","\t",$0); print;}' \ > 2019-06-20-Gene-Descriptions.txt #Isolate gene description column and convert ";" to "\t". Save the output as a new file  

This gave me a tab-delimited gene description output file. I tried importing this into R, but found that if a row had more columns than the first row, the information from those columns wrapped around to create an additional row. The hivemind of Stack Overflow told me to count the number of columns in each row, then set col.names in read.delim with enough column names for the maximum number of columns. I used awk to count the number of columns in each row, then saved that in this file.

 awk '{print NF}' 2019-06-20-Gene-Descriptions.txt > 2019-06-20-Gene-Description-Column-Numbers.txt  

I imported the column number file into R and found the maximum number of columns was 9 with max. I set col.names in read.delim to accomodate 9 columns:

 geneDescriptions <- read.delim2("2019-06-20-Gene-Descriptions.txt", header = FALSE, col.names = c("gene-number", "gene-ID", "V3", "V4", "V5", "V6", "V7", "V8", "V9")) #Import document with gene descrptions. Name 9 columns so all rows are imported with 9 columns  

I wasn’t done yet! I ended up with gene IDs in a column, but there were extraneous characters before the actual ID that I did not need. I removed these with gsub:

 geneID <- geneDescriptions$gene.ID #Isolate gene ID column geneID <- gsub("Dbxref=GeneID:", "", geneID) #Remove extraneous characters  

Now that the geneID column was clean, I added the column to my gene list (I just created a dataframe, but I now realize I could have used cbind). Finally, I merged the revised gene list with isolated gene IDs with my master list.

DML-closestBed-geneList-mRNAList

The gene list has gene IDs, but the Uniprot codes I have are matched with Genbank IDs from an mRNA file. The next intermediate step is to merge the mRNA list with my master list. Once again, I started by isolating the mRNA description column with cut and using awk and gsub to replace “;” and “,” delimiters with “\t”.

 cut -f9 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_mRNA_yrv.gff3 \ | awk '{gsub(";","\t",$0); print;}' \ | awk '{gsub(",","\t",$0); print;}' \ > 2019-06-20-mRNA-Descriptions.txt #Isolate mRNA description column and convert ";" and "," to "\t". Save the output as a new file  

I counted the number of columns with awk and imported the file into R, setting a maximum number of columns with col.names in read.delim. From the mRNA descriptions, I isolated the gene ID that matched with the gene track, and the Genbank ID to match with Uniprot codes:

 geneIDmRNAList <- mRNADescriptions$V3 #Isolate gene ID column geneIDmRNAList <- gsub("Dbxref=GeneID:", "", geneIDmRNAList) #Remove extraneous characters  
 genbankIDmRNAList <- mRNADescriptions$V4 #Isolate Genbank ID genbankIDmRNAList <- gsub("Genbank:", "", genbankIDmRNAList) #Remove extraneous characters  

In addition to getting gene and Genbank IDs, I wanted to get mRNA product descriptions from the amalgam of information in that description column. Since each row can have a different number of delimiters, I decided to use cut and awk with gsub to essentially set a custom delimiter. The products are denoted after “product=”, so I set this as my new delimiter. This created a bunch of garbage in one column, and then product information and garbage in a second column. I then piped that output into another cut command, where I isolated the second column with the product information. I used another awk and gsub to change “;” delimiters to “\t”. My final file, found here, has products in the first column, then garbage in any remaining columns.

 cut -f9 ../2019-05-13-Generating-Genome-Feature-Tracks/C_virginica-3.0_Gnomon_mRNA_yrv.gff3 \ | awk '{gsub("product=","\t",$0); print;}' \ | cut -f2 \ | awk '{gsub(";","\t",$0); print;}' \ > 2019-06-20-mRNA-Products.txt #Isolate mRNA description column and convert "product=" to "\t". This separates the descriptions into 2 columns. The beginning of second column has the product description. Take the second column of the output and convert ";" to "\tab" to better isolate mRNA products. Save the output as a new file.  

Just like I did previously, I counted the number of columns and imported the file with read.delim and the appropriate number of col.names. I then isolated the first column with product descriptions, and appended that to the mRNA list with gene IDs and Genbank IDs. Finally, I merged the massive mRNA list with the master list.

DML-closestBed-geneList-mRNAList-Uniprot

Finally, I merged the working master with Uniprot codes by matching Genbank IDs. This was the easiest step. I reorganized the columns and ensured I didn’t have any duplicate entries by using distinct in the dplyr package. I exported the file as this .csv.

Gene subsets

Now that I had the master list, the easier part was subsetting files using subset. I started by subsetting DML-gene overlaps by defining dist-to-closest-gene == 0 as my subset. I wrote out the file here. From the DML-gene overlaps, I used a meth.diff > 0 subset to get hypermethylated DML-gene overlaps and meth.diff < 0 to get hypomethylated DML-gene overlaps.

Master DML-exon and DML-intron annotations

Even though I had DML-gene overlaps annotated, I wanted to annotate DML-exon overlaps. I figured the most painless way to do this would be to obtain exon-gene overlaps in bedtools, then merge those overlaps with the DML-gene overlap annotations by gene start and end positions. In this Jupyter notebook, I used intersectBed to identify exon-gene overlaps.

When I imported this into R, I realized it was not necessary! My gene subset has DML start and end positions, which I could match with DML-exon overlaps. Womp womp. Since the DML-exon overlap file was a BEDtools output, I needed to add 1 to the start position and subtract 1 from the end position. I could then merge the DML-exon overlaps with the DML-gene annotation by DML start position. With merge, I set sort = FALSE so my output would not be sorted by the start column. I also included all.x = TRUE so even if an exon did not match with a gene, it was still retained as a row in the final table. This would be important for including exons from coding sequences that did not overlap with genes. I saved the output here. I also subsetted annotation tables for hypermethylated DML-exon overlaps and hypomethylated DML-exon overlaps. I repeated this process with DML-intron overlaps. I obtained a DML-intron annotation table for all DML, hypermethylated DML, and hypomethylated DML.

Master DML-promoter annotation

The dist-to-closest-gene column has base pair distances from DML to the closest genes, but I defined my putative promoter regions as 1 kb regions upstream of transcription start sites. Even though my dist-to-closest-gene column has important information, I need to look at distances between DML and mRNA. I quickly detoured to this Jupyter notebook to rerun intersectBed. When I originally created my overlap files, I wrote out information from only the flanks without matching DML. I changed the -wb argument to -wo to get information from both files in the output. I then imported the DML-promoter overlaps and merged that with the master annotation list by duplicating the column with the end of the promoter and naming it the start of the mRNA. I saved the annotated DML-promoter table here.

Master DML-TE annotation

I went through a similar process for the DML-TE annotation. I imported the DML-TE overlap file. I also revised intersectBed by swapping -wb for -wo and producing this Gene-TE overlap file. I merged the documents by TE start and end positions, then merged the resultant output with the master table. Similar to the other tables I created, I retained all rows from the original overlap file I was interested in and removed any duplicate rows (merging artifacts) with distinct. The DML-TE annotations can be found here.

And that’s how you end up with an R Markdown file that’s close to 600 lines long just to subset some files. Join me tomorrow for “how the heck do I adequately describe this” and “will a gene enrichment yield interesting results.”

Going forward

  1. Finish gene description and enrichment
  2. Work through gene-level analysis
  3. Update methods and results
  4. Update paper repository
  5. Outline the discussion
  6. Write the discussion
  7. Write the introduction
  8. Revise my abstract
  9. Share the draft with collaborators and get feedback
  10. Post the paper on bioRXiv
  11. Prepare the manuscript for publication

/**

  • RECOMMENDED CONFIGURATION VARIABLES: EDIT AND UNCOMMENT THE SECTION BELOW TO INSERT DYNAMIC VALUES FROM YOUR PLATFORM OR CMS.
  • LEARN WHY DEFINING THESE VARIABLES IS IMPORTANT: http://bit.ly/1PPNtdx/ / var disqus_config = function () { this.page.url = PAGE_URL; // Replace PAGE_URL with your page’s canonical URL variable this.page.identifier = PAGE_IDENTIFIER; // Replace PAGE_IDENTIFIER with your page’s unique identifier variable }; */ (function() { // DON’T EDIT BELOW THIS LINE var d = document, s = d.createElement(‘script’); s.src = ‘https://the-responsible-grad-student.disqus.com/embed.js’; s.setAttribute(‘data-timestamp’, +new Date()); (d.head || d.body).appendChild(s); })(); </script>

Please enable JavaScript to view the comments powered by Disqus.

from the responsible grad student http://bit.ly/2KDvqMg
via IFTTT

Laura’s Notebook: June 2019 Animal Inventory

Live Olympia oysters

Checked on leftover Olys from my 2017 and 2018 Manchester experiments yesterday (6/16/19) which were housed at Manchester on the beach and dock.

Beach Status

In June 2018 I put mini silos with Oly set – 1 from each “family” – into 3 big oyster bags, and Stuart put them onto the beach in Clam Bay, where they stayed for 12 months. I had cut the silo screen out, so there was plenty of waterflow. Yesterday I walked out at low tide and collected the bags. They were shockingly clean – just some kelp growing on the top side of the bag. I only found a few barnacles set on silos. Very few Olys survived:

[hold for data table]

In one of the bags I had placed a HOBO pendant data logger. Here’s a snapshot of the temperature (black) and light intensity (blue) data:

HOBO temp from Clam Bay

Dock Status

I have 3 cages of Olys hanging from the dock, which contain

a) Remnant F1 Olys from Jake Heare, which I used in my OA/T experiment and exposed to varying temp and pCO2 levels (probably includes some of Katherine’s Oyster Bay F2’s, too).

b) Offspring from my Oly OA/T 2017-2018 experiment, bagged into window screen pouches. Not sure how many I have of each group, but there were tons of live oysters.

c) Offspring from my Oly temperature/food 2018 experiment. They were bagged into 450um mesh silos, and had not grown much at all. They were alive, though!