Kaitlyn’s notebook: C. virginica and C. gigas

I’ve been working on theseissues which means I have been analyzing the methylation dataset for different populations of eastern oysters, as well as continuing to work on the WSG publication.

CpG O/E Eastern oyster

cpg <- read.csv("Documents/robertslab/work/CpG/eastern-oyster/ID_CpG_labelled_all.csv")

single q-q plot

data <- cpg$CL_1
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = " ", main2 = "CL_1", font.main = 3)
lines(density(filt.data), lty=2, lwd=2)

faceted q-q plots

library(mixtools)

colname.list <- colnames(cpg)

jpeg(filename = "Documents/robertslab/work/faceted_cpg.jpeg", width = 1000, height = 1000)

a <- par(mfrow = c(10, 9)) # r x c plot

for(i in 2:length(colname.list)){
data <- cpg[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = " ", main2 = colname.list[[1]][i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
print(mean(cpg[[i]]))
}

dev.off()

pull out stress response genes

GOslim <- read.csv("Documents/robertslab/work/CpG/eastern-oyster/Blastquery-GOslim.csv", header=FALSE)
GOslim$V1 <- gsub(".*?NC","NC", GOslim$V1) #remove any artifacts from excel

stress.GO.nolistnovalues <- GOslim[grep("stress", GOslim$V3), ]
write.csv(merge(stress.GOslim, cpg, by.x="V1", by.y="ID"), "Documents/robertslab/work/CpG/eastern-oyster/stress-annotated.csv")
#2733 stress IDs identified

BPspread <- aggregate(V3~V1,GOslim,FUN=paste)
GOspread <- aggregate(V2~V1,GOslim,FUN=paste)
GOBP.spread <- merge(BPspread, GOspread, by="V1")
class(GOBP.spread$V3)

stress.GOBP.list <- GOBP.spread[grep("stress", GOBP.spread$V3), ]

stress.cpg <- merge(stress.GOBP.list, cpg, by.x = "V1", by.y="ID")

scatter plots with mean, median and range of values in original cpg table

rownames(cpg) <- cpg$ID
cpg <- cpg[,-c(1)]

library(matrixStats)

stats <- data.frame(colMeans(cpg))
colnames(stats)[1] <- "mean"
stats$median <- colMedians(as.matrix(cpg))
#stats$range <- colRanges(as.matrix(cpg)) #values too small
#stats <- subset(stats, select=-c(range))
stats$skew <- colskewness(as.matrix(cpg))
stats$kurtosis <- colkurtosis(as.matrix(cpg))
stats$variance <- colVars(as.matrix(cpg))
stats$stdev <- colVars(as.matrix(cpg), std = TRUE)

stats$ID <- rownames(stats)

aa <- ggplot(stats, aes(x=ID, y=mean)) + geom_point()
bb <- ggplot(stats, aes(x=ID, y=median)) + geom_point()
cc <- ggplot(stats, aes(x=ID, y=kurtosis)) + geom_point()
dd <- ggplot(stats, aes(x=ID, y=skew)) + geom_point()
ee <- ggplot(stats, aes(x=ID, y=variance)) + geom_point()
ff <- ggplot(stats, aes(x=ID, y=stdev)) + geom_point()

library(gridExtra)
jpeg("Documents/robertslab/work/CpG/eastern-oyster/basicstats.jpeg")
grid.arrange(aa,bb,cc,dd,ee,ff, nrow = 3)
dev.off()

stress response stats

rownames(stress.cpg) <- stress.cpg$V1
stress.cpg <- stress.cpg[,-c(1:3)]

stress.stats <- data.frame(colMeans(stress.cpg))
colnames(stress.stats)[1] <- "mean"
stress.stats$median <- colMedians(as.matrix(stress.cpg))
stress.stats$skew <- colskewness(as.matrix(stress.cpg))
stress.stats$kurtosis <- colkurtosis(as.matrix(stress.cpg))
stress.stats$variance <- colVars(as.matrix(stress.cpg))
stress.stats$stdev <- colVars(as.matrix(stress.cpg), std = TRUE)

stress.stats$ID <- rownames(stress.stats)

a <- ggplot(stress.stats, aes(x=ID, y=mean)) + geom_point()
b <- ggplot(stress.stats, aes(x=ID, y=median)) + geom_point()
c <- ggplot(stress.stats, aes(x=ID, y=kurtosis)) + geom_point()
d <- ggplot(stress.stats, aes(x=ID, y=skew)) + geom_point()
e <- ggplot(stress.stats, aes(x=ID, y=variance)) + geom_point()
f <- ggplot(stress.stats, aes(x=ID, y=stdev)) + geom_point()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/stress-basicstats.jpeg")
grid.arrange(a,b,c,d,e,f, nrow = 3)
dev.off()

stress response and all samples plotted together

jpeg("Documents/robertslab/work/CpG/eastern-oyster/all-v-stress.jpeg")
grid.arrange(aa,a,cc,c,dd,d,ee,e, nrow = 4)
dev.off()

table with mean of all samples (rather than mean of genes)

cpg <- data.frame(t(cpg))

all.stats <- data.frame(colMeans(cpg))
colnames(all.stats)[1] <- "mean"

all.stats$median <- colMedians(as.matrix(cpg))
all.stats$skew <- colskewness(as.matrix(cpg))
all.stats$kurtosis <- colkurtosis(as.matrix(cpg))
all.stats$variance <- colVars(as.matrix(cpg))
all.stats$stdev <- colVars(as.matrix(cpg), std = TRUE)

write.table(all.stats, "Documents/robertslab/work/CpG/eastern-oyster/stats-allsamples-on-samples.txt")

all.stats$ID <- rownames(all.stats)

g <- ggplot(all.stats, aes(x=ID, y=mean)) + geom_point()
h <- ggplot(all.stats, aes(x=ID, y=median)) + geom_point()
i <- ggplot(all.stats, aes(x=ID, y=kurtosis)) + geom_point()
j <- ggplot(all.stats, aes(x=ID, y=skew)) + geom_point()
k <- ggplot(all.stats, aes(x=ID, y=variance)) + geom_point()
l <- ggplot(all.stats, aes(x=ID, y=stdev)) + geom_point()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/all-on_samples.jpeg")
grid.arrange(g,h,i,j,k,l, nrow = 3)
dev.off()

density, box and Swarm plots of stress reponse genes

library(ggbeeswarm)
theme_set(theme_light()) #sets a default ggplot theme

stress <- stress.cpg
rownames(stress) <- stress$V1
stress <- stress[,-c(1:3)]

stress$Gene <- rownames(stress)

library(tidyr)
test <- gather(stress, key = "Sample", value = "CpG", 1:(ncol(stress)-1))
test$pop <- test$Sample
test$pop <- gsub("_.*","",test$pop)

test <- data.frame(colMeans(stress[,c(1:90)]))
colnames(test) <- "CpG"
test$sample <- rownames(test)
test$pop <- test$sample
test$pop <- gsub("_.*","",test$pop)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/swarm_box.jpeg")
ggplot(test, aes(pop,CpG, col = pop)) + geom_beeswarm(size = 3, cex = 3) + geom_boxplot(aes(pop, CpG), width = 0.5, alpha = 0.1)
dev.off()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/swarm.jpeg")
ggplot(test, aes(pop,CpG, col = pop)) + geom_beeswarm(size = 3, cex = 3)
dev.off()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/density.jpeg")
ggplot(test) + geom_density(aes(CpG, color= pop))
dev.off()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/faceted-density.jpeg")
ggplot(test) + geom_density(aes(CpG)) + facet_wrap(~pop, scale = "free")
dev.off()

jpeg("Documents/robertslab/work/CpG/eastern-oyster/box.jpeg")
ggplot(test) + geom_boxplot(aes(pop, CpG))
dev.off()

plot provided GO ID lists

GOcpg <- merge(GOBP.spread, cpg, by.x= "V1", by.y= "ID")

#list1
list1 <- list("GO:0100044",
"GO:0042538",
"GO:0042539",
"GO:1900464",
"GO:1900069",
"GO:1900070",
"GO:0071477",
"GO:0071475",
"GO:1902074",
"GO:1902075",
"GO:0009651",
"GO:1901000",
"GO:1901002",
"GO:1901001",
"GO:1901200",
"GO:1901196",
"GO:1901199",
"GO:0071472")

list1df <- data.frame()
for (i in 1:nrow(GOcpg)){
list <- GOcpg$V2[[i]] #save GO IDs in each line as a list
for(j in 1:length(list)){
if(list[j] %in% list1){
list1df <- rbind(list1df, GOcpg[i,]) #bind a matching row to the empty df
}
}
}

rownames(list1df) <- list1df$Gene
list1df <- list1df[,-c(2:3)]

list1df <- data.frame(colMeans(list1df[,c(2:91)]))
colnames(list1df) <- "CpG"
list1df$sample <- rownames(list1df)
list1df$pop <- list1df$sample
list1df$pop <- gsub("_.*","",list1df$sample)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list1-swarmbox.jpeg")
ggplot(list1df, aes(pop,CpG, col = pop)) + geom_beeswarm(size = 3, cex = 3) + geom_boxplot(aes(pop, CpG), width = 0.5, alpha = 0.1)
dev.off()

#list 2
list2 <- list("GO:0070415",
"GO:0070414",
"GO:0090441",
"GO:0090442",
"GO:0061408",
"GO:1990845",
"GO:0031990",
"GO:0043620",
"GO:0043555",
"GO:1990611",
"GO:1990497",
"GO:0034605",
"GO:0070417",
"GO:0006950",
"GO:0036003",
"GO:0043618",
"GO:0080134",
"GO:0080135",
"GO:0097201",
"GO:0033554",
"GO:0033555",
"GO:0009271")

list2df <- data.frame()
for (i in 1:nrow(GOcpg)){
list <- GOcpg$V2[[i]]
for(j in 1:length(list)){
if(list[j] %in% list2){
list2df <- rbind(list2df, GOcpg[i,])
}
}
}

list2df <- unique(list2df)
rownames(list2df) <- list2df$V1
list2df <- list2df[,-c(2:3)]

list2df <- data.frame(colMeans(list2df[,c(2:91)]))
colnames(list2df) <- "CpG"
list2df$sample <- rownames(list2df)
list2df$pop <- list2df$sample
list2df$pop <- gsub("_.*","",list2df$pop)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list2-swarmbox.jpeg")
ggplot(list2df, aes(pop,CpG, col = pop)) + geom_beeswarm(size = 3, cex = 3) + geom_boxplot(aes(pop, CpG), width = 0.5, alpha = 0.1)
dev.off()

#list 3
list3 <- list("GO:0045087",
"GO:0061760",
"GO:0045088",
"GO:0002218",
"GO:0002227",
"GO:1905036",
"GO:1905035",
"GO:1905034",
"GO:0035420",
"GO:0045824",
"GO:0045089",
"GO:0002758",
"GO:0002766",
"GO:0090714",
"GO:0035419",
"GO:0035423",
"GO:0035422",
"GO:0035421",
"GO:0046735",
"GO:0046738",
"GO:0038075",
"GO:0039503",
"GO:0052170",
"GO:0052166",
"GO:0052167",
"GO:0052390",
"GO:0052305",
"GO:0052309",
"GO:0002220",
"GO:0038078",
"GO:0052157",
"GO:0052169",
"GO:0052033",
"GO:0052034",
"GO:0052382",
"GO:0052306",
"GO:0052308",
"GO:0052258",
"GO:0052257",
"GO:0052296",
"GO:0035663",
"GO:0035662",
"GO:0090644",
"GO:0035325",
"GO:0039699",
"GO:0046696",
"GO:0002224",
"GO:0039539",
"GO:0039538",
"GO:0039537",
"GO:0039560",
"GO:1990231",
"GO:0002755",
"GO:0002756",
"GO:0035669",
"GO:0035668",
"GO:0035667",
"GO:0035666",
"GO:0035661",
"GO:0035660",
"GO:0035665",
"GO:0035664")

list3df <- data.frame()
for (i in 1:nrow(GOcpg)){
list <- GOcpg$V2[[i]]
for(j in 1:length(list)){
if(list[j] %in% list3){
list3df <- rbind(list3df, GOcpg[i,])
}
}
}

list3df <- unique(list3df)
rownames(list3df) <- list3df$V1
list3df <- list3df[,-c(2:3)]

list3df <- data.frame(colMeans(list3df[,c(2:91)]))
colnames(list3df) <- "CpG"
list3df$sample <- rownames(list3df)
list3df$pop <- list3df$sample
list3df$pop <- gsub("_.*","",list3df$pop)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list3-swarmbox.jpeg")
ggplot(list3df, aes(pop,CpG, col = pop)) + geom_beeswarm(size = 3, cex = 3) + geom_boxplot(aes(pop, CpG), width = 0.5, alpha = 0.1)
dev.off()

qq plot on stress response genes by population

Gocpg

loop to get pop means per gene

pops <- unique(list1df$pop) #list of pop names
popmeans <- data.frame(matrix(0,19907,1)) #create empty matrix with dimensions b/c cbind sucks
for(i in (1:nrow(GOcpg))){ #for each row of GOcpg
for(j in 1:length(pops)){ # for item in the list (for each pop in list)
pattern <- paste0(pops[j],"_") # create a new variable and paste the pop item with an underscore to distinguish unique pops (ex CL and CLP)
popmeans[i,j] <- apply(GOcpg[i,grep(pattern,colnames(GOcpg))],1,mean) #fill in each new cell of matrix with the mean of indv populations for each gene
}
#print(popmeans[i,])
}
rownames(popmeans) <- rownames(GOcpg)
colnames(popmeans) <- pops

library(mixtools)

colname.list <- colnames(popmeans)

same plot

jpeg(filename = "Documents/robertslab/work/CpG/eastern-oyster/popmeans-qq-same.jpeg", width = 1000, height = 1000)

for(i in 1:length(colname.list)){
data <- popmeans[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = " ", main2 = colname.list[i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
par(new = TRUE)
print(mean(popmeans[[i]]))
}

dev.off()

different plots

jpeg(filename = "Documents/robertslab/work/CpG/eastern-oyster/popmeans-qq-faceted.jpeg", width = 1000, height = 1000)

a <- par(mfrow = c(4, 4)) # r x c plot

for(i in 1:length(colname.list)){
data <- popmeans[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = " ", main2 = colname.list[i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
}

dev.off()

Kruskel and Wilcox test

#anova <- gather(popmeans, key = "Pop", value = "CpG", 1:(ncol(popmeans)))
#anova$Pop <- as.factor(anova$Pop)

#kruskal.test(CpG ~ Pop, data = anova)

#pairwise.wilcox.test(anova$CpG, anova$Pop, p.adjust.method = "BH")

Mean plot

#library("ggpubr")
#ggline(popmeans, x = "Pop", y = "Cpg",
# add = c("mean_se", "jitter"),
#order = c("","",…),
#ylab = "CpG O/E", xlab = "Population")

Q-Q plots on provided lists

GO.popmeans <- merge(popmeans, GOBP.spread, by.x="row.names", by.y="V1")

list 1

l1df <- data.frame()
for (i in 1:nrow(GO.popmeans)){
list <- GO.popmeans$V2[[i]]
for(j in 1:length(list)){
if(list[j] %in% list1){
l1df <- rbind(l1df, GO.popmeans[i,])
}
}
}

l1df <- unique(l1df)
rownames(l1df) <- l1df$Row.names
l1df <- l1df[,-c(1,17:18)]

#faceted plots
colname.list <- colnames(l1df)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list1-qq-faceted.jpeg", width = 1000, height = 1000)
a <- par(mfrow = c(4, 4)) # r x c plot
for(i in 1:length(colname.list)){
data <- l1df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = "CpG O/E", main2 = colname.list[i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
}
dev.off()

#same plot
jpeg("Documents/robertslab/work/CpG/eastern-oyster/list1-qq-same.jpeg", width = 1000, height = 1000)
for(i in 1:length(colname.list)){
data <- l1df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2=" CpG O/E")
lines(density(filt.data), lty=2, lwd=2)
par(new = TRUE)
}
dev.off()

list 2

l2df <- data.frame()
for (i in 1:nrow(GO.popmeans)){
list <- GO.popmeans$V2[[i]]
for(j in 1:length(list)){
if(list[j] %in% list2){
l2df <- rbind(l2df, GO.popmeans[i,])
}
}
}

l2df <- unique(l2df)
rownames(l2df) <- l2df$Row.names
l2df <- l2df[,-c(1,17:18)]

#faceted plots
colname.list <- colnames(l2df)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list2-qq-faceted.jpeg", width = 1000, height = 1000)

a <- par(mfrow = c(4, 4)) # r x c plot

for(i in 1:length(colname.list)){
data <- l2df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = "CpG O/E", main2 = colname.list[i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
}
dev.off()

#same plot
jpeg("Documents/robertslab/work/CpG/eastern-oyster/list2-qq-same.jpeg", width = 1000, height = 1000)
for(i in 1:length(colname.list)){
data <- l2df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("blue", "red"), xlab2 = "CpG O/E ")
lines(density(filt.data), lty=2, lwd=2)
par(new = TRUE)
}
dev.off()

list 3

l3df <- data.frame()
for (i in 1:nrow(GO.popmeans)){
list <- GO.popmeans$V2[[i]]
for(j in 1:length(list)){
if(list[j] %in% list3){
l3df <- rbind(l3df, GO.popmeans[i,])
}
}
}

l3df <- unique(l3df)
rownames(l3df) <- l3df$Row.names
l3df <- l3df[,-c(1,17:18)]

#faceted plots
colname.list <- colnames(l3df)

jpeg("Documents/robertslab/work/CpG/eastern-oyster/list3-qq-faceted.jpeg", width = 1000, height = 1000)
a <- par(mfrow = c(4, 4)) # r x c plot
for(i in 1:length(colname.list)){
data <- l3df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2 = "CpG O/E", main2 = colname.list[i], font.main = 3)
lines(density(filt.data), lty=2, lwd=2)
}
dev.off()

#same plot
jpeg("Documents/robertslab/work/CpG/eastern-oyster/list3-qq-same.jpeg", width = 1000, height = 1000)
for(i in 1:length(colname.list)){
data <- l3df[[i]]
filt.data = 0.001 & data <= 1.5] #Cutting off high and low values
set.seed(101)
mixmdl <- normalmixEM(filt.data, k=2)
plot(mixmdl, which = 2, col2 = c("red", "blue"), xlab2=" CpG O/E")
lines(density(filt.data), lty=2, lwd=2)
par(new = TRUE)
}
dev.off()