DML Analysis: How to get GOterms

Gene Set Enrichment Analysis Workflow:

  • Get Entrez Gene IDs
  • Match IDs with GOterms
  • Use both topGO and DAVID for enrichment

Problem:

  • The gene IDs found in the C. virginica GFF files are not official NCBI Entrez Gene IDs. I am not sure what the LOC{} identifiers are, but the XM_{} identifiers are GenBank IDs. The GenBank IDs from the GFF were not recognized by DAVID.

Solution:

  • Run blastx to get UniProt accession codes and GO terms
  • Use UniProt accession codes and GO terms in DAVID
  • Convert UniProt accession codes to Entrez IDs
  • Use Entrez IDs and GO terms in DAVID

[code]#!/bin/bash ## Job Name #SBATCH...

#!/bin/bash
#
# SLURM batch script for the "angsd-maf" job: loads the shared paths.sh
# script, then runs ANGSD on the BAM list with -out AllSamples_Maf1.
#
## Job name
#SBATCH --job-name=angsd-maf
## Allocation (account and partition)
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Node count (only one node is available, so this stays at 1)
#SBATCH --nodes=1
## Walltime, written as days-hours:minutes:seconds
## NOTE(review): the hours field below is 100 -- confirm this is intended.
#SBATCH --time=10-100:00:00
## Memory per node
#SBATCH --mem=70G
## E-mail notifications
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Working directory for this job
#SBATCH --workdir=/gscratch/srlab/sr320/analyses/0615b

# Load the shared paths.sh script (its contents are not shown here).
source /gscratch/srlab/programs/scripts/paths.sh

# ANGSD executable and its full argument list, one option per line.
angsd_bin=/gscratch/srlab/sr320/programs/angsd/angsd
angsd_args=(
  # BAM list, reference FASTA and output name
  -b /gscratch/srlab/sr320/data/cw/all_bam.bamlist
  -ref /gscratch/srlab/sr320/data/cw/chinook_genome_v1.fasta
  -out AllSamples_Maf1
  # Analysis options (see the ANGSD documentation for each flag)
  -GL 1
  -doMaf 1
  -doMajorMinor 1
  -minMaf 0.05
  -SNP_pval 1e-6
  -minInd 468
  -minQ 20
  -P 28
  # Depth limits and count/depth outputs
  -setMinDepth 468
  -setMaxDepth 10000
  -doCounts 1
  -doDepth 1
  -dumpCounts 1
)

"${angsd_bin}" "${angsd_args[@]}"

#sbatch

[code]#!/bin/bash ## Job Name #SBATCH...

#!/bin/bash
#
# SLURM batch script for the "angsd" job: loads the shared paths.sh
# script, then runs ANGSD on the BAM list with -out AllSamples_MinQ20.
#
## Job name
#SBATCH --job-name=angsd
## Allocation (account and partition)
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Node count (only one node is available, so this stays at 1)
#SBATCH --nodes=1
## Walltime, written as days-hours:minutes:seconds
## NOTE(review): the hours field below is 100 -- confirm this is intended.
#SBATCH --time=10-100:00:00
## Memory per node
#SBATCH --mem=400G
## E-mail notifications
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Working directory for this job
#SBATCH --workdir=/gscratch/srlab/sr320/analyses/0615

# Load the shared paths.sh script (its contents are not shown here).
source /gscratch/srlab/programs/scripts/paths.sh

# ANGSD executable and its full argument list, one option per line.
angsd_bin=/gscratch/srlab/sr320/programs/angsd/angsd
angsd_args=(
  # BAM list, reference FASTA and output name
  -b /gscratch/srlab/sr320/data/cw/all_bam.bamlist
  -ref /gscratch/srlab/sr320/data/cw/chinook_genome_v1.fasta
  -out AllSamples_MinQ20
  # Analysis options (see the ANGSD documentation for each flag)
  -GL 1
  -doMaf 2
  -doMajorMinor 1
  -minMaf 0.05
  -SNP_pval 1e-6
  -minInd 468
  -minQ 20
  -P 28
)

"${angsd_bin}" "${angsd_args[@]}"

#sbatch

[code][sr320@mox2 cw]$ scp sr320@eagle.fish.washington.edu:/var/services/web/Charlie_Waters/* .[/code]

[sr320@mox2 cw]$ scp sr320@eagle.fish.washington.edu:/var/services/web/Charlie_Waters/* .

[code]#!/bin/bash ## Job Name #SBATCH...

#!/bin/bash
#
# SLURM batch script for the "blastp" job: loads the shared paths.sh
# script, then runs blastp with the C. virginica protein FASTA as the query
# against the uniprot_sprot_080917 database, writing Cv_sprot.blastout.
#
## Job name
#SBATCH --job-name=blastp
## Allocation (account and partition)
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Node count (only one node is available, so this stays at 1)
#SBATCH --nodes=1
## Walltime, written as days-hours:minutes:seconds
## NOTE(review): the hours field below is 100 -- confirm this is intended.
#SBATCH --time=10-100:00:00
## Memory per node
#SBATCH --mem=70G
## E-mail notifications
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Working directory for this job
#SBATCH --workdir=/gscratch/srlab/sr320/analyses/0614

# Load the shared paths.sh script (its contents are not shown here).
source /gscratch/srlab/programs/scripts/paths.sh

# blastp executable and its full argument list, one option per line.
blastp_bin=/gscratch/srlab/programs/ncbi-blast-2.6.0+/bin/blastp
blastp_args=(
  # Query sequences and target database
  -query /gscratch/srlab/sr320/query/GCF_002022765.2_C_virginica-3.0_protein.faa
  -db /gscratch/srlab/sr320/blastdb/uniprot_sprot_080917
  # Hit limit and e-value cutoff
  -max_target_seqs 1
  -evalue 1E-20
  # Output format, thread count and output file
  -outfmt 6
  -num_threads 28
  -out Cv_sprot.blastout
)

"${blastp_bin}" "${blastp_args[@]}"

#sbatch