# use pacman to load libraries
pacman::p_load(DESeq2, tidyverse)
We already created the DESeq object in the previous script with the following code.
Our objective is to test whether the effect of dropping water level on tadpoles from plastic and less plastic populations. We therefore want to contrast High vs. Low water for each of the regions and so we have to include a region:treatment interaction effect. However, we are expecting region alone (genetic variations) to be an important factor affecting gene expression and we also want to correct for differences of having sampled different populations within each region.
In the experimental design, populations are treated as replicates per region, being numbered from 1 to 3 within each region. Otherwise, each population would be unique to each region and would return an error.
We also drop the intercept (first term in the formula is 0) to allow for easier comparison between contrasts. If not, the base level expression would be that of the intercept and we would have to re-level the baseline level for each comparison.
# load data
txi<-readRDS("salmon_gene_counts.rds")
# Remove mtDNA, non-coding and nr baits
# make a list of genes we want to keep
whitelist<-txi$counts %>%
as_tibble(rownames = "gene_id") %>%
filter(!str_detect(gene_id, pattern = "mt|nr|nc")) %>%
pull(gene_id)
length(whitelist);head(whitelist) # we are keeping 32531 genes
## [1] 32531
## [1] "PECUL23A000004" "PECUL23A000005" "PECUL23A000006" "PECUL23A000008"
## [5] "PECUL23A000011" "PECUL23A000012"
# filter txi tables
txi$abundance<-txi$abundance[whitelist,]
txi$counts<-txi$counts[whitelist,]
txi$length<-txi$length[whitelist,]
# load design matrix
des.mat<-read_csv("./design_matrix.csv")
## Rows: 48 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_id, treatment, population, region
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Re-order factor levels
des.mat <- des.mat %>%
mutate(population=factor(population, levels=c("Bui","Can","Tur","Esp","Jab","Lla"))) %>% # re-order factors for easy plotting later
mutate(pop_n = factor(rep(rep(1:3,each=8),2))) %>% # The new variable pop_n created in the code does not depend on the levels of the population factor and is created independently by repeating a sequence. Hence, pop_n is non-nested with respect to population.
mutate_if(is.character, as.factor) # convert characters to factor
# filter out samples
des.mat<-des.mat %>%
filter(!sample_id %in% c("Bui4H14_nonrrna","Jab5H6_nonrrna"))
# filter txi tables
txi$abundance<-txi$abundance[,as.character(des.mat$sample_id)]
txi$counts<-txi$counts[,as.character(des.mat$sample_id)]
txi$length<-txi$length[,as.character(des.mat$sample_id)]
# get column order of counts matrix and re-order des.mat to match
col_order<-match(colnames(txi$counts),des.mat$sample_id)
des.mat<-des.mat[col_order,]
des.mat$sample_id==colnames(txi$counts)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE
des.mat
## # A tibble: 46 × 5
## sample_id treatment population region pop_n
## <fct> <fct> <fct> <fct> <fct>
## 1 Bui1H9_nonrrna H Bui central 1
## 2 Bui1L9_nonrrna L Bui central 1
## 3 Bui2H12_nonrrna H Bui central 1
## 4 Bui2L1_nonrrna L Bui central 1
## 5 Bui3H10_nonrrna H Bui central 1
## 6 Bui3L8_nonrrna L Bui central 1
## 7 Bui4L4_nonrrna L Bui central 1
## 8 Can1H4_nonrrna H Can central 2
## 9 Can1L1_nonrrna L Can central 2
## 10 Can2H12_nonrrna H Can central 2
## # ℹ 36 more rows
# dds Object
dds <- DESeqDataSetFromTximport(txi,
colData = des.mat,
design = ~0+region + region:pop_n + region:treatment)
## using counts and average transcript lengths from tximport
# Making sure metadata is in matching order
des.mat$sample_id==colnames(assay(dds))
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE
And applied a mild pre-filtering step:
dds1<-dds[rowSums(counts(dds) >= 1) >= 12,] # genes that have at least 1 count for 12 samples (e.g. one per treatment per population)
dim(dds1)
## [1] 17987 46
We will now use the DESeq function to run the DESeq2 pipeline. This pipeline consists of the following steps:
Use the estimateSizeFactors function to calculate normalization factors for each sample. This accounts for differences in sequencing depth and RNA composition.
Estimate the dispersion parameter for each gene. The dispersion reflects the variance of counts for a gene beyond what is expected under a Poisson model. The dispersions are fitted to improve accuracy, especially for lowly expressed genes.
Fit a negative binomial generalized linear model (GLM). Either using Wald Test (default) or Likelihood Ratio Test.
Apply independent filtering to remove genes with low mean counts that are unlikely to be significant. This increases the power of the tests by reducing the number of tests performed.
Use the Benjamini-Hochberg procedure or other methods to adjust p-values for multiple testing. This controls the FDR, ensuring that the proportion of false positives among the declared significant results is low.
Use the results function to extract the differential expression results, including log2 fold changes, p-values, and adjusted p-values.
dds1 <- DESeq(dds1)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds1)
## [1] "regioncentral" "regionsouth"
## [3] "regioncentral.pop_n2" "regionsouth.pop_n2"
## [5] "regioncentral.pop_n3" "regionsouth.pop_n3"
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# Comparing Low vs High (high is the reference level for treatment) in the central region
res_treat_central<-results(dds1, name="regioncentral.treatmentL")
summary(res_treat_central, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 88, 0.49%
## LFC < 0 (down) : 5, 0.028%
## outliers [1] : 124, 0.69%
## low counts [2] : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Comparing Low vs High (high is the reference level for treatment) in the southern region
res_treat_south<-results(dds1, name="regionsouth.treatmentL")
summary(res_treat_south, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 443, 2.5%
## LFC < 0 (down) : 540, 3%
## outliers [1] : 124, 0.69%
## low counts [2] : 3445, 19%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Combining them into a list
res_dds<-list("central"=res_treat_central,
"south"=res_treat_south)
We can plot the number of DEGs.
# plot
res_dds %>%
lapply(as_tibble,rownames = "gene_id") %>%
bind_rows(.id="population") %>%
drop_na(padj) %>% # drop all genes with NAs
filter(padj<0.1) %>%
mutate(updown=ifelse(log2FoldChange>0, "up", "down")) %>%
group_by(population, updown) %>%
summarise(n=n()) %>%
mutate(n=ifelse(updown=="down", n*-1, n)) %>%
ggplot(aes(x=population, y=n, fill=updown)) +
geom_bar(stat="identity") +
theme_bw() +
theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.
Let’s look at the p-value distributions
par(mfrow=c(2,1))
for(i in 1:length(res_dds)){
hist(res_dds[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds)[i])
}
par(mfrow=c(1,1))
The distribution should be evenly distributed with an inflation of p-values=0. Common “bad” distributions include U-shaped or hill shaped. Therefore, our distribution looks relatively normal.
We try using a local fit instead of a parametric fit (default) for the dispersion. DESeq dispersion quantifies the within-group variability of gene expression. We will plot the dispersion estimates for both the parametric and local fits to visually inspect the differences, and calculate the median of the absolute residuals (difference between observed and fitted dispersions) for both fits to quantitatively compare the fits. Lower residuals indicate a better fit.
# Visually inspect the dispersion of a (default) parametric fit vs a local fit
disp.par <- estimateDispersions(dds1, fitType = "parametric")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
disp.loc <- estimateDispersions(dds1, fitType = "local")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
par(mfrow=c(2,1))
plotDispEsts(disp.par, main= "dispEst: parametric")
plotDispEsts(disp.loc, main= "dispEst: local")
par(mfrow=c(1,1))
# Calculate median of absolute residuals
median(abs(log(mcols(disp.par)$dispGeneEst) - log(mcols(disp.par)$dispFit)))
## [1] 1.020536
median(abs(log(mcols(disp.loc)$dispGeneEst) - log(mcols(disp.loc)$dispFit)))
## [1] 0.7595209
# Local fit presents lower mean absolute residuals, suggesting that that the distance of the residuals to the best fit line is lower.
We calculate the new results using local fit type
dds.loc<-DESeq(dds1, fitType = "local")
## using pre-existing normalization factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds.loc)
## [1] "regioncentral" "regionsouth"
## [3] "regioncentral.pop_n2" "regionsouth.pop_n2"
## [5] "regioncentral.pop_n3" "regionsouth.pop_n3"
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# compares Low vs High (high is the reference level for treatment) in the central region
res_loc_treat_central<-results(dds.loc, name="regioncentral.treatmentL")
summary(res_loc_treat_central, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 93, 0.52%
## LFC < 0 (down) : 6, 0.033%
## outliers [1] : 124, 0.69%
## low counts [2] : 2080, 12%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# compares Low vs High (high is the reference level for treatment) in the central region
res_loc_treat_south<-results(dds.loc, name="regionsouth.treatmentL")
summary(res_loc_treat_south, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 478, 2.7%
## LFC < 0 (down) : 562, 3.1%
## outliers [1] : 124, 0.69%
## low counts [2] : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Combining them into a list
res_dds_loc<-list("central"=res_loc_treat_central,
"south"=res_loc_treat_south)
par(mfrow=c(2,1))
for(i in 1:length(res_dds_loc)){
hist(res_dds_loc[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds_loc)[i])
}
par(mfrow=c(1,1))
# plot
res_dds_loc %>%
lapply(as_tibble,rownames = "gene_id") %>%
bind_rows(.id="population") %>%
drop_na(padj) %>% # drop all genes with NAs
filter(padj<0.1) %>%
mutate(updown=ifelse(log2FoldChange>0, "up", "down")) %>%
group_by(population, updown) %>%
summarise(n=n()) %>%
mutate(n=ifelse(updown=="down", n*-1, n)) %>%
ggplot(aes(x=population, y=n, fill=updown)) +
geom_bar(stat="identity") +
theme_bw() +
theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.
Lets export the results with the default null distribution and only light pre-filtering
# Make a results folder if it does not yet exist
dir.create("results", showWarnings = FALSE)
# Save DESeq2 object
saveRDS(dds1, "./results/deseq2_regions_dds.rds")
saveRDS(dds.loc, "./results/deseq2_regions_local.rds")
# Save results object
saveRDS(res_dds,
"./results/deseq2_regions_results.rds")
saveRDS(res_dds,
"./results/deseq2_regions_results_local.rds")