Differential expression analysis of absolute proteomics

Overview

In this section a limma-piano workflow for analyzing absolute proteomics data (TMT-iBAQ) from three S. cerevisiae strains: a Crabtree Positive, CENPK113-11C, used as reference; and two Crabtree Negative strains, sZJD23, and sZJD28.

Setup work-space

To run this pipeline the following packages are needed:

"tidyverse", "ggalluvial", "ggrepel", "statmod", "limma", "piano", "patchwork"

Additionally, a function will be needed to facilitate the analysis.

Code

case_de_type <- function(logFC, fdr) {
  
  if (logFC < 0 & fdr <= 0.01) {
    "down_hs"
  } else if (logFC < 0 & fdr <= 0.05 & fdr > 0.01) {
    "down_ls"
  } else if (logFC > 0 & fdr <= 0.01) {
    "up_hs"
  } else if (logFC > 0 & fdr <= 0.05 & fdr > 0.01) {
    "up_ls"
  } else {
    "ns"
  }
}

add_de_count <- function(x, y, FC = 1, ls = 0.05, hs = 0.01) {

#' add_de_count
#'
#' @param x a summary dataframe of GSA results
#' @param y a dataframe with differential expression results
#' @param FC a log2FC cutoff. Default 1
#' @param ls a pval to be low significant. Default 0.05
#' @param hs a pval to be high significant 
#'
#' @return a dataframe with columns with count and genes
#' @export
#'
#' @examples
  
  x <- x |> 
    add_column(genes_up_hs = "",
               count_up_hs = 0,
               genes_up_ls = "",
               count_up_ls = 0,
               genes_ns = "",
               count_ns = 0,
               genes_down_ls = "",
               count_down_ls = 0,
               genes_down_hs = "",
               count_down_hs = 0)
  
  for (i in 1:nrow(x)) {
    
    genes <- str_split(x$genes[i], ", ")
    
    for (j in genes[[1]]) {
      
      idx <- which(j == y$GeneName)
      
      if (y$logFC[idx] < FC & y$fdr[idx] <= hs) {
        x$genes_down_hs[i] <- if_else(x$genes_down_hs[i] != "",
                                      str_c(x$genes_down_hs[i], ", ", j),
                                      j)   
        x$count_down_hs[i] <- x$count_down_hs[i] + 1
      } else if (y$logFC[idx] < FC & y$fdr[idx] <= ls & y$fdr[idx] > hs) {
        x$genes_down_ls[i] <- if_else(x$genes_down_ls[i] != "",
                                      str_c(x$genes_down_ls[i], ", ", j),
                                      j)
        x$count_down_ls[i] <- x$count_down_ls[i] + 1
      } else if (y$logFC[idx] > FC & y$fdr[idx] <= hs) {
        x$genes_up_hs[i] <- if_else(x$genes_up_hs[i] != "",
                                    str_c(x$genes_up_hs[i], ", ", j),
                                    j)
        x$count_up_hs[i] <- x$count_up_hs[i] + 1
      } else if (y$logFC[idx] > FC & y$fdr[idx] <= ls & y$fdr[idx] > hs) {
        x$genes_up_ls[i] <- if_else(x$genes_up_ls[i] != "",
                                    str_c(x$genes_up_ls[i], ", ", j),
                                    j)
        x$count_up_ls[i] <- x$count_up_ls[i] + 1
      } else {
        x$genes_ns[i] <- if_else(x$genes_ns[i] != "",
                                 str_c(x$genes_ns[i], ", ", j),
                                 j)
        x$count_ns[i] <- x$count_ns[i] + 1
      }
    }
  }
  return(x)
}

Data

For this analysis the following data is needed:

Abundance proteomics, the dataset used here is in [fmol/ug Protein]
Total protein content in the cell, dataset is in [g/g Protein]
GO terms for the genes in S. cerevisiae. This will be used for the Gene Set Analysis.

Note

ensembl data can be retrieve using biomaRt package, however, sometimes and due to internet speed it presents problems to download the data. To avoid this, alternatively, database for S. cerevisiae can be retrieve using the wget UNIX command. Ouput file can be loaded as a normal tsv file.

Expand to see command in r

go_database <- biomaRt::getBM(
        attributes = c("ensembl_gene_id",
                       "uniprotswissprot",
                       "go_id",
                       "name_1006",
                       "namespace_1003"),
        mart = biomaRt::useDataset("scerevisiae_gene_ensembl",
                                   biomaRt::useMart("ensembl")))

Expand to see the UNIX command

wget -O ensembl.txt 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "scerevisiae_gene_ensembl" interface = "default" ><Attribute name = "ensembl_gene_id" /><Attribute name = "uniprotswissprot" /><Attribute name = "go_id" /><Attribute name = "name_1006" /><Attribute name = "namespace_1003" /></Dataset></Query>'

Prepare raw data

As mentioned, our data is in [fmol/ug Protein]. (i) To be consistent with the second stage of this study (Genome Scale Modeling), where [mg/gDW] will be used, (ii) to make comparable analysis bewteen strains, and (iii) to avoid false detection of differential expressed proteins due to low numbers, let’s transform the data to [ug/gDW].

Code

proteomics_ug <- abundance |>
    rowwise() |>
    # 1. Converte to ug/gDW.
    # Convert fmol/ug -> mol/ug and Mw from kDa (kg/mol) -> ug/mol
    # Convert to ug/gDW
    mutate(CENPK113_11C_1 = 
               ((abundance_WT_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[1] * 10^6,
           CENPK113_11C_2 = 
               ((abundance_WT_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[1] * 10^6,
           CENPK113_11C_3 = 
               ((abundance_WT_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[1] * 10^6,
           sZJD23_1 = 
               ((abundance_sZJD23_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[2] * 10^6,
           sZJD23_2 = 
               ((abundance_sZJD23_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[2] * 10^6,
           sZJD23_3 = 
               ((abundance_sZJD23_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[2] * 10^6,
           sZJD28_1 = 
               ((abundance_sZJD28_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[3] * 10^6,
           sZJD28_2 = 
               ((abundance_sZJD28_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[3] * 10^6,
           sZJD28_3 = 
               ((abundance_sZJD28_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[3] * 10^6) |>
    select(1:4, 29:37)

Accession	Description	PrimaryGeneName	GeneName	CENPK113_11C_1	CENPK113_11C_2	CENPK113_11C_3	sZJD23_1	sZJD23_2	sZJD23_3	sZJD28_1	sZJD28_2	sZJD28_3
Q8ZND6	Phosphate acetyltransferase OS=Salmonella enterica OX=28901 GN=pta PE=3 SV=1	PTA	STM2338	1304.509289	1477.844445	1172.653874	5706.989451	6347.537860	4680.379184	8502.407069	6850.306910	10277.648572
D4YIP5	Pyruvate oxidase OS=Aerococcus viridans ATCC 11563 = CCUG 4311 OX=655812 GN=spxB PE=3 SV=1	SPXB	AWM76_03655	67.616249	70.265165	57.486100	778.253249	834.871350	684.528932	534.097458	476.027418	645.899803
D6VTK4	Pheromone alpha factor receptor OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=STE2 PE=1 SV=2	STE2	YFL026W	73.917485	70.141567	54.239325	15.311769	15.511993	12.565787	23.455898	15.473736	18.805507
D6W196	Truncated non-functional calcium-binding mitochondrial carrier SAL1-1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=SAL1 PE=1 SV=2	SAL1	YNL083W	6.106214	7.512049	5.811902	10.508809	11.961704	9.797355	13.072501	11.552796	14.070033
O13297	mRNA-capping enzyme subunit beta OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=CET1 PE=1 SV=2	CET1	YPL228W	13.407504	15.572914	12.658895	15.259795	15.874536	12.709740	18.052200	15.676924	20.328332
O13329	DNA replication fork-blocking protein FOB1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=FOB1 PE=1 SV=2	FOB1	YDR110W	13.157625	13.891989	10.567888	7.149232	7.173132	5.407684	9.138712	7.620268	10.671553
O13539	THO complex subunit THP2 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=THP2 PE=1 SV=1	THP2	YHR167W	4.908577	4.752081	3.626766	3.034308	3.019418	2.340897	2.086474	2.196449	2.885695
O13563	26S proteasome regulatory subunit RPN13 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPN13 PE=1 SV=1	RPN13	YLR421C	11.777894	13.532398	10.409479	5.060106	5.205230	4.484698	8.257870	7.609551	10.792270
O13585	Dilute domain-containing protein YPR089W OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=YPR089W PE=1 SV=2	unknown	YPR089W	12.349842	13.716000	11.134884	9.054286	11.288863	8.827652	14.887530	12.178191	16.359752
O14455	60S ribosomal protein L36-B OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPL36B PE=1 SV=3	RPL36B	YPL249C-A	2126.737210	2120.527362	1787.688821	207.961355	63.520198	51.189917	74.642963	66.429517	92.617188

Explore the raw data

First, let’s take at look at the summary of the raw proteomics data. This provides a basic description of these (min and max values, median, mean, and quantiles).

Code

proteomics_ug |>
    group_by() |> 
    reframe(across(CENPK113_11C_1:sZJD28_3, 
                     ~ c(min = min(.x),
                         max = max(.x), 
                         mean = mean(.x), 
                         median = median(.x),
                         qs_01 = quantile(.x, 0.01),
                         qs_25 = quantile(.x, 0.25),
                         qs_75 = quantile(.x, 0.75)))) |> 
    add_column(
        stats = c("min.", "max.", "mean", "median", "10%", "25%", "75%"),
        .before = "CENPK113_11C_1") |> 
    kbl(digits = 2) |> 
    kable_styling() |> 
    scroll_box(width = "100%")

stats	CENPK113_11C_1	CENPK113_11C_2	CENPK113_11C_3	sZJD23_1	sZJD23_2	sZJD23_3	sZJD28_1	sZJD28_2	sZJD28_3
min.	0.01	0.02	0.02	0.02	0.01	0.00	0.01	0.02	0.03
max.	19957.58	17048.99	14613.77	12305.95	13291.44	10491.31	34795.63	30255.15	42096.73
mean	129.22	133.55	106.91	103.75	108.14	85.38	104.03	88.76	117.20
median	13.28	14.26	11.49	12.18	12.70	9.93	13.19	11.47	14.98
10%	0.26	0.33	0.26	0.13	0.12	0.08	0.18	0.16	0.21
25%	4.74	5.25	4.18	4.08	4.18	3.24	4.71	4.03	5.39
75%	41.73	45.53	35.91	36.30	37.76	30.05	39.27	33.57	43.48

We will use box plot to summarize distributions, and distribution density plot to visualize the log transformed abundances. If the data do not have well aligned box plot, then normalization is required.

Code

proteomics_ug |> 
    select(5:13) |> 
    gather(variable, value) |> 
    ggplot(aes(x = str_sub(variable, start = -1),
               y = log2(value),
               color = str_sub(variable, end = -3))) +
    geom_boxplot(outlier.shape = 1, outlier.alpha = 0.5) +
    scale_color_manual(values = color_strain) +
    labs(x = "Replicates",
         y = expr(log["2"] ~ "[ug/gDW]"),
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom")

Code

proteomics_ug |> 
    select(5:13) |> 
    gather(variable, value) |> 
    ggplot(aes(x = log2(value),
               group = variable,
               color = str_sub(variable, end = -3))) +
    geom_density() +
    scale_color_manual(values = color_strain) +
    labs(x = expr(log["2"] ~ "[ug/gDW]"),
         y = "Density",
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom")

The main variability is expected to come from biological differences between samples. One way of visualizing is to look at the first principal components of the PCA, then, is expected to separate samples from the different strains. Let´s visualize the experiment variability of the strains to be analyzed.

Code

pca_prot <- proteomics_ug |> 
    select(5:13) |> 
    t() |>  # we want to analyze the experiments
    prcomp()

# calculate the percentage for each PC
pca_prot$pct <- round(pca_prot$sdev^2 / sum(pca_prot$sdev^2) * 100, 2)

pca_prot$x |> 
    as_tibble() |> 
    add_column(sample = colnames(proteomics_ug |>  select(5:13))) |> 
    ggplot(aes(x = PC1, y = PC2)) +
    geom_point(aes(color = str_sub(sample, end = -3)),
               size = 4,
               alpha = 0.9) +
    geom_text(aes(label = str_sub(sample, start = -1)),
              vjust = -0.2,
              hjust = -1) +
    scale_color_manual(values = color_strain) +
    geom_hline(yintercept = 0, linetype = 2, alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
    labs(x = str_glue("PC1 ({pca_prot$pct[1]}%)"),
         y = str_glue("PC2 ({pca_prot$pct[2]}%)"),
         color = "Strains") +
    theme_minimal()

PCA of absolute proteomics quantification (raw values)

Normalization

In order to make the data comparable between replicates and strain, it is important to normalize the data. Here, quantile normalization will be used

Code

data_norm <- normalizeBetweenArrays(proteomics_ug |> 
                                      select(1, 5:13) |> 
                                      column_to_rownames(var = "Accession"),
                                    method = "quantile") |> 
  as_tibble(rownames = NA) |> 
  rownames_to_column(var = "Accession")

In low numbers, it can be notice that a comparison between samples or strains can produce false or not probably differential expression. Then, let’s filter low numbers. 0.19 is selected as the lowest 10% data values.

Using limma package for proteomics is based on the idea of treat proteomics data as micro-array data. For this, is recommended to use log-transformed values.

Code

data_filtered <- data_norm |> 
  column_to_rownames(var = "Accession") |> 
  filter(rowSums(across(CENPK113_11C_1:sZJD28_3) >= 0.19) >= 6) |> 
  rownames_to_column(var = "Accession") |> 
  rowwise() |> 
  mutate(CENPK113_11C_1 = log2(CENPK113_11C_1),
         CENPK113_11C_2 = log2(CENPK113_11C_2),
         CENPK113_11C_3 = log2(CENPK113_11C_3),
         sZJD23_1 = log2(sZJD23_1),
         sZJD23_2 = log2(sZJD23_2),
         sZJD23_3 = log2(sZJD23_3),
         sZJD28_1 = log2(sZJD28_1),
         sZJD28_2 = log2(sZJD28_2),
         sZJD28_3 = log2(sZJD28_3))

Again, let’s visualise the data, now, normalized log2 values

Code

data_filtered |> 
    select(2:10) |> 
    gather(variable, value) |> 
    ggplot(aes(x = str_sub(variable, start = -1),
               y = value,
               color = str_sub(variable, end = -3))) +
    geom_boxplot(outlier.shape = 1,
                 outlier.alpha = 0.5,
                 alpha = 0.9) +
    scale_color_manual(values = color_strain) +
    labs(x = "Replicates",
         y = expr(log["2"] ~ "[ug/gDW]"),
         color = "Strain") +
    theme_minimal() +
    theme(legend.position = "bottom",
          text = element_text(size = 10))

Code

data_filtered |> 
    select(2:10) |> 
    gather(variable, value) |> 
    ggplot(aes(x = value,
               group = variable,
               color = str_sub(variable, end = -3))) +
    geom_density() +
    scale_color_manual(values = color_strain) +
    labs(x = expr(log["2"] ~ "[mmol/gDW]"),
         y = "Density",
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom",
          text = element_text(size = 10))

Differential analysis by using limma

To determine differential expressed proteins, here Limma package is used. In order to do that, we need to define the design matrix. We will use CENPK113 11C as reference, and compare it vs sZJD23 and sZJD28.

Code

group <- str_sub(colnames(data_filtered |>  select(2:10)), end = -3)

design <- model.matrix(~group, ref = "CENPK113_11C")

Now we can perform the differential expression pipeline and summary the results. First, do the linear model fitting. Then, do the empirical Bayes moderation of the test statistic. Finally, let’s see a summary of the results. Summary results shows Up regulated those with a log2FC higher than 0.25. In contrast, down regulated proteins are those with log2FC lower than 0.25

Code

fit <- lmFit(data_filtered, design)

fit <- eBayes(fit, robust = TRUE)

fit <- treat(fit, lfc = 0.25)

Once we have the results from limma, we need to make this information human-readable as well as, add some information to facilitate the analysis. So, Let’s take the information in topTable, then we can get the data to plot candidates.

Code

de_sZJD23 <- abundance |>  
  # Select the id information
  select(1:4) |> 
  # Add filtered data
  inner_join(data_filtered |> select(1:7),
             by = "Accession") |> 
  # Add limma results
  inner_join(topTable(fit,
                      coef = "groupsZJD23",
                      adjust = "BH",
                      sort.by = "none", # Not sort to add the protein id, etc
                      number = 10000),
             by = "Accession") |> 
  rename(pVal = P.Value, fdr = adj.P.Val)

# Repeat for sZJD28
de_sZJD28 <- abundance |>  
  select(1:4) |> 
  inner_join(data_filtered |> select(1:7),
             by = "Accession") |> 
  inner_join(topTable(fit,
                      coef = "groupsZJD28",
                      adjust = "BH",
                      sort.by = "none",
                      number = 10000),
             by = "Accession") |> 
  rename(pVal = P.Value, fdr = adj.P.Val)

# Create a summary data frame containing both condition data, and add a new column that helps in plotting
de_summary <- inner_join(x = de_sZJD23 |>  select(1:4, 11, 14:15),
                         y = de_sZJD28 |>  select(1:4, 11, 14:15),
                         by = c("Accession", "GeneName",
                                "Description", "PrimaryGeneName"),
                         suffix = c("_sZJD23", "_sZJD28")) |> 
    rowwise() |> 
    mutate(euclidean = sqrt(sum((logFC_sZJD23 - logFC_sZJD28)^2)),
        de_sZJD23 = case_de_type(logFC_sZJD23, fdr_sZJD23),
        de_sZJD28 = case_de_type(logFC_sZJD28, fdr_sZJD28),
        changes = str_c(de_sZJD23, "_sZJD23 & ", de_sZJD28, "_sZJD28"),
        # Don’t take into account high or low significance in changes
        changes = str_remove_all(changes, "ls_|hs_"))

Let’s visualize the DE results. \(fold~change=\frac{test}{ref}\). test, are sZJD23 or sZJD28, and ref is CENPK113_11C

Visualize the frequency of log2 fold changes and distributions of raw p-values computed by the statistical test for the comparisons done. This distribution is expected to be a peak around 0 corresponding to the differential expressed features.

Code

de_summary |> 
    ggplot(aes(logFC_sZJD23)) +
    geom_histogram(binwidth = 0.5, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0,
               linetype = 2,
               linewidth = 1,
               color = color_strain[2]) +
    labs(x = expr(log["2"] ~ "Fold change"),
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

de_summary |> 
    ggplot(aes(pVal_sZJD23)) +
    geom_histogram(binwidth = 0.05, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0.05,
               linetype = 2,
               linewidth = 1,
               color = color_strain[2]) +
    labs(x = "Raw p-value",
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

sZJD23 vs CENPK113_11C

Code

de_summary |> 
    ggplot(aes(logFC_sZJD28)) +
    geom_histogram(binwidth = 0.5, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0,
               linetype = 2,
               linewidth = 1,
               color = color_strain[3]) +
    labs(x = expr(log["2"] ~ "Fold change"),
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

de_summary |> 
    ggplot(aes(pVal_sZJD28)) +
    geom_histogram(binwidth = 0.05, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0.05,
               linetype = 2,
               linewidth = 1,
               color = color_strain[3]) +
    labs(x = "Raw p-value",
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

sZJD23 vs CENPK113_11C

Finally, shows the volcano plots for the comparisons performed. Differentially expressed features are still highlighted in blue and red. The volcano plot represents the log of the adjusted P value as a function of the log ratio of differential expression

Code

de_summary |> 
  ggplot(aes(x = logFC_sZJD23,
             y = -log10(fdr_sZJD23),
             color = de_sZJD23)) +
  geom_point(size = 2) +
  scale_color_manual(values = color_de,
                     labels = c("Down Hs", 
                                "Down Ls", 
                                "Ns", 
                                "Up Ls", 
                                "Up Hs")) +
  geom_hline(yintercept = 1.3, linetype = 2, alpha = 0.5) +
  geom_hline(yintercept = 2, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = 0.5, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = -0.5, linetype = 2, alpha = 0.5) +
  labs(x = expr(log["2"] ~ "Fold change"),
       y = expr(-log["10"] ~ FDR),
       caption = str_glue("Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n", 
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.title = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        text = element_text(size = 10))

de_summary |> 
  ggplot(aes(x = logFC_sZJD28,
             y = -log10(fdr_sZJD28),
             color = de_sZJD28)) +
  geom_point(size = 2) +
  scale_color_manual(values = color_de,
                     labels = c("Down Hs", 
                                "Down Ls", 
                                "Ns", 
                                "Up Ls", 
                                "Up Hs")) +
  geom_hline(yintercept = 1.3, linetype = 2, alpha = 0.5) +
  geom_hline(yintercept = 2, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = 0.5, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = -0.5, linetype = 2, alpha = 0.5) +
  labs(x = expr(log["2"] ~ "Fold change"),
       y = expr(-log["10"] ~ FDR),
       caption = str_glue('Ls: Low significant (0.01 > fdr \u2264 0.05)\n',
                          "Hs: High significant (fdr \u2264 0.01)\n", 
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.title = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        text = element_text(size = 10))

Volcano plot

Since we had two strains that we want to compare using the same as reference. So, we can visualise better how genes are expressed between strains. For that, let’s create an alluvial plot.

Code

de_summary |> 
  select(12:14) |> 
  gather(strain, de, -changes) |> 
  mutate(strain = str_remove(strain, "de_")) |> 
  group_by(strain, de, changes) |> 
  summarise(n = n()) |> 
  mutate(group = de) |> 
  to_lodes_form(axes = 1:3, key = "de") |> 
  mutate(stratum = factor(stratum, 
                          levels = c(names(color_strain[2:3]),
                                     names(color_de),
                                     names(color_changes)))) |> 
  ggplot(aes(x = de,
             y = n,
             stratum = stratum, 
             alluvium = alluvium,
             fill = group)) +
  geom_flow(width = 0.4, alpha = 0.2) +
  geom_stratum(stat = "stratum",
               aes(fill = after_stat(stratum), 
                   color = "#FFFFFF",
                   alpha = after_stat(stratum))) +
  scale_fill_manual(values = c(color_strain[2:3],
                                        color_de,
                                        color_changes)) +
  scale_color_manual(values = "#FFFFFF") + 
  scale_alpha_manual("stratum", values = c(0.7, 0.7, rep(1, 14))) +
  geom_text(stat = "stratum",
            aes(label = after_stat(c("sZJD28",
                                     "sZJD23",
                                     "Up Hs",
                                     "Up Ls",
                                     "Ns",
                                     "Down Ls",
                                     "Down Hs",
                                     "Ns sZJD23 & Up sZJD28",
                                     "Up sZJD23 & Ns sZJD28",
                                     "Up sZJD23 & Up sZJD28",
                                     "Up sZJD23 & Down sZJD28",
                                     "Ns sZJD23 & Ns sZJD28",
                                     "Down sZJD23 & Up sZJD28",
                                     "Down sZJD23 & Down sZJD28",
                                     "Ns sZJD23 & Down sZJD28",
                                     "Down sZJD23 & Ns sZJD28"))),
            hjust = c(rep("center", 7), rep("left", 9)),
            nudge_x = c(rep(0, 7), rep(0.2, 9)),
            color = rep("#000000", 16),
            size = rep(2, 16),
            fontface = "bold") +
  geom_text(stat = "stratum", 
            aes(label = after_stat(ifelse(str_detect(stratum, " & "), count/2, count))),
            nudge_x = -0.2,
            hjust = "right",
            size = 2)  +
  scale_x_discrete(labels = c("strain" = "Strain",
                              "de" = "DE",
                              "changes" = "Changes"),
                   expand = c(0.02, 0.25, 0, 0.75)) +
  labs(x = "",
       fill = "",
       color = "",
       tag = "A.",
       caption = str_glue("Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid.major.y = element_blank(),
        axis.title.y = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Differential expression flow between strains

Code

p2_a <- last_plot() + theme(text = element_text(size = 8))

Let´s visualize the log2 fold change of sZJD23 vs sZJD28, adding the Euclidean distance between the differential expression

Code

de_summary |> 
  ggplot(aes(x = logFC_sZJD23,
             y = logFC_sZJD28,
             color = euclidean,
             size = euclidean,
             alpha = euclidean)) +
  geom_point() +
  geom_text_repel(aes(label = ifelse(euclidean > 2, PrimaryGeneName, "")),
                  size = 2,
                  color = "#000000",
                  segment.color = "#C5C5C5",
                  max.overlaps = Inf) +
  scale_alpha(range = c(0.1, 1)) + 
  scale_color_gradient(low = "#003A59",
                       high = "#FF5800",
                       guide = "legend") +
  labs(x = expr(log[2] ~ "Fold change sZJD23"),
       y = expr(log[2] ~ "Fold change sZJD28"),
       color = "Euclidean",
       size = "Euclidean",
       alpha = "Euclidean",
       tag = "B.") +
  theme_minimal() +
  theme(plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Code

# Store the plot to be save
p2_b <- last_plot() + theme(text = element_text(size = 8))

GO analysis by using piano

For the Gene Set Analysis (GSA) will we use piano and the ensembl database of GO terms for S. cerevisiae. But, sZJD23 and sZJD28 are engineered strains containing non-native genes in yeast, so, add information about the genes (pyruvate oxidase and pta) integrated to Crabtree negative strains from quick go.

Code

go_database <- go_database |> 
    add_row(ensembl_gene_id = rep("PTA", 6),
            uniprotswissprot = rep("Q8ZND6", 6),
            go_id = c("GO:0016407",
                      "GO:0016746",
                      "GO:0008959",
                      "GO:0016740",
                      "GO:0005737",
                      "GO:0006085"),
            name_1006 = c("acetyltransferase activity",
                          "acyltransferase activity",
                          "phosphate acetyltransferase activity",
                          "transferase activity",
                          "cytoplasm",
                          "acetyl-CoA biosynthetic process"),
            namespace_1003 = c("molecular_function",
                               "molecular_function",
                               "molecular_function",
                               "molecular_function",
                               "cellular_component",
                               "biological_process")) |> 
    add_row(ensembl_gene_id = rep("SPXB", 5),
            uniprotswissprot = rep("D4YIP5", 5),
            go_id = c("GO:0030976",
                      "GO:0000287",
                      "GO:0003824",
                      "GO:0047112",
                      "GO:0016491"),
            name_1006 = c("thiamine pyrophosphate binding",
                          "magnesium ion binding",
                          "catalytic activity",
                          "pyruvate oxidase activity",
                          "oxidoreductase activity"),
            namespace_1003 = rep("molecular_function", 5))

gsc <- loadGSC(go_database |>  select(ensembl_gene_id, go_id),
               type = "data.frame",
               go_database |>  select(go_id, name_1006))

Now, perform the gene set analysis by using piano.

Code

gsa_sZJD23 <- runGSA(geneLevelStats = setNames(de_summary$fdr_sZJD23,
                                               de_summary$GeneName),
                     directions = setNames(de_summary$logFC_sZJD23,
                                           de_summary$GeneName),
                     gsc = gsc,
                     geneSetStat = "mean",
                     adjMethod = "fdr",
                     ncpus = 4,
                     verbose = FALSE)

gsa_sZJD28 <- runGSA(geneLevelStats = setNames(de_summary$fdr_sZJD28,
                                               de_summary$GeneName),
                     directions = setNames(de_summary$logFC_sZJD28,
                                           de_summary$GeneName),
                     gsc = gsc,
                     geneSetStat = "mean",
                     adjMethod = "fdr",
                     ncpus = 4,
                     verbose = FALSE)

GSA using piano shows p-values from directional (down and up) and non-directional p-values. Here we will use as cutoff < 0.05 non-directional p-values, to then select the top distinct 15 up and down

Code

gsa_sum_sZJD23 <- GSAsummaryTable(gsa_sZJD23) |> 
  add_column(gsa_sZJD23$gsc |> 
               enframe() |> 
               rowwise() |> 
               mutate(genes = toString(value)) |> 
               select(genes)) |> 
  rename(go_id = Name) |> 
  filter(`Genes (tot)` >= 5 &
           `p (non-dir.)` <= 0.05 &
           (`p (dist.dir.dn)` <= 0.05 | `p (dist.dir.up)` <= 0.05)) |> 
  select(1, 20, 2, 16, 12, 10, 7, 4) |> 
  # add the gene counts to the GSA summary table
  add_de_count(de_sZJD23, FC = 0) |> 
  inner_join(go_database |> 
               group_by(go_id, name_1006, namespace_1003) |> 
               summarise(nGenes_gsc = n()),
             by = "go_id") |> 
  relocate(name_1006, namespace_1003, nGenes_gsc, .after = go_id) |> 
  # Reduce some long names
  mutate(name_1006 = case_when(
    (go_id == "GO:0000447") ~ "endonucleolytic cleavage in ITS1",
    (go_id == "GO:0000462") ~ "maturation of SSU-rRNA from tricistronic rRNA",
    (go_id == "GO:0000472") ~ "generate mature 5'-end of SSU-rRNA",
    (go_id == "GO:0000480") ~ "5'-ETS of tricistronic rRNA transcript",
    .default = name_1006))

# Repeat for sZJD28
gsa_sum_sZJD28 <- GSAsummaryTable(gsa_sZJD28) |> 
  add_column(gsa_sZJD28$gsc |> 
               enframe() |> 
               rowwise() |> 
               mutate(genes = toString(value)) |> 
               select(genes)) |> 
  rename(go_id = Name) |> 
  filter(`Genes (tot)` >= 5 &
           `p (non-dir.)` <= 0.05 &
           (`p (dist.dir.dn)` <= 0.05 | `p (dist.dir.up)` <= 0.05)) |> 
  select(1, 20, 2, 16, 12, 10, 7, 4) |> 
  # add the gene counts to the GSA summary table
  add_de_count(de_sZJD28, FC = 0) |> 
  inner_join(go_database |> 
               group_by(go_id, name_1006, namespace_1003) |> 
               summarise(nGenes_gsc = n()),
             by = "go_id") |> 
  relocate(name_1006, namespace_1003, nGenes_gsc, .after = go_id) |> 
  # Reduce some long names
  mutate(name_1006 = case_when(
    (go_id == "GO:0000447") ~ "endonucleolytic cleavage in ITS1",
    (go_id == "GO:0000462") ~ "maturation of SSU-rRNA from tricistronic rRNA",
    (go_id == "GO:0000472") ~ "generate mature 5'-end of SSU-rRNA",
    (go_id == "GO:0000480") ~ "5'-ETS of tricistronic rRNA transcript",
    .default = name_1006))

Let’s visualise the GSA results for distinct p-values

Code

gsa_sum_sZJD23 |> 
  gather(variable, value, -c(1:9, 12:21)) |> 
  group_by(variable) |> 
  slice_min(order_by = value, n = 10) |> 
  mutate(go_id = str_glue("{go_id} - {name_1006}")) |>
  ungroup(variable) |> 
  select(1, 3, 4, 6, starts_with("count")) |> 
  arrange(namespace_1003, 
          desc(count_up_hs + count_up_ls / count_down_hs + count_down_ls)) |> 
  mutate(go_id = factor(go_id, levels = go_id)) |> 
  gather(variable, value, - c(1:4)) |> 
  mutate(variable = factor(str_remove(variable, "count_"),
                           levels = c("up_hs",
                                      "up_ls",
                                      "ns",
                                      "down_ls",
                                      "down_hs"))) |> 
  ggplot(aes(x = value / `Genes (tot)`,
             y = go_id,
             color = namespace_1003,
             group = namespace_1003)) +
  geom_col(aes(fill = variable), position = "stack", color = FALSE) +
  geom_text(aes(x = 1.01,
                label = ifelse(variable == "up_hs",
                               `Genes (tot)`, "")),
            hjust = "left",
            nudge_x = 0.01,
            size = 2) +
  geom_point(aes(x = -0.05), shape = 15) +
  annotate("text",
           x = 1.06,
           y = 10,
           label = "Total proteins in GSA",
           angle = 90,
           vjust = 2,
           size = 2) +
  scale_color_manual(values = color_go,
                     labels = c("BP", "CC", "MF")) +
  scale_fill_manual(values = color_de,
                    labels = c("Up Hs", 
                               "Up Ls", 
                               "Ns", 
                               "Down Ls", 
                               "Down Hs")) +
  geom_vline(xintercept = seq(0, 1, by = 0.05),
             linetype = 3,
             linewidth = 0.5,
             color = "#FFFFFF") +
  labs(x = "Protein ratio", 
       y = "GO Term",
       color = "Domain",
       fill = "Significance",
       tag = "A.",
       caption = str_glue("MF: Molecular Function, ", 
                          "BP: Biological Process, ",
                          "CC: Cellular Component\n",
                          "Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Code

# Store the plot to be save
p3_a <- last_plot() + theme(text = element_text(size = 8))

# Repeat for sZJD28
gsa_sum_sZJD28 |> 
  gather(variable, value, -c(1:9, 12:21)) |> 
  group_by(variable) |> 
  slice_min(order_by = value, n = 10) |> 
  mutate(go_id = str_glue("{go_id} - {name_1006}")) |>
  ungroup(variable) |> 
  select(1, 3, 4, 6, starts_with("count")) |> 
  arrange(namespace_1003, 
          desc(count_up_hs + count_up_ls / count_down_hs + count_down_ls)) |> 
  mutate(go_id = factor(go_id, levels = go_id)) |> 
  gather(variable, value, - c(1:4)) |> 
  mutate(variable = factor(str_remove(variable, "count_"),
                           levels = c("up_hs",
                                      "up_ls",
                                      "ns",
                                      "down_ls",
                                      "down_hs"))) |> 
  ggplot(aes(x = value / `Genes (tot)`,
             y = go_id,
             color = namespace_1003,
             group = namespace_1003)) +
  geom_col(aes(fill = variable), position = "stack", color = FALSE) +
  geom_text(aes(x = 1.01,
                label = ifelse(variable == "up_hs",
                               `Genes (tot)`, "")),
            hjust = "left",
            nudge_x = 0.01,
            size = 2) +
  geom_point(aes(x = -0.05), shape = 15) +
  annotate("text",
           x = 1.06,
           y = 14,
           label = "Total proteins in GSA",
           angle = 90,
           vjust = 2,
           size = 2) +
  scale_color_manual(values = color_go,
                     labels = c("BP", "CC", "MF")) +
  scale_fill_manual(values = color_de,
                    labels = c("Up Hs", 
                               "Up Ls", 
                               "Ns", 
                               "Down Ls", 
                               "Down Hs")) +
  geom_vline(xintercept = seq(0, 1, by = 0.05),
             linetype = 3,
             linewidth = 0.5,
             color = "#FFFFFF") +
  labs(x = "Protein ratio", 
       y = "GO Term",
       color = "Domain",
       fill = "Significance",
       tag = "B.",
       caption = str_glue("MF: Molecular Function, ", 
                          "BP: Biological Process, ",
                          "CC: Cellular Component\n",
                          "Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Code

# Store the plot to be save
p3_b <- last_plot() + theme(text = element_text(size = 8))

R session info

Code

sessionInfo()

R version 4.5.1 (2025-06-13)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Bogota
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] patchwork_1.3.2   piano_2.26.0      limma_3.66.0      statmod_1.5.1    
 [5] ggpubr_0.6.2      ggrepel_0.9.6     ggalluvial_0.12.5 kableExtra_1.4.0 
 [9] knitr_1.50        lubridate_1.9.4   forcats_1.0.1     stringr_1.6.0    
[13] dplyr_1.1.4       purrr_1.2.0       readr_2.1.6       tidyr_1.3.1      
[17] tibble_3.3.0      ggplot2_4.0.1     tidyverse_2.0.0  

loaded via a namespace (and not attached):
 [1] snowfall_1.84-6.3    bitops_1.0-9         rlang_1.1.6         
 [4] magrittr_2.0.4       shinydashboard_0.7.3 otel_0.2.0          
 [7] compiler_4.5.1       systemfonts_1.3.1    vctrs_0.6.5         
[10] relations_0.6-15     crayon_1.5.3         pkgconfig_2.0.3     
[13] fastmap_1.2.0        backports_1.5.0      labeling_0.4.3      
[16] caTools_1.18.3       promises_1.5.0       rmarkdown_2.30      
[19] tzdb_0.5.0           ragg_1.5.0           bit_4.6.0           
[22] xfun_0.55            jsonlite_2.0.0       later_1.4.4         
[25] BiocParallel_1.44.0  broom_1.0.11         parallel_4.5.1      
[28] sets_1.0-25          cluster_2.1.8.1      R6_2.6.1            
[31] stringi_1.8.7        RColorBrewer_1.1-3   car_3.1-3           
[34] Rcpp_1.1.0           snow_0.4-4           httpuv_1.6.16       
[37] Matrix_1.7-4         igraph_2.2.1         timechange_0.3.0    
[40] tidyselect_1.2.1     rstudioapi_0.17.1    abind_1.4-8         
[43] yaml_2.3.12          gplots_3.3.0         codetools_0.2-20    
[46] lattice_0.22-7       Biobase_2.70.0       shiny_1.12.1        
[49] withr_3.0.2          S7_0.2.1             evaluate_1.0.5      
[52] xml2_1.5.1           pillar_1.11.1        carData_3.0-5       
[55] KernSmooth_2.23-26   DT_0.34.0            shinyjs_2.1.0       
[58] generics_0.1.4       vroom_1.6.7          hms_1.1.4           
[61] scales_1.4.0         gtools_3.9.5         xtable_1.8-4        
[64] marray_1.88.0        glue_1.8.0           slam_0.1-55         
[67] tools_4.5.1          data.table_1.17.8    fgsea_1.36.0        
[70] ggsignif_0.6.4       visNetwork_2.1.4     fastmatch_1.1-6     
[73] cowplot_1.2.0        grid_4.5.1           Formula_1.2-5       
[76] cli_3.6.5            textshaping_1.0.4    viridisLite_0.4.2   
[79] svglite_2.2.2        gtable_0.3.6         rstatix_0.7.3       
[82] digest_0.6.39        BiocGenerics_0.56.0  htmlwidgets_1.6.4   
[85] farver_2.1.2         htmltools_0.5.9      lifecycle_1.0.4     
[88] mime_0.13            bit64_4.6.0-1