Differential expression analysis of absolute proteomics

Overview

In this section a limma-piano workflow for analyzing absolute proteomics data (TMT-iBAQ) from three S. cerevisiae strains: a Crabtree Positive, CENPK113-11C, used as reference; and two Crabtree Negative strains, sZJD23, and sZJD28.

Setup work-space

To run this pipeline the following packages are needed:

"tidyverse", "ggalluvial", "ggrepel", "statmod", "limma", "piano", "patchwork"

Additionally, a function will be needed to facilitate the analysis.

Code
case_de_type <- function(logFC, fdr) {
  
  if (logFC < 0 & fdr <= 0.01) {
    "down_hs"
  } else if (logFC < 0 & fdr <= 0.05 & fdr > 0.01) {
    "down_ls"
  } else if (logFC > 0 & fdr <= 0.01) {
    "up_hs"
  } else if (logFC > 0 & fdr <= 0.05 & fdr > 0.01) {
    "up_ls"
  } else {
    "ns"
  }
}

add_de_count <- function(x, y, FC = 1, ls = 0.05, hs = 0.01) {

#' add_de_count
#'
#' @param x a summary dataframe of GSA results
#' @param y a dataframe with differential expression results
#' @param FC a log2FC cutoff. Default 1
#' @param ls a pval to be low significant. Default 0.05
#' @param hs a pval to be high significant 
#'
#' @return a dataframe with columns with count and genes
#' @export
#'
#' @examples
  
  x <- x |> 
    add_column(genes_up_hs = "",
               count_up_hs = 0,
               genes_up_ls = "",
               count_up_ls = 0,
               genes_ns = "",
               count_ns = 0,
               genes_down_ls = "",
               count_down_ls = 0,
               genes_down_hs = "",
               count_down_hs = 0)
  
  for (i in 1:nrow(x)) {
    
    genes <- str_split(x$genes[i], ", ")
    
    for (j in genes[[1]]) {
      
      idx <- which(j == y$GeneName)
      
      if (y$logFC[idx] < FC & y$fdr[idx] <= hs) {
        x$genes_down_hs[i] <- if_else(x$genes_down_hs[i] != "",
                                      str_c(x$genes_down_hs[i], ", ", j),
                                      j)   
        x$count_down_hs[i] <- x$count_down_hs[i] + 1
      } else if (y$logFC[idx] < FC & y$fdr[idx] <= ls & y$fdr[idx] > hs) {
        x$genes_down_ls[i] <- if_else(x$genes_down_ls[i] != "",
                                      str_c(x$genes_down_ls[i], ", ", j),
                                      j)
        x$count_down_ls[i] <- x$count_down_ls[i] + 1
      } else if (y$logFC[idx] > FC & y$fdr[idx] <= hs) {
        x$genes_up_hs[i] <- if_else(x$genes_up_hs[i] != "",
                                    str_c(x$genes_up_hs[i], ", ", j),
                                    j)
        x$count_up_hs[i] <- x$count_up_hs[i] + 1
      } else if (y$logFC[idx] > FC & y$fdr[idx] <= ls & y$fdr[idx] > hs) {
        x$genes_up_ls[i] <- if_else(x$genes_up_ls[i] != "",
                                    str_c(x$genes_up_ls[i], ", ", j),
                                    j)
        x$count_up_ls[i] <- x$count_up_ls[i] + 1
      } else {
        x$genes_ns[i] <- if_else(x$genes_ns[i] != "",
                                 str_c(x$genes_ns[i], ", ", j),
                                 j)
        x$count_ns[i] <- x$count_ns[i] + 1
      }
    }
  }
  return(x)
}

Data

For this analysis the following data is needed:

  1. Abundance proteomics, the dataset used here is in [fmol/ug Protein]
  2. Total protein content in the cell, dataset is in [g/g Protein]
  3. GO terms for the genes in S. cerevisiae. This will be used for the Gene Set Analysis.
Note

ensembl data can be retrieve using biomaRt package, however, sometimes and due to internet speed it presents problems to download the data. To avoid this, alternatively, database for S. cerevisiae can be retrieve using the wget UNIX command. Ouput file can be loaded as a normal tsv file.

go_database <- biomaRt::getBM(
        attributes = c("ensembl_gene_id",
                       "uniprotswissprot",
                       "go_id",
                       "name_1006",
                       "namespace_1003"),
        mart = biomaRt::useDataset("scerevisiae_gene_ensembl",
                                   biomaRt::useMart("ensembl")))
wget -O ensembl.txt 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "scerevisiae_gene_ensembl" interface = "default" ><Attribute name = "ensembl_gene_id" /><Attribute name = "uniprotswissprot" /><Attribute name = "go_id" /><Attribute name = "name_1006" /><Attribute name = "namespace_1003" /></Dataset></Query>'

Prepare raw data

As mentioned, our data is in [fmol/ug Protein]. (i) To be consistent with the second stage of this study (Genome Scale Modeling), where [mg/gDW] will be used, (ii) to make comparable analysis bewteen strains, and (iii) to avoid false detection of differential expressed proteins due to low numbers, let’s transform the data to [ug/gDW].

Code
proteomics_ug <- abundance |>
    rowwise() |>
    # 1. Converte to ug/gDW.
    # Convert fmol/ug -> mol/ug and Mw from kDa (kg/mol) -> ug/mol
    # Convert to ug/gDW
    mutate(CENPK113_11C_1 = 
               ((abundance_WT_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[1] * 10^6,
           CENPK113_11C_2 = 
               ((abundance_WT_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[1] * 10^6,
           CENPK113_11C_3 = 
               ((abundance_WT_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[1] * 10^6,
           sZJD23_1 = 
               ((abundance_sZJD23_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[2] * 10^6,
           sZJD23_2 = 
               ((abundance_sZJD23_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[2] * 10^6,
           sZJD23_3 = 
               ((abundance_sZJD23_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[2] * 10^6,
           sZJD28_1 = 
               ((abundance_sZJD28_1 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl1[3] * 10^6,
           sZJD28_2 = 
               ((abundance_sZJD28_2 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl2[3] * 10^6,
           sZJD28_3 = 
               ((abundance_sZJD28_3 * 10^-15) * (MW_kDa * 10^9)) * 
               prot_content$repl3[3] * 10^6) |>
    select(1:4, 29:37)
Accession Description PrimaryGeneName GeneName CENPK113_11C_1 CENPK113_11C_2 CENPK113_11C_3 sZJD23_1 sZJD23_2 sZJD23_3 sZJD28_1 sZJD28_2 sZJD28_3
Q8ZND6 Phosphate acetyltransferase OS=Salmonella enterica OX=28901 GN=pta PE=3 SV=1 PTA STM2338 1304.509289 1477.844445 1172.653874 5706.989451 6347.537860 4680.379184 8502.407069 6850.306910 10277.648572
D4YIP5 Pyruvate oxidase OS=Aerococcus viridans ATCC 11563 = CCUG 4311 OX=655812 GN=spxB PE=3 SV=1 SPXB AWM76_03655 67.616249 70.265165 57.486100 778.253249 834.871350 684.528932 534.097458 476.027418 645.899803
D6VTK4 Pheromone alpha factor receptor OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=STE2 PE=1 SV=2 STE2 YFL026W 73.917485 70.141567 54.239325 15.311769 15.511993 12.565787 23.455898 15.473736 18.805507
D6W196 Truncated non-functional calcium-binding mitochondrial carrier SAL1-1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=SAL1 PE=1 SV=2 SAL1 YNL083W 6.106214 7.512049 5.811902 10.508809 11.961704 9.797355 13.072501 11.552796 14.070033
O13297 mRNA-capping enzyme subunit beta OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=CET1 PE=1 SV=2 CET1 YPL228W 13.407504 15.572914 12.658895 15.259795 15.874536 12.709740 18.052200 15.676924 20.328332
O13329 DNA replication fork-blocking protein FOB1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=FOB1 PE=1 SV=2 FOB1 YDR110W 13.157625 13.891989 10.567888 7.149232 7.173132 5.407684 9.138712 7.620268 10.671553
O13539 THO complex subunit THP2 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=THP2 PE=1 SV=1 THP2 YHR167W 4.908577 4.752081 3.626766 3.034308 3.019418 2.340897 2.086474 2.196449 2.885695
O13563 26S proteasome regulatory subunit RPN13 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPN13 PE=1 SV=1 RPN13 YLR421C 11.777894 13.532398 10.409479 5.060106 5.205230 4.484698 8.257870 7.609551 10.792270
O13585 Dilute domain-containing protein YPR089W OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=YPR089W PE=1 SV=2 unknown YPR089W 12.349842 13.716000 11.134884 9.054286 11.288863 8.827652 14.887530 12.178191 16.359752
O14455 60S ribosomal protein L36-B OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPL36B PE=1 SV=3 RPL36B YPL249C-A 2126.737210 2120.527362 1787.688821 207.961355 63.520198 51.189917 74.642963 66.429517 92.617188

Explore the raw data

First, let’s take at look at the summary of the raw proteomics data. This provides a basic description of these (min and max values, median, mean, and quantiles).

Code
proteomics_ug |>
    group_by() |> 
    reframe(across(CENPK113_11C_1:sZJD28_3, 
                     ~ c(min = min(.x),
                         max = max(.x), 
                         mean = mean(.x), 
                         median = median(.x),
                         qs_01 = quantile(.x, 0.01),
                         qs_25 = quantile(.x, 0.25),
                         qs_75 = quantile(.x, 0.75)))) |> 
    add_column(
        stats = c("min.", "max.", "mean", "median", "10%", "25%", "75%"),
        .before = "CENPK113_11C_1") |> 
    kbl(digits = 2) |> 
    kable_styling() |> 
    scroll_box(width = "100%")
stats CENPK113_11C_1 CENPK113_11C_2 CENPK113_11C_3 sZJD23_1 sZJD23_2 sZJD23_3 sZJD28_1 sZJD28_2 sZJD28_3
min. 0.01 0.02 0.02 0.02 0.01 0.00 0.01 0.02 0.03
max. 19957.58 17048.99 14613.77 12305.95 13291.44 10491.31 34795.63 30255.15 42096.73
mean 129.22 133.55 106.91 103.75 108.14 85.38 104.03 88.76 117.20
median 13.28 14.26 11.49 12.18 12.70 9.93 13.19 11.47 14.98
10% 0.26 0.33 0.26 0.13 0.12 0.08 0.18 0.16 0.21
25% 4.74 5.25 4.18 4.08 4.18 3.24 4.71 4.03 5.39
75% 41.73 45.53 35.91 36.30 37.76 30.05 39.27 33.57 43.48

We will use box plot to summarize distributions, and distribution density plot to visualize the log transformed abundances. If the data do not have well aligned box plot, then normalization is required.

Code
proteomics_ug |> 
    select(5:13) |> 
    gather(variable, value) |> 
    ggplot(aes(x = str_sub(variable, start = -1),
               y = log2(value),
               color = str_sub(variable, end = -3))) +
    geom_boxplot(outlier.shape = 1, outlier.alpha = 0.5) +
    scale_color_manual(values = color_strain) +
    labs(x = "Replicates",
         y = expr(log["2"] ~ "[ug/gDW]"),
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom")

Distribution of raw values
Code
proteomics_ug |> 
    select(5:13) |> 
    gather(variable, value) |> 
    ggplot(aes(x = log2(value),
               group = variable,
               color = str_sub(variable, end = -3))) +
    geom_density() +
    scale_color_manual(values = color_strain) +
    labs(x = expr(log["2"] ~ "[ug/gDW]"),
         y = "Density",
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom")

Density of raw values

The main variability is expected to come from biological differences between samples. One way of visualizing is to look at the first principal components of the PCA, then, is expected to separate samples from the different strains. Let´s visualize the experiment variability of the strains to be analyzed.

Code
pca_prot <- proteomics_ug |> 
    select(5:13) |> 
    t() |>  # we want to analyze the experiments
    prcomp()

# calculate the percentage for each PC
pca_prot$pct <- round(pca_prot$sdev^2 / sum(pca_prot$sdev^2) * 100, 2)

pca_prot$x |> 
    as_tibble() |> 
    add_column(sample = colnames(proteomics_ug |>  select(5:13))) |> 
    ggplot(aes(x = PC1, y = PC2)) +
    geom_point(aes(color = str_sub(sample, end = -3)),
               size = 4,
               alpha = 0.9) +
    geom_text(aes(label = str_sub(sample, start = -1)),
              vjust = -0.2,
              hjust = -1) +
    scale_color_manual(values = color_strain) +
    geom_hline(yintercept = 0, linetype = 2, alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
    labs(x = str_glue("PC1 ({pca_prot$pct[1]}%)"),
         y = str_glue("PC2 ({pca_prot$pct[2]}%)"),
         color = "Strains") +
    theme_minimal()

PCA of absolute proteomics quantification (raw values)

Normalization

In order to make the data comparable between replicates and strain, it is important to normalize the data. Here, quantile normalization will be used

Code
data_norm <- normalizeBetweenArrays(proteomics_ug |> 
                                      select(1, 5:13) |> 
                                      column_to_rownames(var = "Accession"),
                                    method = "quantile") |> 
  as_tibble(rownames = NA) |> 
  rownames_to_column(var = "Accession")

In low numbers, it can be notice that a comparison between samples or strains can produce false or not probably differential expression. Then, let’s filter low numbers. 0.19 is selected as the lowest 10% data values.

Using limma package for proteomics is based on the idea of treat proteomics data as micro-array data. For this, is recommended to use log-transformed values.

Code
data_filtered <- data_norm |> 
  column_to_rownames(var = "Accession") |> 
  filter(rowSums(across(CENPK113_11C_1:sZJD28_3) >= 0.19) >= 6) |> 
  rownames_to_column(var = "Accession") |> 
  rowwise() |> 
  mutate(CENPK113_11C_1 = log2(CENPK113_11C_1),
         CENPK113_11C_2 = log2(CENPK113_11C_2),
         CENPK113_11C_3 = log2(CENPK113_11C_3),
         sZJD23_1 = log2(sZJD23_1),
         sZJD23_2 = log2(sZJD23_2),
         sZJD23_3 = log2(sZJD23_3),
         sZJD28_1 = log2(sZJD28_1),
         sZJD28_2 = log2(sZJD28_2),
         sZJD28_3 = log2(sZJD28_3))

Again, let’s visualise the data, now, normalized log2 values

Code
data_filtered |> 
    select(2:10) |> 
    gather(variable, value) |> 
    ggplot(aes(x = str_sub(variable, start = -1),
               y = value,
               color = str_sub(variable, end = -3))) +
    geom_boxplot(outlier.shape = 1,
                 outlier.alpha = 0.5,
                 alpha = 0.9) +
    scale_color_manual(values = color_strain) +
    labs(x = "Replicates",
         y = expr(log["2"] ~ "[ug/gDW]"),
         color = "Strain") +
    theme_minimal() +
    theme(legend.position = "bottom",
          text = element_text(size = 10))

Distribution of normalized values
Code
data_filtered |> 
    select(2:10) |> 
    gather(variable, value) |> 
    ggplot(aes(x = value,
               group = variable,
               color = str_sub(variable, end = -3))) +
    geom_density() +
    scale_color_manual(values = color_strain) +
    labs(x = expr(log["2"] ~ "[mmol/gDW]"),
         y = "Density",
         color = "Strains") +
    theme_minimal() +
    theme(legend.position = "bottom",
          text = element_text(size = 10))

Density of normalized values

Differential analysis by using limma

To determine differential expressed proteins, here Limma package is used. In order to do that, we need to define the design matrix. We will use CENPK113 11C as reference, and compare it vs sZJD23 and sZJD28.

Code
group <- str_sub(colnames(data_filtered |>  select(2:10)), end = -3)

design <- model.matrix(~group, ref = "CENPK113_11C")

Now we can perform the differential expression pipeline and summary the results. First, do the linear model fitting. Then, do the empirical Bayes moderation of the test statistic. Finally, let’s see a summary of the results. Summary results shows Up regulated those with a log2FC higher than 0.25. In contrast, down regulated proteins are those with log2FC lower than 0.25

Code
fit <- lmFit(data_filtered, design)

fit <- eBayes(fit, robust = TRUE)

fit <- treat(fit, lfc = 0.25)

Once we have the results from limma, we need to make this information human-readable as well as, add some information to facilitate the analysis. So, Let’s take the information in topTable, then we can get the data to plot candidates.

Code
de_sZJD23 <- abundance |>  
  # Select the id information
  select(1:4) |> 
  # Add filtered data
  inner_join(data_filtered |> select(1:7),
             by = "Accession") |> 
  # Add limma results
  inner_join(topTable(fit,
                      coef = "groupsZJD23",
                      adjust = "BH",
                      sort.by = "none", # Not sort to add the protein id, etc
                      number = 10000),
             by = "Accession") |> 
  rename(pVal = P.Value, fdr = adj.P.Val)

# Repeat for sZJD28
de_sZJD28 <- abundance |>  
  select(1:4) |> 
  inner_join(data_filtered |> select(1:7),
             by = "Accession") |> 
  inner_join(topTable(fit,
                      coef = "groupsZJD28",
                      adjust = "BH",
                      sort.by = "none",
                      number = 10000),
             by = "Accession") |> 
  rename(pVal = P.Value, fdr = adj.P.Val)

# Create a summary data frame containing both condition data, and add a new column that helps in plotting
de_summary <- inner_join(x = de_sZJD23 |>  select(1:4, 11, 14:15),
                         y = de_sZJD28 |>  select(1:4, 11, 14:15),
                         by = c("Accession", "GeneName",
                                "Description", "PrimaryGeneName"),
                         suffix = c("_sZJD23", "_sZJD28")) |> 
    rowwise() |> 
    mutate(euclidean = sqrt(sum((logFC_sZJD23 - logFC_sZJD28)^2)),
        de_sZJD23 = case_de_type(logFC_sZJD23, fdr_sZJD23),
        de_sZJD28 = case_de_type(logFC_sZJD28, fdr_sZJD28),
        changes = str_c(de_sZJD23, "_sZJD23 & ", de_sZJD28, "_sZJD28"),
        # Don’t take into account high or low significance in changes
        changes = str_remove_all(changes, "ls_|hs_"))

Let’s visualize the DE results. \(fold~change=\frac{test}{ref}\). test, are sZJD23 or sZJD28, and ref is CENPK113_11C

Visualize the frequency of log2 fold changes and distributions of raw p-values computed by the statistical test for the comparisons done. This distribution is expected to be a peak around 0 corresponding to the differential expressed features.

Code
de_summary |> 
    ggplot(aes(logFC_sZJD23)) +
    geom_histogram(binwidth = 0.5, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0,
               linetype = 2,
               linewidth = 1,
               color = color_strain[2]) +
    labs(x = expr(log["2"] ~ "Fold change"),
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

de_summary |> 
    ggplot(aes(pVal_sZJD23)) +
    geom_histogram(binwidth = 0.05, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0.05,
               linetype = 2,
               linewidth = 1,
               color = color_strain[2]) +
    labs(x = "Raw p-value",
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

Fold change

Raw p-value

sZJD23 vs CENPK113_11C

Code
de_summary |> 
    ggplot(aes(logFC_sZJD28)) +
    geom_histogram(binwidth = 0.5, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0,
               linetype = 2,
               linewidth = 1,
               color = color_strain[3]) +
    labs(x = expr(log["2"] ~ "Fold change"),
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

de_summary |> 
    ggplot(aes(pVal_sZJD28)) +
    geom_histogram(binwidth = 0.05, boundary = 0.5, color = "white") +
    geom_vline(xintercept = 0.05,
               linetype = 2,
               linewidth = 1,
               color = color_strain[3]) +
    labs(x = "Raw p-value",
         y = "Frequency") +
    theme_minimal() +
    theme(text = element_text(size = 10))

Fold change

Raw p-value

sZJD23 vs CENPK113_11C

Finally, shows the volcano plots for the comparisons performed. Differentially expressed features are still highlighted in blue and red. The volcano plot represents the log of the adjusted P value as a function of the log ratio of differential expression

Code
de_summary |> 
  ggplot(aes(x = logFC_sZJD23,
             y = -log10(fdr_sZJD23),
             color = de_sZJD23)) +
  geom_point(size = 2) +
  scale_color_manual(values = color_de,
                     labels = c("Down Hs", 
                                "Down Ls", 
                                "Ns", 
                                "Up Ls", 
                                "Up Hs")) +
  geom_hline(yintercept = 1.3, linetype = 2, alpha = 0.5) +
  geom_hline(yintercept = 2, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = 0.5, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = -0.5, linetype = 2, alpha = 0.5) +
  labs(x = expr(log["2"] ~ "Fold change"),
       y = expr(-log["10"] ~ FDR),
       caption = str_glue("Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n", 
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.title = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        text = element_text(size = 10))

de_summary |> 
  ggplot(aes(x = logFC_sZJD28,
             y = -log10(fdr_sZJD28),
             color = de_sZJD28)) +
  geom_point(size = 2) +
  scale_color_manual(values = color_de,
                     labels = c("Down Hs", 
                                "Down Ls", 
                                "Ns", 
                                "Up Ls", 
                                "Up Hs")) +
  geom_hline(yintercept = 1.3, linetype = 2, alpha = 0.5) +
  geom_hline(yintercept = 2, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = 0.5, linetype = 2, alpha = 0.5) +
  geom_vline(xintercept = -0.5, linetype = 2, alpha = 0.5) +
  labs(x = expr(log["2"] ~ "Fold change"),
       y = expr(-log["10"] ~ FDR),
       caption = str_glue('Ls: Low significant (0.01 > fdr \u2264 0.05)\n',
                          "Hs: High significant (fdr \u2264 0.01)\n", 
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.title = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        text = element_text(size = 10))

sZJD23

sZJD28

Volcano plot

Since we had two strains that we want to compare using the same as reference. So, we can visualise better how genes are expressed between strains. For that, let’s create an alluvial plot.

Code
de_summary |> 
  select(12:14) |> 
  gather(strain, de, -changes) |> 
  mutate(strain = str_remove(strain, "de_")) |> 
  group_by(strain, de, changes) |> 
  summarise(n = n()) |> 
  mutate(group = de) |> 
  to_lodes_form(axes = 1:3, key = "de") |> 
  mutate(stratum = factor(stratum, 
                          levels = c(names(color_strain[2:3]),
                                     names(color_de),
                                     names(color_changes)))) |> 
  ggplot(aes(x = de,
             y = n,
             stratum = stratum, 
             alluvium = alluvium,
             fill = group)) +
  geom_flow(width = 0.4, alpha = 0.2) +
  geom_stratum(stat = "stratum",
               aes(fill = after_stat(stratum), 
                   color = "#FFFFFF",
                   alpha = after_stat(stratum))) +
  scale_fill_manual(values = c(color_strain[2:3],
                                        color_de,
                                        color_changes)) +
  scale_color_manual(values = "#FFFFFF") + 
  scale_alpha_manual("stratum", values = c(0.7, 0.7, rep(1, 14))) +
  geom_text(stat = "stratum",
            aes(label = after_stat(c("sZJD28",
                                     "sZJD23",
                                     "Up Hs",
                                     "Up Ls",
                                     "Ns",
                                     "Down Ls",
                                     "Down Hs",
                                     "Ns sZJD23 & Up sZJD28",
                                     "Up sZJD23 & Ns sZJD28",
                                     "Up sZJD23 & Up sZJD28",
                                     "Up sZJD23 & Down sZJD28",
                                     "Ns sZJD23 & Ns sZJD28",
                                     "Down sZJD23 & Up sZJD28",
                                     "Down sZJD23 & Down sZJD28",
                                     "Ns sZJD23 & Down sZJD28",
                                     "Down sZJD23 & Ns sZJD28"))),
            hjust = c(rep("center", 7), rep("left", 9)),
            nudge_x = c(rep(0, 7), rep(0.2, 9)),
            color = rep("#000000", 16),
            size = rep(2, 16),
            fontface = "bold") +
  geom_text(stat = "stratum", 
            aes(label = after_stat(ifelse(str_detect(stratum, " & "), count/2, count))),
            nudge_x = -0.2,
            hjust = "right",
            size = 2)  +
  scale_x_discrete(labels = c("strain" = "Strain",
                              "de" = "DE",
                              "changes" = "Changes"),
                   expand = c(0.02, 0.25, 0, 0.75)) +
  labs(x = "",
       fill = "",
       color = "",
       tag = "A.",
       caption = str_glue("Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid.major.y = element_blank(),
        axis.title.y = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Differential expression flow between strains
Code
p2_a <- last_plot() + theme(text = element_text(size = 8))

Let´s visualize the log2 fold change of sZJD23 vs sZJD28, adding the Euclidean distance between the differential expression

Code
de_summary |> 
  ggplot(aes(x = logFC_sZJD23,
             y = logFC_sZJD28,
             color = euclidean,
             size = euclidean,
             alpha = euclidean)) +
  geom_point() +
  geom_text_repel(aes(label = ifelse(euclidean > 2, PrimaryGeneName, "")),
                  size = 2,
                  color = "#000000",
                  segment.color = "#C5C5C5",
                  max.overlaps = Inf) +
  scale_alpha(range = c(0.1, 1)) + 
  scale_color_gradient(low = "#003A59",
                       high = "#FF5800",
                       guide = "legend") +
  labs(x = expr(log[2] ~ "Fold change sZJD23"),
       y = expr(log[2] ~ "Fold change sZJD28"),
       color = "Euclidean",
       size = "Euclidean",
       alpha = "Euclidean",
       tag = "B.") +
  theme_minimal() +
  theme(plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

Code
# Store the plot to be save
p2_b <- last_plot() + theme(text = element_text(size = 8))

GO analysis by using piano

For the Gene Set Analysis (GSA) will we use piano and the ensembl database of GO terms for S. cerevisiae. But, sZJD23 and sZJD28 are engineered strains containing non-native genes in yeast, so, add information about the genes (pyruvate oxidase and pta) integrated to Crabtree negative strains from quick go.

Code
go_database <- go_database |> 
    add_row(ensembl_gene_id = rep("PTA", 6),
            uniprotswissprot = rep("Q8ZND6", 6),
            go_id = c("GO:0016407",
                      "GO:0016746",
                      "GO:0008959",
                      "GO:0016740",
                      "GO:0005737",
                      "GO:0006085"),
            name_1006 = c("acetyltransferase activity",
                          "acyltransferase activity",
                          "phosphate acetyltransferase activity",
                          "transferase activity",
                          "cytoplasm",
                          "acetyl-CoA biosynthetic process"),
            namespace_1003 = c("molecular_function",
                               "molecular_function",
                               "molecular_function",
                               "molecular_function",
                               "cellular_component",
                               "biological_process")) |> 
    add_row(ensembl_gene_id = rep("SPXB", 5),
            uniprotswissprot = rep("D4YIP5", 5),
            go_id = c("GO:0030976",
                      "GO:0000287",
                      "GO:0003824",
                      "GO:0047112",
                      "GO:0016491"),
            name_1006 = c("thiamine pyrophosphate binding",
                          "magnesium ion binding",
                          "catalytic activity",
                          "pyruvate oxidase activity",
                          "oxidoreductase activity"),
            namespace_1003 = rep("molecular_function", 5))

gsc <- loadGSC(go_database |>  select(ensembl_gene_id, go_id),
               type = "data.frame",
               go_database |>  select(go_id, name_1006))

Now, perform the gene set analysis by using piano.

Code
gsa_sZJD23 <- runGSA(geneLevelStats = setNames(de_summary$fdr_sZJD23,
                                               de_summary$GeneName),
                     directions = setNames(de_summary$logFC_sZJD23,
                                           de_summary$GeneName),
                     gsc = gsc,
                     geneSetStat = "mean",
                     adjMethod = "fdr",
                     ncpus = 4,
                     verbose = FALSE)

gsa_sZJD28 <- runGSA(geneLevelStats = setNames(de_summary$fdr_sZJD28,
                                               de_summary$GeneName),
                     directions = setNames(de_summary$logFC_sZJD28,
                                           de_summary$GeneName),
                     gsc = gsc,
                     geneSetStat = "mean",
                     adjMethod = "fdr",
                     ncpus = 4,
                     verbose = FALSE)

GSA using piano shows p-values from directional (down and up) and non-directional p-values. Here we will use as cutoff < 0.05 non-directional p-values, to then select the top distinct 15 up and down

Code
gsa_sum_sZJD23 <- GSAsummaryTable(gsa_sZJD23) |> 
  add_column(gsa_sZJD23$gsc |> 
               enframe() |> 
               rowwise() |> 
               mutate(genes = toString(value)) |> 
               select(genes)) |> 
  rename(go_id = Name) |> 
  filter(`Genes (tot)` >= 5 &
           `p (non-dir.)` <= 0.05 &
           (`p (dist.dir.dn)` <= 0.05 | `p (dist.dir.up)` <= 0.05)) |> 
  select(1, 20, 2, 16, 12, 10, 7, 4) |> 
  # add the gene counts to the GSA summary table
  add_de_count(de_sZJD23, FC = 0) |> 
  inner_join(go_database |> 
               group_by(go_id, name_1006, namespace_1003) |> 
               summarise(nGenes_gsc = n()),
             by = "go_id") |> 
  relocate(name_1006, namespace_1003, nGenes_gsc, .after = go_id) |> 
  # Reduce some long names
  mutate(name_1006 = case_when(
    (go_id == "GO:0000447") ~ "endonucleolytic cleavage in ITS1",
    (go_id == "GO:0000462") ~ "maturation of SSU-rRNA from tricistronic rRNA",
    (go_id == "GO:0000472") ~ "generate mature 5'-end of SSU-rRNA",
    (go_id == "GO:0000480") ~ "5'-ETS of tricistronic rRNA transcript",
    .default = name_1006))

# Repeat for sZJD28
gsa_sum_sZJD28 <- GSAsummaryTable(gsa_sZJD28) |> 
  add_column(gsa_sZJD28$gsc |> 
               enframe() |> 
               rowwise() |> 
               mutate(genes = toString(value)) |> 
               select(genes)) |> 
  rename(go_id = Name) |> 
  filter(`Genes (tot)` >= 5 &
           `p (non-dir.)` <= 0.05 &
           (`p (dist.dir.dn)` <= 0.05 | `p (dist.dir.up)` <= 0.05)) |> 
  select(1, 20, 2, 16, 12, 10, 7, 4) |> 
  # add the gene counts to the GSA summary table
  add_de_count(de_sZJD28, FC = 0) |> 
  inner_join(go_database |> 
               group_by(go_id, name_1006, namespace_1003) |> 
               summarise(nGenes_gsc = n()),
             by = "go_id") |> 
  relocate(name_1006, namespace_1003, nGenes_gsc, .after = go_id) |> 
  # Reduce some long names
  mutate(name_1006 = case_when(
    (go_id == "GO:0000447") ~ "endonucleolytic cleavage in ITS1",
    (go_id == "GO:0000462") ~ "maturation of SSU-rRNA from tricistronic rRNA",
    (go_id == "GO:0000472") ~ "generate mature 5'-end of SSU-rRNA",
    (go_id == "GO:0000480") ~ "5'-ETS of tricistronic rRNA transcript",
    .default = name_1006))

Let’s visualise the GSA results for distinct p-values

Code
gsa_sum_sZJD23 |> 
  gather(variable, value, -c(1:9, 12:21)) |> 
  group_by(variable) |> 
  slice_min(order_by = value, n = 10) |> 
  mutate(go_id = str_glue("{go_id} - {name_1006}")) |>
  ungroup(variable) |> 
  select(1, 3, 4, 6, starts_with("count")) |> 
  arrange(namespace_1003, 
          desc(count_up_hs + count_up_ls / count_down_hs + count_down_ls)) |> 
  mutate(go_id = factor(go_id, levels = go_id)) |> 
  gather(variable, value, - c(1:4)) |> 
  mutate(variable = factor(str_remove(variable, "count_"),
                           levels = c("up_hs",
                                      "up_ls",
                                      "ns",
                                      "down_ls",
                                      "down_hs"))) |> 
  ggplot(aes(x = value / `Genes (tot)`,
             y = go_id,
             color = namespace_1003,
             group = namespace_1003)) +
  geom_col(aes(fill = variable), position = "stack", color = FALSE) +
  geom_text(aes(x = 1.01,
                label = ifelse(variable == "up_hs",
                               `Genes (tot)`, "")),
            hjust = "left",
            nudge_x = 0.01,
            size = 2) +
  geom_point(aes(x = -0.05), shape = 15) +
  annotate("text",
           x = 1.06,
           y = 10,
           label = "Total proteins in GSA",
           angle = 90,
           vjust = 2,
           size = 2) +
  scale_color_manual(values = color_go,
                     labels = c("BP", "CC", "MF")) +
  scale_fill_manual(values = color_de,
                    labels = c("Up Hs", 
                               "Up Ls", 
                               "Ns", 
                               "Down Ls", 
                               "Down Hs")) +
  geom_vline(xintercept = seq(0, 1, by = 0.05),
             linetype = 3,
             linewidth = 0.5,
             color = "#FFFFFF") +
  labs(x = "Protein ratio", 
       y = "GO Term",
       color = "Domain",
       fill = "Significance",
       tag = "A.",
       caption = str_glue("MF: Molecular Function, ", 
                          "BP: Biological Process, ",
                          "CC: Cellular Component\n",
                          "Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

GSA distinct p-values of sZJD23
Code
# Store the plot to be save
p3_a <- last_plot() + theme(text = element_text(size = 8))

# Repeat for sZJD28
gsa_sum_sZJD28 |> 
  gather(variable, value, -c(1:9, 12:21)) |> 
  group_by(variable) |> 
  slice_min(order_by = value, n = 10) |> 
  mutate(go_id = str_glue("{go_id} - {name_1006}")) |>
  ungroup(variable) |> 
  select(1, 3, 4, 6, starts_with("count")) |> 
  arrange(namespace_1003, 
          desc(count_up_hs + count_up_ls / count_down_hs + count_down_ls)) |> 
  mutate(go_id = factor(go_id, levels = go_id)) |> 
  gather(variable, value, - c(1:4)) |> 
  mutate(variable = factor(str_remove(variable, "count_"),
                           levels = c("up_hs",
                                      "up_ls",
                                      "ns",
                                      "down_ls",
                                      "down_hs"))) |> 
  ggplot(aes(x = value / `Genes (tot)`,
             y = go_id,
             color = namespace_1003,
             group = namespace_1003)) +
  geom_col(aes(fill = variable), position = "stack", color = FALSE) +
  geom_text(aes(x = 1.01,
                label = ifelse(variable == "up_hs",
                               `Genes (tot)`, "")),
            hjust = "left",
            nudge_x = 0.01,
            size = 2) +
  geom_point(aes(x = -0.05), shape = 15) +
  annotate("text",
           x = 1.06,
           y = 14,
           label = "Total proteins in GSA",
           angle = 90,
           vjust = 2,
           size = 2) +
  scale_color_manual(values = color_go,
                     labels = c("BP", "CC", "MF")) +
  scale_fill_manual(values = color_de,
                    labels = c("Up Hs", 
                               "Up Ls", 
                               "Ns", 
                               "Down Ls", 
                               "Down Hs")) +
  geom_vline(xintercept = seq(0, 1, by = 0.05),
             linetype = 3,
             linewidth = 0.5,
             color = "#FFFFFF") +
  labs(x = "Protein ratio", 
       y = "GO Term",
       color = "Domain",
       fill = "Significance",
       tag = "B.",
       caption = str_glue("MF: Molecular Function, ", 
                          "BP: Biological Process, ",
                          "CC: Cellular Component\n",
                          "Ls: Low significant (0.01 > fdr \u2264 0.05)\n",
                          "Hs: High significant (fdr \u2264 0.01)\n",
                          "Ns: No significant (fdr > 0.05)")) +
  theme_minimal() +
  theme(plot.caption.position = "plot",
        plot.caption = element_text(hjust = 0),
        plot.tag = element_text(face = "bold"),
        text = element_text(size = 10))

GSA distinct p-values of sZJD28
Code
# Store the plot to be save
p3_b <- last_plot() + theme(text = element_text(size = 8))

R session info

Code
sessionInfo()
R version 4.5.1 (2025-06-13)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Bogota
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] patchwork_1.3.2   piano_2.26.0      limma_3.66.0      statmod_1.5.1    
 [5] ggpubr_0.6.2      ggrepel_0.9.6     ggalluvial_0.12.5 kableExtra_1.4.0 
 [9] knitr_1.50        lubridate_1.9.4   forcats_1.0.1     stringr_1.6.0    
[13] dplyr_1.1.4       purrr_1.2.0       readr_2.1.6       tidyr_1.3.1      
[17] tibble_3.3.0      ggplot2_4.0.1     tidyverse_2.0.0  

loaded via a namespace (and not attached):
 [1] snowfall_1.84-6.3    bitops_1.0-9         rlang_1.1.6         
 [4] magrittr_2.0.4       shinydashboard_0.7.3 otel_0.2.0          
 [7] compiler_4.5.1       systemfonts_1.3.1    vctrs_0.6.5         
[10] relations_0.6-15     crayon_1.5.3         pkgconfig_2.0.3     
[13] fastmap_1.2.0        backports_1.5.0      labeling_0.4.3      
[16] caTools_1.18.3       promises_1.5.0       rmarkdown_2.30      
[19] tzdb_0.5.0           ragg_1.5.0           bit_4.6.0           
[22] xfun_0.55            jsonlite_2.0.0       later_1.4.4         
[25] BiocParallel_1.44.0  broom_1.0.11         parallel_4.5.1      
[28] sets_1.0-25          cluster_2.1.8.1      R6_2.6.1            
[31] stringi_1.8.7        RColorBrewer_1.1-3   car_3.1-3           
[34] Rcpp_1.1.0           snow_0.4-4           httpuv_1.6.16       
[37] Matrix_1.7-4         igraph_2.2.1         timechange_0.3.0    
[40] tidyselect_1.2.1     rstudioapi_0.17.1    abind_1.4-8         
[43] yaml_2.3.12          gplots_3.3.0         codetools_0.2-20    
[46] lattice_0.22-7       Biobase_2.70.0       shiny_1.12.1        
[49] withr_3.0.2          S7_0.2.1             evaluate_1.0.5      
[52] xml2_1.5.1           pillar_1.11.1        carData_3.0-5       
[55] KernSmooth_2.23-26   DT_0.34.0            shinyjs_2.1.0       
[58] generics_0.1.4       vroom_1.6.7          hms_1.1.4           
[61] scales_1.4.0         gtools_3.9.5         xtable_1.8-4        
[64] marray_1.88.0        glue_1.8.0           slam_0.1-55         
[67] tools_4.5.1          data.table_1.17.8    fgsea_1.36.0        
[70] ggsignif_0.6.4       visNetwork_2.1.4     fastmatch_1.1-6     
[73] cowplot_1.2.0        grid_4.5.1           Formula_1.2-5       
[76] cli_3.6.5            textshaping_1.0.4    viridisLite_0.4.2   
[79] svglite_2.2.2        gtable_0.3.6         rstatix_0.7.3       
[82] digest_0.6.39        BiocGenerics_0.56.0  htmlwidgets_1.6.4   
[85] farver_2.1.2         htmltools_0.5.9      lifecycle_1.0.4     
[88] mime_0.13            bit64_4.6.0-1