Import gene expression data (TPM normalized counts) and phenotypic/clinical metadata

genes <- read.delim("GSE229904/Data/GSE229904_norm_counts_TPM_GRCh38.p13_NCBI.tsv.gz")
feno <- read.csv("GSE229904/Data/GSE229904_Annotations.csv", row.names = 1)

Download useful packages

library(tidyverse)
library(skimr)
library(gtsummary)
library(gt)
library(ggpubr)

Tidy the phenotypic dataset

feno <- feno %>%
  dplyr::select(!starts_with("character"))

colnames(feno) <- gsub("\\.ch1$", "", colnames(feno))

cols_to_factor <- c("gleason.primary", "gleason.secondary", "gleason.sum", "risk.group",  
                    "source_name", "description", "library_selection", "library_strategy",  
                    "biochemical.recurrence", "cm", "isup", "lapc", "pn", "pt",  
                    "residual_tumor", "tmprss2.erg")

valid_cols <- intersect(cols_to_factor, colnames(feno))
feno[valid_cols] <- lapply(feno[valid_cols], factor)

Is there an association between the primary Gleason pattern and preoperative PSA?

ggplot(feno, aes(x = gleason.primary, y = preop.psa)) +
  geom_jitter(width = 0.1)

Is there an association between the secondary Gleason pattern and postoperative PSA?

ggplot(feno, aes(x = gleason.secondary, y = postop.psa)) +
  geom_jitter(width = 0.1)

Is there an association between the secondary Gleason pattern and preoperative PSA?

ggplot(feno, aes(x = gleason.secondary, y = preop.psa)) +
  geom_jitter(width = 0.1)

Is there an association between the primary Gleason pattern and postoperative PSA?

ggplot(feno, aes(x = gleason.primary, y = postop.psa)) +
  geom_jitter(width = 0.1)

For getting a better summary of patient characteristics:

General summary provided by R using table feno

tab1 <- feno %>% select(-title, -geo_accession, -gleason.primary, -gleason.secondary) %>% 
  tbl_summary()

Create a new table called feno1 from feno, keeping only the rows corresponding to RNA-Seq and primary tumor samples. So, pay attention in columns “library_strategy” and “source_name_ch1”). From now on, use this filtered data using these new tables. Adapt the genes table to match feno1; the result is called genes1

feno1 <- feno %>%
  dplyr::filter(library_strategy == "RNA-Seq" & source_name_ch1 == "primary tumor")

genes1 <- genes %>%
  dplyr::select('GeneID', any_of(feno1$geo_accession))

Create a summary table of feno1 using gtsummary, excluding irrelevant columns Save the summary table as a .png file

tab2 <- feno1 %>%
  select(-title, -geo_accession, -gleason.primary, -gleason.secondary,
         -source_name_ch1, -library_strategy, -description, -library_selection) %>%
  tbl_summary()

Same gt summary table as before, but now stratified by a specific variable: biochemical.recurrence

tab3 <- feno1 %>%
  select(-title, -geo_accession, -gleason.primary, -gleason.secondary,
         -source_name_ch1, -library_strategy, -description, -library_selection) %>%
  tbl_summary(by = biochemical.recurrence)

Same gt summary table as before, but now stratified by a specific variable: risk.group

tab4=feno1 %>% select(-title,-geo_accession,-gleason.primary,-gleason.secondary,-source_name_ch1, -library_strategy, -description, -library_selection) %>% 
  tbl_summary(by=risk.group)

Boxplots to determine the age distribution of patients who experience recurrence

  1. Evaluated using biochemical recurrence status
ggplot(data = feno1 %>% filter(biochemical.recurrence != "NA"), 
       aes(x = biochemical.recurrence, y = age)) +
  geom_boxplot(fill = "violet") +
  labs(title = "Age distribution by biochemical recurrence status",
       x = "Biochemical recurrence", 
       y = "Age (years)") +
  theme_minimal() +
  scale_y_continuous(breaks = seq(floor(min(feno1$age, na.rm = TRUE) / 5) * 5, 
                                  ceiling(max(feno1$age, na.rm = TRUE) / 5) * 5, 
                                  by = 5)) +
  stat_compare_means(method = "t.test", label = "p.format")

  1. Evaluated based on postoperative PSA levels
ggplot(feno1, aes(x = postop.psa >= 0.3, y = age)) +
  geom_boxplot(fill = "violet") +
  scale_x_discrete(labels = c(">0.3", "<=0.3")) +
  labs(title = "Age distribution by postoperative PSA levels",
       x = "Postoperative PSA levels", 
       y = "Age (years)") +
  theme_minimal()

  1. Evaluated based on residual tumor status (R0 vs R1)
ggplot(feno1, aes(x = r.residual_tumor, y = age)) +
  geom_boxplot(fill = "violet") +
  labs(title = "Age distribution by residual tumor status", 
       x = "Residual tumor", 
       y = "Age (years)") +
  theme_minimal()

Is there an association between age and preoperative PSA?

ggplot(feno1, aes(x = age, y = preop.psa, color = gleason.sum)) +
  geom_jitter(alpha = 0.6, width = 0.2, height = 0.2) +
  geom_smooth(method = "lm", color = "black", se = TRUE, fill = "gray70") + 
  labs(title = "Relationship between age and preoperative PSA levels",
       x = "Age (years)", 
       y = "Preoperative PSA levels",
       color = "Gleason Score") +
  theme_minimal() +
  scale_color_brewer(palette = "Paired") 

Logistic regresion

This step is FUNDAMENTAL because it defines “brf” as improvement and “bcr” as deterioration

table(feno1$biochemical.recurrence)
## 
## bcr brf 
##  25  84
class(feno1$biochemical.recurrence)
## [1] "factor"
feno1$biochemical.recurrence <- factor(feno1$biochemical.recurrence, levels = c("brf", "bcr"))

Logistic regression between biochemical recurrence and preoperative PSA

modelo <- glm(biochemical.recurrence ~ preop.psa, data = feno1, family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = biochemical.recurrence ~ preop.psa, family = binomial, 
##     data = feno1)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.49699    0.54266  -4.601  4.2e-06 ***
## preop.psa    0.07707    0.03032   2.542    0.011 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 90.072  on 89  degrees of freedom
## Residual deviance: 82.814  on 88  degrees of freedom
##   (50 observations deleted due to missingness)
## AIC: 86.814
## 
## Number of Fisher Scoring iterations: 4

Logistic regression between biochemical recurrence and Gleason score

modelo2 <- glm(biochemical.recurrence ~ gleason.sum, data = feno1, family = binomial)
summary(modelo2)
## 
## Call:
## glm(formula = biochemical.recurrence ~ gleason.sum, family = binomial, 
##     data = feno1)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -2.944      1.026  -2.870  0.00410 **
## gleason.sum7    1.459      1.075   1.358  0.17453   
## gleason.sum8    2.539      1.212   2.095  0.03620 * 
## gleason.sum9    3.414      1.174   2.909  0.00362 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 116.87  on 107  degrees of freedom
## Residual deviance: 100.91  on 104  degrees of freedom
##   (32 observations deleted due to missingness)
## AIC: 108.91
## 
## Number of Fisher Scoring iterations: 5

Logistic regression between gene expression levels and the variable biochemical.recurrence

We first need to combine the phenotypic and genotypic datasets

# Selected genes by their GeneID
muestra <- c(367, 7534, 10397, 348, 1397, 10631)

# Find indices of the genes of interest
indices <- which(genes1$GeneID %in% muestra)

# Subset the gene expression table using those indices
nueva_tabla <- genes1[indices,]

# Rename gene IDs to gene names
nueva_tabla[1,1] <- "YWHAZ"
nueva_tabla[2,1] <- "NDRG1"
nueva_tabla[3,1] <- "POSTN"
nueva_tabla[4,1] <- "CRIP2"
nueva_tabla[5,1] <- "APOE"
nueva_tabla[6,1] <- "AR"

# Set gene names as row names and remove GeneID column
row.names(nueva_tabla) <- nueva_tabla$GeneID
nueva_tabla = nueva_tabla[-1]

# Transpose to have patients in rows and genes in columns
genes_translocados <- t(nueva_tabla)


#merge the clinical and molecular data using merge. I use geo_accession as the reference column that contains the patient IDs.
tabla_unificada <- merge.data.frame(feno1, genes_translocados, by.x = "geo_accession", by.y = "row.names", all = TRUE)
View(tabla_unificada)

Now we are ready to start the Logistic regression between biochemical recurrence and selected genes.

Plot describing logistic regresion:

ggplot(tabla_gen_est_pv, aes(x = Estimate, y = reorder(Gene, Estimate), fill = P_Value)) +
  geom_col(width = 0.4, color = "black") +  # Narrower bars
  scale_fill_gradient(low = "black", high = "white", name = "P-Value") +  # Fill gradient
  scale_x_continuous(limits = c(-0.05, 0.05)) +
  theme_minimal() +  # Clean theme
  labs(x = "Estimate", y = NULL, title = "Regression Coefficients and Significance") +
  theme(axis.text.y = element_blank()) +  # Remove Y axis labels
  geom_text(aes(label = Gene), hjust = ifelse(tabla_gen_est_pv$Estimate > 0, -0.2, 1.2), size = 4) +  # Gene names next to bars
  theme(legend.position = "right")  

# Place legend to the right

Logistic regression between biochemical.recurrence and clinical data

Plot discribing the logistic regresion:

tabla_final$color <- ifelse(tabla_final$P_Value >= 0.05, NA, tabla_final$P_Value)

ggplot(tabla_final, aes(x = Estimate, y = reorder(Dato, Estimate), fill = color)) +
  geom_col(width = 0.6, color = "black") +  # Black border for all bars
  scale_x_continuous(expand = c(0, 0)) +  # Adjust X scale
  scale_fill_gradient(low = "pink", high = "red", name = "P-Value", na.value = "white") +  
  theme_minimal() +  
  labs(x = "Estimate", y = NULL, title = "Regression Coefficients of Clinical Variables") +
  theme(axis.text.y = element_text(size = 10), legend.position = "right")

Diferenttial gene expression (DGE)

Analyzing biochemical recurrence against the whole gene expression data

Some usefull questions:

# How many genes have FDR less than 0.05?
# Genes with FDR < 0.05
significativos_005 <- which(fdr < 0.05, arr.ind = TRUE)
cantidad_significativos_005 <- length(significativos_005) 
cantidad_significativos_005 #1253
## [1] 1253
# Which are the 20 genes with the lowest FDR? --> the most significant ones!!!
orden_fdr <- order(fdr, na.last = NA)
veinte_menores_FDR <- orden_fdr[1:20]
veinte_menores_FDR
##  [1] 26355 11892  2689  4067 20278  4081  4579 31926  6866   734  1118  1285
## [13]  1871  2740  2868  3004  7908  8495 11747 11782
# Which are the 20 genes with the highest absolute log2 fold-change?
orden_logFC <- order(abs(foldChanges), decreasing = TRUE, na.last = NA)
veinte_mayores_LFC <- orden_logFC[1:20]
veinte_mayores_LFC
##  [1] 133 150 156 170 171 176 196 316 319 321 324 325 328 334 336 337 341 342 345
## [20] 349

Volcano plot for visualice DGE:

library(ggrepel)

# Get the 20 most significant genes, i.e., with the lowest FDR
top_genes <- rownames(outRst)[orden_fdr[1:20]]
outRst$gene <- rownames(outRst)

volcanoplot <- ggplot(outRst, aes(x = log2foldChange,
                                  y = -log10(FDR),
                                  color = Significativo,
                                  text = paste0("Gene: ", gene,
                                                "\nlog2FC: ", round(log2foldChange, 2),
                                                "\nFDR: ", signif(FDR, 3)))) +
  geom_point(alpha = 0.8) +
  scale_color_manual(values = c("Significant" = "red", "Not significant" = "gray")) +
  labs(title = "Volcano Plot",
       x = "Log2 Fold Change",
       y = "-log10(FDR)") +
  theme_minimal() +
  geom_text_repel(data = outRst[top_genes, ],
                  aes(label = gene),
                  size = 3, max.overlaps = 10)