genes <- read.delim("GSE229904/Data/GSE229904_norm_counts_TPM_GRCh38.p13_NCBI.tsv.gz")
feno <- read.csv("GSE229904/Data/GSE229904_Annotations.csv", row.names = 1)
library(tidyverse)
library(skimr)
library(gtsummary)
library(gt)
library(ggpubr)
feno <- feno %>%
dplyr::select(!starts_with("character"))
colnames(feno) <- gsub("\\.ch1$", "", colnames(feno))
cols_to_factor <- c("gleason.primary", "gleason.secondary", "gleason.sum", "risk.group",
"source_name", "description", "library_selection", "library_strategy",
"biochemical.recurrence", "cm", "isup", "lapc", "pn", "pt",
"residual_tumor", "tmprss2.erg")
valid_cols <- intersect(cols_to_factor, colnames(feno))
feno[valid_cols] <- lapply(feno[valid_cols], factor)
ggplot(feno, aes(x = gleason.primary, y = preop.psa)) +
geom_jitter(width = 0.1)
ggplot(feno, aes(x = gleason.secondary, y = postop.psa)) +
geom_jitter(width = 0.1)
ggplot(feno, aes(x = gleason.secondary, y = preop.psa)) +
geom_jitter(width = 0.1)
ggplot(feno, aes(x = gleason.primary, y = postop.psa)) +
geom_jitter(width = 0.1)
General summary provided by R using table feno
tab1 <- feno %>% select(-title, -geo_accession, -gleason.primary, -gleason.secondary) %>%
tbl_summary()
Create a new table called feno1 from feno, keeping only the rows corresponding to RNA-Seq and primary tumor samples. So, pay attention in columns “library_strategy” and “source_name_ch1”). From now on, use this filtered data using these new tables. Adapt the genes table to match feno1; the result is called genes1
feno1 <- feno %>%
dplyr::filter(library_strategy == "RNA-Seq" & source_name_ch1 == "primary tumor")
genes1 <- genes %>%
dplyr::select('GeneID', any_of(feno1$geo_accession))
Create a summary table of feno1 using gtsummary, excluding irrelevant columns Save the summary table as a .png file
tab2 <- feno1 %>%
select(-title, -geo_accession, -gleason.primary, -gleason.secondary,
-source_name_ch1, -library_strategy, -description, -library_selection) %>%
tbl_summary()
Same gt summary table as before, but now stratified by a specific variable: biochemical.recurrence
tab3 <- feno1 %>%
select(-title, -geo_accession, -gleason.primary, -gleason.secondary,
-source_name_ch1, -library_strategy, -description, -library_selection) %>%
tbl_summary(by = biochemical.recurrence)
Same gt summary table as before, but now stratified by a specific variable: risk.group
tab4=feno1 %>% select(-title,-geo_accession,-gleason.primary,-gleason.secondary,-source_name_ch1, -library_strategy, -description, -library_selection) %>%
tbl_summary(by=risk.group)
ggplot(data = feno1 %>% filter(biochemical.recurrence != "NA"),
aes(x = biochemical.recurrence, y = age)) +
geom_boxplot(fill = "violet") +
labs(title = "Age distribution by biochemical recurrence status",
x = "Biochemical recurrence",
y = "Age (years)") +
theme_minimal() +
scale_y_continuous(breaks = seq(floor(min(feno1$age, na.rm = TRUE) / 5) * 5,
ceiling(max(feno1$age, na.rm = TRUE) / 5) * 5,
by = 5)) +
stat_compare_means(method = "t.test", label = "p.format")
ggplot(feno1, aes(x = postop.psa >= 0.3, y = age)) +
geom_boxplot(fill = "violet") +
scale_x_discrete(labels = c(">0.3", "<=0.3")) +
labs(title = "Age distribution by postoperative PSA levels",
x = "Postoperative PSA levels",
y = "Age (years)") +
theme_minimal()
ggplot(feno1, aes(x = r.residual_tumor, y = age)) +
geom_boxplot(fill = "violet") +
labs(title = "Age distribution by residual tumor status",
x = "Residual tumor",
y = "Age (years)") +
theme_minimal()
Is there an association between age and preoperative PSA?
ggplot(feno1, aes(x = age, y = preop.psa, color = gleason.sum)) +
geom_jitter(alpha = 0.6, width = 0.2, height = 0.2) +
geom_smooth(method = "lm", color = "black", se = TRUE, fill = "gray70") +
labs(title = "Relationship between age and preoperative PSA levels",
x = "Age (years)",
y = "Preoperative PSA levels",
color = "Gleason Score") +
theme_minimal() +
scale_color_brewer(palette = "Paired")
This step is FUNDAMENTAL because it defines “brf” as improvement and “bcr” as deterioration
table(feno1$biochemical.recurrence)
##
## bcr brf
## 25 84
class(feno1$biochemical.recurrence)
## [1] "factor"
feno1$biochemical.recurrence <- factor(feno1$biochemical.recurrence, levels = c("brf", "bcr"))
Logistic regression between biochemical recurrence and preoperative PSA
modelo <- glm(biochemical.recurrence ~ preop.psa, data = feno1, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = biochemical.recurrence ~ preop.psa, family = binomial,
## data = feno1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.49699 0.54266 -4.601 4.2e-06 ***
## preop.psa 0.07707 0.03032 2.542 0.011 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 90.072 on 89 degrees of freedom
## Residual deviance: 82.814 on 88 degrees of freedom
## (50 observations deleted due to missingness)
## AIC: 86.814
##
## Number of Fisher Scoring iterations: 4
Logistic regression between biochemical recurrence and Gleason score
modelo2 <- glm(biochemical.recurrence ~ gleason.sum, data = feno1, family = binomial)
summary(modelo2)
##
## Call:
## glm(formula = biochemical.recurrence ~ gleason.sum, family = binomial,
## data = feno1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.944 1.026 -2.870 0.00410 **
## gleason.sum7 1.459 1.075 1.358 0.17453
## gleason.sum8 2.539 1.212 2.095 0.03620 *
## gleason.sum9 3.414 1.174 2.909 0.00362 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 116.87 on 107 degrees of freedom
## Residual deviance: 100.91 on 104 degrees of freedom
## (32 observations deleted due to missingness)
## AIC: 108.91
##
## Number of Fisher Scoring iterations: 5
We first need to combine the phenotypic and genotypic datasets
# Selected genes by their GeneID
muestra <- c(367, 7534, 10397, 348, 1397, 10631)
# Find indices of the genes of interest
indices <- which(genes1$GeneID %in% muestra)
# Subset the gene expression table using those indices
nueva_tabla <- genes1[indices,]
# Rename gene IDs to gene names
nueva_tabla[1,1] <- "YWHAZ"
nueva_tabla[2,1] <- "NDRG1"
nueva_tabla[3,1] <- "POSTN"
nueva_tabla[4,1] <- "CRIP2"
nueva_tabla[5,1] <- "APOE"
nueva_tabla[6,1] <- "AR"
# Set gene names as row names and remove GeneID column
row.names(nueva_tabla) <- nueva_tabla$GeneID
nueva_tabla = nueva_tabla[-1]
# Transpose to have patients in rows and genes in columns
genes_translocados <- t(nueva_tabla)
#merge the clinical and molecular data using merge. I use geo_accession as the reference column that contains the patient IDs.
tabla_unificada <- merge.data.frame(feno1, genes_translocados, by.x = "geo_accession", by.y = "row.names", all = TRUE)
View(tabla_unificada)
Now we are ready to start the Logistic regression between biochemical recurrence and selected genes.
Plot describing logistic regresion:
ggplot(tabla_gen_est_pv, aes(x = Estimate, y = reorder(Gene, Estimate), fill = P_Value)) +
geom_col(width = 0.4, color = "black") + # Narrower bars
scale_fill_gradient(low = "black", high = "white", name = "P-Value") + # Fill gradient
scale_x_continuous(limits = c(-0.05, 0.05)) +
theme_minimal() + # Clean theme
labs(x = "Estimate", y = NULL, title = "Regression Coefficients and Significance") +
theme(axis.text.y = element_blank()) + # Remove Y axis labels
geom_text(aes(label = Gene), hjust = ifelse(tabla_gen_est_pv$Estimate > 0, -0.2, 1.2), size = 4) + # Gene names next to bars
theme(legend.position = "right")
# Place legend to the right
Plot discribing the logistic regresion:
tabla_final$color <- ifelse(tabla_final$P_Value >= 0.05, NA, tabla_final$P_Value)
ggplot(tabla_final, aes(x = Estimate, y = reorder(Dato, Estimate), fill = color)) +
geom_col(width = 0.6, color = "black") + # Black border for all bars
scale_x_continuous(expand = c(0, 0)) + # Adjust X scale
scale_fill_gradient(low = "pink", high = "red", name = "P-Value", na.value = "white") +
theme_minimal() +
labs(x = "Estimate", y = NULL, title = "Regression Coefficients of Clinical Variables") +
theme(axis.text.y = element_text(size = 10), legend.position = "right")
Analyzing biochemical recurrence against the whole gene expression data
Some usefull questions:
# How many genes have FDR less than 0.05?
# Genes with FDR < 0.05
significativos_005 <- which(fdr < 0.05, arr.ind = TRUE)
cantidad_significativos_005 <- length(significativos_005)
cantidad_significativos_005 #1253
## [1] 1253
# Which are the 20 genes with the lowest FDR? --> the most significant ones!!!
orden_fdr <- order(fdr, na.last = NA)
veinte_menores_FDR <- orden_fdr[1:20]
veinte_menores_FDR
## [1] 26355 11892 2689 4067 20278 4081 4579 31926 6866 734 1118 1285
## [13] 1871 2740 2868 3004 7908 8495 11747 11782
# Which are the 20 genes with the highest absolute log2 fold-change?
orden_logFC <- order(abs(foldChanges), decreasing = TRUE, na.last = NA)
veinte_mayores_LFC <- orden_logFC[1:20]
veinte_mayores_LFC
## [1] 133 150 156 170 171 176 196 316 319 321 324 325 328 334 336 337 341 342 345
## [20] 349
Volcano plot for visualice DGE:
library(ggrepel)
# Get the 20 most significant genes, i.e., with the lowest FDR
top_genes <- rownames(outRst)[orden_fdr[1:20]]
outRst$gene <- rownames(outRst)
volcanoplot <- ggplot(outRst, aes(x = log2foldChange,
y = -log10(FDR),
color = Significativo,
text = paste0("Gene: ", gene,
"\nlog2FC: ", round(log2foldChange, 2),
"\nFDR: ", signif(FDR, 3)))) +
geom_point(alpha = 0.8) +
scale_color_manual(values = c("Significant" = "red", "Not significant" = "gray")) +
labs(title = "Volcano Plot",
x = "Log2 Fold Change",
y = "-log10(FDR)") +
theme_minimal() +
geom_text_repel(data = outRst[top_genes, ],
aes(label = gene),
size = 3, max.overlaps = 10)