Data Integration Analysis

Deconvolution of Visium data using snRNA-seq data

# Load necessary libraries
library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 'SeuratObject' was built under R 4.4.1 but the current version is
## 4.4.2; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(Matrix)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(spacexr)
library(ggplot2)

# Define paths
data_dir <- "/Users/landonwellendorf/Desktop/STAT530/Final/GSE223066_split-pipe_outputs"
spatial_dir <- "/Users/landonwellendorf/Desktop/STAT530/Final/"

# Load snRNA-seq data files
count_matrix <- readMM(file.path(data_dir, "DGE.mtx"))
genes <- read.csv(file.path(data_dir, "all_genes.csv"))
cell_metadata <- read.csv(file.path(data_dir, "cell_metadata.csv"))

# Filter for SOR1 sample only
sor1_cells <- cell_metadata$sample == "SOR1"
cell_metadata <- cell_metadata[sor1_cells, ]
count_matrix <- count_matrix[sor1_cells, ]

# Set up count matrix with proper row and column names
count_matrix <- t(count_matrix) # Genes should be rows, cells should be columns
rownames(count_matrix) <- make.unique(genes$gene_name)
colnames(count_matrix) <- cell_metadata$bc_wells

# Remove duplicate gene names if any
count_matrix <- count_matrix[!duplicated(rownames(count_matrix)), ]

# Create Seurat object and perform standard preprocessing
sn_obj <- CreateSeuratObject(counts = count_matrix, meta.data = cell_metadata)

## Warning: Data is of class dgTMatrix. Coercing to dgCMatrix.

sn_obj[["percent.mt"]] <- PercentageFeatureSet(sn_obj, pattern = "^mt-")

# QC filtering
sn_obj <- subset(sn_obj, subset = nCount_RNA < 35000 & percent.mt < 10)

# Normalize, find variable features, scale data, and run PCA
sn_obj <- NormalizeData(sn_obj)

## Normalizing layer: counts

sn_obj <- FindVariableFeatures(sn_obj, nfeatures = 2000)

## Finding variable features for layer counts

sn_obj <- ScaleData(sn_obj)

## Centering and scaling data matrix

sn_obj <- RunPCA(sn_obj, verbose = FALSE)

# Find neighbors and clusters
sn_obj <- FindNeighbors(sn_obj, dims = 1:30)

## Computing nearest neighbor graph

## Computing SNN

sn_obj <- FindClusters(sn_obj, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7237
## Number of edges: 291449
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9051
## Number of communities: 19
## Elapsed time: 0 seconds

# Run UMAP for visualization
sn_obj <- RunUMAP(sn_obj, dims = 1:30)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 12:43:15 UMAP embedding parameters a = 0.9922 b = 1.112

## 12:43:15 Read 7237 rows and found 30 numeric columns

## 12:43:15 Using Annoy for neighbor search, n_neighbors = 30

## 12:43:15 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 12:43:16 Writing NN index file to temp file /var/folders/3_/kq6fv1_d4gl_n25bl_c016gc0000gn/T//RtmpBu5mIu/file87f48d948b3
## 12:43:16 Searching Annoy index using 1 thread, search_k = 3000
## 12:43:17 Annoy recall = 100%
## 12:43:17 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:43:18 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:43:18 Commencing optimization for 500 epochs, with 320164 positive edges
## 12:43:18 Using rng type: pcg
## 12:43:27 Optimization finished

# Define cell type markers
# Excitatory pyramidal neuron markers (associated with theta oscillations)
pyr_markers <- c("Neurod6", "Camk2a", "Slc17a7", "Satb2")

# Inhibitory interneuron markers (associated with gamma oscillations)
int_markers <- c("Gad1", "Gad2", "Sst", "Vip")

# Oscillation-related markers
theta_marker <- "Hcn1"
gamma_marker <- "Pvalb"

# Ensure all markers are present in the dataset
all_markers <- c(pyr_markers, int_markers, theta_marker, gamma_marker)
missing_markers <- all_markers[!all_markers %in% rownames(sn_obj)]
if (length(missing_markers) > 0) {
  print(paste("Warning: The following markers are missing from the dataset:", 
              paste(missing_markers, collapse = ", ")))
}

# Run differential expression to identify cluster markers
cluster_markers <- FindAllMarkers(sn_obj, 
                                 only.pos = TRUE, 
                                 min.pct = 0.25, 
                                 logfc.threshold = 0.25)

## Calculating cluster 0

## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18

# Find clusters enriched for pyramidal and interneuron markers
pyr_enriched <- cluster_markers %>%
  filter(gene %in% pyr_markers & p_val_adj < 0.05) %>%
  group_by(cluster) %>%
  summarize(marker_count = n()) %>%
  arrange(desc(marker_count))

int_enriched <- cluster_markers %>%
  filter(gene %in% int_markers & p_val_adj < 0.05) %>%
  group_by(cluster) %>%
  summarize(marker_count = n()) %>%
  arrange(desc(marker_count))

# Identify the top clusters for each cell type
pyr_cluster <- as.character(pyr_enriched$cluster[1])
int_cluster <- as.character(int_enriched$cluster[1])

print(paste("Pyramidal neuron cluster:", pyr_cluster))

## [1] "Pyramidal neuron cluster: 2"

print(paste("Interneuron cluster:", int_cluster))

## [1] "Interneuron cluster: 5"

# Check correlation between cell type markers and oscillation markers
# in single-nucleus data first
DefaultAssay(sn_obj) <- "RNA"
gene_expr <- GetAssayData(sn_obj, slot = "data")

# Define the genes to explore
pyr_expr <- colMeans(gene_expr[pyr_markers, , drop = FALSE])
int_expr <- colMeans(gene_expr[int_markers, , drop = FALSE])
hcn1_expr <- gene_expr[theta_marker, ]
pvalb_expr <- gene_expr[gamma_marker, ]

# Add to metadata
sn_obj$pyr_score <- pyr_expr
sn_obj$int_score <- int_expr

# Calculate correlations
pyr_hcn1_cor <- cor(pyr_expr, hcn1_expr, method = "spearman")
int_pvalb_cor <- cor(int_expr, pvalb_expr, method = "spearman")

print(paste("Single-nucleus correlation - Pyramidal score vs Hcn1:", pyr_hcn1_cor))

## [1] "Single-nucleus correlation - Pyramidal score vs Hcn1: 0.151936090927992"

print(paste("Single-nucleus correlation - Interneuron score vs Pvalb:", int_pvalb_cor))

## [1] "Single-nucleus correlation - Interneuron score vs Pvalb: 0.116912961262091"

# Now set up for spatial deconvolution with RCTD
# Create reference for RCTD
ref <- Reference(sn_obj[["RNA"]]$counts, sn_obj$seurat_clusters)

# Load spatial data
spatial_data <- Read10X_h5(file.path(spatial_dir, "filtered_feature_bc_matrix.h5"))
spatial_positions <- read.csv(file.path(spatial_dir, "spatial/tissue_positions_list.csv"), 
                             header = FALSE)
colnames(spatial_positions) <- c("barcode", "in_tissue", "array_row", "array_col", "x", "y")
rownames(spatial_positions) <- spatial_positions$barcode

# Create spatial RNA object
coords <- spatial_positions[, c("x", "y")]
visium_obj <- SpatialRNA(coords, spatial_data)

## Warning in SpatialRNA(coords, spatial_data): SpatialRNA: some barcodes in nUMI,
## coords, or counts were not mutually shared. Such barcodes were removed.

# Create and run RCTD
my_rctd <- create.RCTD(visium_obj, ref, max_cores = 4) # Adjust cores based on your system

## Begin: process_cell_type_info
## process_cell_type_info: number of cells in reference: 7237
## process_cell_type_info: number of genes in reference: 54232

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1667 1484 1397  599  419  274  264  170  161  132  129  100   97   78   74   72 
##   16   17   18 
##   66   29   25

## End: process_cell_type_info
## create.RCTD: getting regression differentially expressed genes: 
## get_de_genes: 0 found DE genes: 110
## get_de_genes: 1 found DE genes: 107
## get_de_genes: 2 found DE genes: 91
## get_de_genes: 3 found DE genes: 88
## get_de_genes: 4 found DE genes: 264
## get_de_genes: 5 found DE genes: 122
## get_de_genes: 6 found DE genes: 445
## get_de_genes: 7 found DE genes: 111
## get_de_genes: 8 found DE genes: 219
## get_de_genes: 9 found DE genes: 127
## get_de_genes: 10 found DE genes: 124
## get_de_genes: 11 found DE genes: 149
## get_de_genes: 12 found DE genes: 70
## get_de_genes: 13 found DE genes: 169
## get_de_genes: 14 found DE genes: 155
## get_de_genes: 15 found DE genes: 374
## get_de_genes: 16 found DE genes: 271
## get_de_genes: 17 found DE genes: 494
## get_de_genes: 18 found DE genes: 148
## get_de_genes: total DE genes: 2321
## create.RCTD: getting platform effect normalization differentially expressed genes: 
## get_de_genes: 0 found DE genes: 288
## get_de_genes: 1 found DE genes: 273
## get_de_genes: 2 found DE genes: 236
## get_de_genes: 3 found DE genes: 236
## get_de_genes: 4 found DE genes: 479
## get_de_genes: 5 found DE genes: 262
## get_de_genes: 6 found DE genes: 752
## get_de_genes: 7 found DE genes: 260
## get_de_genes: 8 found DE genes: 428
## get_de_genes: 9 found DE genes: 292
## get_de_genes: 10 found DE genes: 279
## get_de_genes: 11 found DE genes: 440
## get_de_genes: 12 found DE genes: 285
## get_de_genes: 13 found DE genes: 334
## get_de_genes: 14 found DE genes: 341
## get_de_genes: 15 found DE genes: 709
## get_de_genes: 16 found DE genes: 637
## get_de_genes: 17 found DE genes: 992
## get_de_genes: 18 found DE genes: 344
## get_de_genes: total DE genes: 4013

my_rctd <- run.RCTD(my_rctd, doublet_mode = "doublet")

## fitBulk: decomposing bulk
## chooseSigma: using initial Q_mat with sigma =  1
## Likelihood value: 2236460.73617801
## Sigma value:  0.84
## Likelihood value: 2197603.83078177
## Sigma value:  0.69
## Likelihood value: 2168217.51478491
## Sigma value:  0.61
## Likelihood value: 2156411.80067811
## Sigma value:  0.53
## Likelihood value: 2148004.34521578
## Sigma value:  0.45
## Likelihood value: 2143592.43495127
## Sigma value:  0.42
## Likelihood value: 2143089.36638795
## Sigma value:  0.41
## Likelihood value: 2143069.90306945
## Sigma value:  0.41

## [1] "gather_results: finished 1000"
## [1] "gather_results: finished 2000"

# Extract results
results <- my_rctd@results
norm_weights <- normalize_weights(results$weights)

# Get cell type proportions
pyr_props <- norm_weights[, pyr_cluster]
int_props <- norm_weights[, int_cluster]

# Load spatial data as Seurat object for visualization
vis_seurat <- Load10X_Spatial(spatial_dir)
vis_seurat <- NormalizeData(vis_seurat)

## Normalizing layer: counts

selected_spots <- rownames(norm_weights)

# Add cell type proportions to the Seurat object
vis_seurat$pyr_prop <- NA
vis_seurat$int_prop <- NA
vis_seurat@meta.data[selected_spots, "pyr_prop"] <- pyr_props
vis_seurat@meta.data[selected_spots, "int_prop"] <- int_props

# Visualize cell type proportions
p1 <- SpatialFeaturePlot(vis_seurat, features = "pyr_prop", pt.size.factor = 2) +
     ggtitle("Pyramidal Neuron Proportion")
p2 <- SpatialFeaturePlot(vis_seurat, features = "int_prop", pt.size.factor = 2) +
     ggtitle("Interneuron Proportion")
print(p1 + p2)

Spearman to test for correlation

# Add gene expression for Hcn1 and Pvalb to the spatial object
DefaultAssay(vis_seurat) <- "Spatial"

# Get normalized expression values for the markers
hcn1_spatial <- GetAssayData(vis_seurat, slot = "data")[theta_marker, ]
pvalb_spatial <- GetAssayData(vis_seurat, slot = "data")[gamma_marker, ]

# Add to metadata for correlation analysis
vis_seurat$Hcn1_expr <- NA
vis_seurat$Pvalb_expr <- NA
vis_seurat@meta.data[names(hcn1_spatial), "Hcn1_expr"] <- hcn1_spatial
vis_seurat@meta.data[names(pvalb_spatial), "Pvalb_expr"] <- pvalb_spatial

# Identify spots that have both cell type proportions and gene expression data
selected_spots_with_data <- intersect(selected_spots, rownames(vis_seurat@meta.data))

# Calculate Spearman correlations
pyr_hcn1_cor <- cor(
  vis_seurat@meta.data[selected_spots_with_data, "pyr_prop"], 
  vis_seurat@meta.data[selected_spots_with_data, "Hcn1_expr"], 
  method = "spearman", 
  use = "complete.obs"
)

int_pvalb_cor <- cor(
  vis_seurat@meta.data[selected_spots_with_data, "int_prop"], 
  vis_seurat@meta.data[selected_spots_with_data, "Pvalb_expr"], 
  method = "spearman", 
  use = "complete.obs"
)

# Print the results
print(paste("Spatial correlation - Pyramidal cells vs Hcn1:", pyr_hcn1_cor))

## [1] "Spatial correlation - Pyramidal cells vs Hcn1: 0.231929467295915"

print(paste("Spatial correlation - Interneurons vs Pvalb:", int_pvalb_cor))

## [1] "Spatial correlation - Interneurons vs Pvalb: 0.195541555842559"

Data Integration Analysis

2025-05-13