Step 1. Load and filter snRNA-seq data. Cluster based on transcription and plot on UMAP by sample origin.

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.4.1 but the current version is
## 4.4.2; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(Matrix)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Define path
data_dir <- "/Users/landonwellendorf/Desktop/STAT530/Final/GSE223066_split-pipe_outputs"

# Load files
count_matrix <- readMM(file.path(data_dir, "DGE.mtx"))
genes <- read.csv(file.path(data_dir, "all_genes.csv"))
cell_metadata <- read.csv(file.path(data_dir, "cell_metadata.csv"))
rownames(count_matrix) <- cell_metadata$bc_wells
colnames(count_matrix) <- make.unique(genes$gene_name)  # Ensure gene names are unique

# Transpose matrix
count_matrix_t <- t(count_matrix)

# Create initial Seurat object
seurat_obj <- CreateSeuratObject(counts = count_matrix_t, meta.data = cell_metadata)
## Warning: Data is of class dgTMatrix. Coercing to dgCMatrix.
cat("Number of cells before filtering:", ncol(seurat_obj), "\n")
## Number of cells before filtering: 92384
# Now apply filtering to seurat_obj
seurat_obj <- subset(seurat_obj, subset = sample %in% c("HC1", "SOR1"))
cat("After sample filter (HC1 + SOR1):", ncol(seurat_obj), "cells\n")
## After sample filter (HC1 + SOR1): 16300 cells
# Plot RNA Counts for all cells to determine multiplet cutoff
Idents(seurat_obj) <- "all cells"

VlnPlot(seurat_obj, features = "nCount_RNA", pt.size = 0.1) +
  geom_hline(yintercept = 35000, linetype = "dashed", color = "red", linewidth = 0.5) +
  NoLegend()
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Filter high RNA count cells
seurat_obj <- subset(seurat_obj, subset = nCount_RNA < 35000)
cat("After RNA count filter (< 35,000):", ncol(seurat_obj), "cells\n")
## After RNA count filter (< 35,000): 16133 cells
# Add mitochondrial percentage
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^mt-")

# Filter high mitochondrial percentage cells
seurat_obj <- subset(seurat_obj, subset = percent.mt < 10)
cat("After mitochondrial filter (< 10%):", ncol(seurat_obj), "cells\n")
## After mitochondrial filter (< 10%): 15856 cells
# Preprocessing for filtered object
seurat_obj <- NormalizeData(seurat_obj)
## Normalizing layer: counts
seurat_obj <- FindVariableFeatures(seurat_obj)
## Finding variable features for layer counts
seurat_obj <- ScaleData(seurat_obj)
## Centering and scaling data matrix
seurat_obj <- RunPCA(seurat_obj)
## PC_ 1 
## Positive:  Npas3, Sox6, Atp1a2, Gm973, Slc1a3, Plpp3, Nwd1, Rmst, Dock1, Apoe 
##     Rfx4, Rgs20, Slc39a12, Spag16, Dnah11, Msi2, Gli3, Prdm16, Clu, Lama2 
##     Pla2g7, Farp1, Rorb, Dnah7b, Ttll8, Dnah12, Slc1a2, Gm4876, AC154683.1, Gm6145 
## Negative:  Epha6, Frmpd4, Fam19a1, Ryr3, Stxbp5l, Nrxn3, Cntnap2, Schip1, Cacna2d3, Hs6st3 
##     Rian, Pde10a, Cadps, Chrm3, Kcnh7, 9530059O14Rik, Zfp385b, Nkain3, Trpc4, Hs3st4 
##     Khdrbs2, Zmat4, Galntl6, Sh3gl2, Prkcb, Hcn1, Cntnap5c, Pde1a, Zfp804a, Cntnap5a 
## PC_ 2 
## Positive:  Agbl4, Cntnap2, Tenm4, Khdrbs3, Etl4, Chrm3, Nrxn3, Hcn1, Rian, Kcnq5 
##     Pde1a, Ppp2r2b, Dmd, Galntl6, Sv2b, Cntnap5c, Zfp385b, Nell1, Kcnc2, Runx1t1 
##     Hs6st3, Pde10a, Epha6, Ryr3, Ptprg, Grip1, AC129186.1, Prkcb, Pde4d, Kcnh7 
## Negative:  St18, Prr5l, Plp1, Mbp, Rnf220, Mobp, Mag, Gm16168, Sec14l5, Pde8a 
##     Trf, A230001M10Rik, Aspa, Plekhh1, Fa2h, Bcas1, Mog, Zfp536, Stxbp6, Erbin 
##     C030029H02Rik, Ugt8a, Gatm, Enpp2, Gm4593, Grm3, Plxdc2, Tspan2, AC110241.2, Phldb1 
## PC_ 3 
## Positive:  Dnah11, Dnah12, 1700007G11Rik, Hydin, Tmem212, Dnah6, Spag17, Cdhr3, Ccdc180, 3300002A11Rik 
##     Cfap43, Spag16, Ccdc146, Ak9, Wdr49, Lrrc36, Ak7, Frmpd2, Cfap61, C230072F16Rik 
##     4930519F16Rik, Rgs22, Unc5cl, Wdr63, Ttc21a, Cfap65, Cfap46, 2010001K21Rik, Crocc2, Dnah10 
## Negative:  Ntm, Gpc5, Ptprt, Farp1, Gm3764, Rgs20, Slc1a2, Rorb, Atp1a2, Luzp2 
##     Plpp3, Ppp2r2b, Bcan, Xylt1, Gpc6, Mertk, Sash1, Itpr2, Prdm16, Bmpr1b 
##     Nckap5, Gm6145, Npas3, Gli2, Arhgap26, Cspg5, Gldc, Grin2c, Sox5, Ptn 
## PC_ 4 
## Positive:  Trpm3, Slc4a4, Glis3, Pitpnc1, Adarb2, Prex2, Fam19a2, Shisa9, Slc1a2, Ptprz1 
##     Atp1a2, Pdzd2, Sparcl1, Cntnap5a, Rgs20, Rorb, Nwd1, Trpc6, Gm6145, Prdm16 
##     Cdh9, Cntn5, Gm3764, Gli3, Gli2, Maml3, Slc1a3, Bmpr1b, Sfxn5, Mertk 
## Negative:  Pde4b, Prr5l, Ano4, Plp1, Rnf220, Enpp2, Gm16168, Mag, Mobp, Sec14l5 
##     Mbp, Pde8a, A230001M10Rik, Phldb1, Plekhh1, Trf, Ctnna3, Bcas1, Fa2h, St18 
##     Tmem178b, Tspan2, Aspa, Sytl2, Gm4593, Ugt8a, Sox2ot, Mog, Elmo1, Pex5l 
## PC_ 5 
## Positive:  Nxph1, Dpp10, Kcnmb2, Kcnip1, Rbms3, Alk, Gad2, Dlx6os1, Maf, Btbd11 
##     Grik1, Sgcz, Nhs, Col19a1, Cntnap4, Vwc2, Cit, Gad1, Pcdh15, Lrrtm3 
##     Gria4, Gm45455, Unc5d, Erbb4, Cux2, Lhfpl3, C130073E24Rik, Adarb2, Olfm2, Ptprm 
## Negative:  Gm10754, 4921539H07Rik, Sv2b, AC129186.1, Fam19a1, Gm2115, Khdrbs3, Pde1a, Raver2, Ryr3 
##     Pex5l, Sema3e, Gm2164, 2310002F09Rik, Dkk3, Cpne7, Chrm3, Nell1, Hs3st4, Thsd4 
##     Gpc5, Rasgrp1, Ccdc88c, Cachd1, Hs6st3, Man1a, AC129186.2, Mdga1, Cpne8, Dmd
seurat_obj <- RunUMAP(seurat_obj, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:19:08 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:19:08 Read 15856 rows and found 20 numeric columns
## 14:19:08 Using Annoy for neighbor search, n_neighbors = 30
## 14:19:08 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:19:11 Writing NN index file to temp file /var/folders/3_/kq6fv1_d4gl_n25bl_c016gc0000gn/T//Rtmpy5LsFn/filee2c0752a3138
## 14:19:11 Searching Annoy index using 1 thread, search_k = 3000
## 14:19:17 Annoy recall = 100%
## 14:19:17 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:19:18 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:19:19 Commencing optimization for 200 epochs, with 700702 positive edges
## 14:19:19 Using rng type: pcg
## 14:19:31 Optimization finished
DimPlot(seurat_obj, reduction = "umap", group.by = "sample", pt.size = 0.1)

Step 2. Identify the cluster where theta and gamma genes are most differentially expressed between HC1 and SOR1, ranked by p-value.

library(ggplot2)

# Initialize markers
theta_marker <- "Hcn1"
gamma_marker <- "Pvalb"

# Cluster the data
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
seurat_obj <- FindClusters(seurat_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 15856
## Number of edges: 591493
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9357
## Number of communities: 23
## Elapsed time: 3 seconds
# Visualize clusters
DimPlot(seurat_obj, reduction = "umap", label = TRUE)

# Create data frames to store p-values for each cluster for both markers
clusters <- unique(seurat_obj$seurat_clusters)
theta_p_values <- data.frame(cluster = clusters, p_value = NA, mean_diff = NA)
gamma_p_values <- data.frame(cluster = clusters, p_value = NA, mean_diff = NA)

# For each cluster, calculate differential expression of markers between HC1 and SOR1
for (cluster in clusters) {
  # Subset the data for this cluster
  cluster_cells <- subset(seurat_obj, subset = seurat_clusters == cluster)
  
  # THETA MARKER ANALYSIS
  # Check if the theta marker exists in the dataset
  if(theta_marker %in% rownames(GetAssayData(cluster_cells))) {
    # Get expression values for each sample within this cluster
    hc1_theta_expr <- GetAssayData(subset(cluster_cells, subset = sample == "HC1"), 
                                  slot = "data")[theta_marker,]
    sor1_theta_expr <- GetAssayData(subset(cluster_cells, subset = sample == "SOR1"), 
                                   slot = "data")[theta_marker,]
    
    # Only perform test if we have cells in both conditions
    if(length(hc1_theta_expr) > 0 && length(sor1_theta_expr) > 0) {
      # Perform Wilcoxon test
      theta_test_result <- wilcox.test(hc1_theta_expr, sor1_theta_expr)
      
      # Calculate mean difference (SOR1 - HC1) to determine direction of change
      theta_mean_diff <- mean(sor1_theta_expr) - mean(hc1_theta_expr)
      
      # Store p-value and mean difference
      theta_p_values$p_value[theta_p_values$cluster == cluster] <- theta_test_result$p.value
      theta_p_values$mean_diff[theta_p_values$cluster == cluster] <- theta_mean_diff
    }
  }
  
  # GAMMA MARKER ANALYSIS
  # Check if the gamma marker exists in the dataset
  if(gamma_marker %in% rownames(GetAssayData(cluster_cells))) {
    # Get expression values for each sample within this cluster
    hc1_gamma_expr <- GetAssayData(subset(cluster_cells, subset = sample == "HC1"), 
                                  slot = "data")[gamma_marker,]
    sor1_gamma_expr <- GetAssayData(subset(cluster_cells, subset = sample == "SOR1"), 
                                   slot = "data")[gamma_marker,]
    
    # Only perform test if we have cells in both conditions
    if(length(hc1_gamma_expr) > 0 && length(sor1_gamma_expr) > 0) {
      # Perform Wilcoxon test
      gamma_test_result <- wilcox.test(hc1_gamma_expr, sor1_gamma_expr)
      
      # Calculate mean difference (SOR1 - HC1) to determine direction of change
      gamma_mean_diff <- mean(sor1_gamma_expr) - mean(hc1_gamma_expr)
      
      # Store p-value and mean difference
      gamma_p_values$p_value[gamma_p_values$cluster == cluster] <- gamma_test_result$p.value
      gamma_p_values$mean_diff[gamma_p_values$cluster == cluster] <- gamma_mean_diff
    }
  }
}
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Apply FDR correction for multiple testing
theta_p_values$p_adj <- p.adjust(theta_p_values$p_value, method = "BH")
gamma_p_values$p_adj <- p.adjust(gamma_p_values$p_value, method = "BH")

# Display results
theta_p_values <- theta_p_values[order(theta_p_values$p_value),]
gamma_p_values <- gamma_p_values[order(gamma_p_values$p_value),]

cat("Theta marker (", theta_marker, ") differential expression by cluster:\n")
## Theta marker ( Hcn1 ) differential expression by cluster:
print(theta_p_values)
##    cluster      p_value    mean_diff        p_adj
## 2        4 2.818402e-15 -0.397941308 6.482324e-14
## 15      11 3.576172e-05 -0.505489907 4.112597e-04
## 20       1 3.160348e-04 -0.083186800 2.422934e-03
## 3       10 1.527639e-02 -0.260967055 7.128315e-02
## 5       17 1.834576e-02 -0.350952320 7.128315e-02
## 1        7 1.859560e-02  0.043809405 7.128315e-02
## 7       14 3.097693e-02 -0.204539766 1.017813e-01
## 8        6 9.128695e-02 -0.157650919 2.624500e-01
## 16      15 1.188421e-01 -0.004467027 3.037077e-01
## 4       21 1.685371e-01 -0.270147913 3.876354e-01
## 6       12 1.883451e-01 -0.032590714 3.938125e-01
## 11      22 4.111876e-01 -0.044602632 7.881095e-01
## 22       9 4.780596e-01  0.115085288 7.985701e-01
## 9       20 4.860862e-01 -0.270679621 7.985701e-01
## 17       3 5.810702e-01 -0.164734075 8.909742e-01
## 21      19 6.609866e-01 -0.004165118 9.460133e-01
## 23       2 7.255704e-01  0.029527531 9.460133e-01
## 14      16 7.605670e-01 -0.144554806 9.460133e-01
## 18       5 7.814892e-01  0.033107361 9.460133e-01
## 13      13 8.543277e-01  0.047799631 9.659993e-01
## 12      18 8.942526e-01  0.024370866 9.659993e-01
## 10       0 9.338391e-01  0.035820296 9.659993e-01
## 19       8 9.659993e-01  0.040135172 9.659993e-01
cat("\nGamma marker (", gamma_marker, ") differential expression by cluster:\n")
## 
## Gamma marker ( Pvalb ) differential expression by cluster:
print(gamma_p_values)
##    cluster    p_value     mean_diff     p_adj
## 1        7 0.02056889 -0.0087766240 0.1439822
## 4       21 0.22220591 -0.0200873429 0.6025724
## 2        4 0.28800658 -0.0004761319 0.6025724
## 19       8 0.44362129 -0.0049749981 0.6025724
## 12      18 0.49868850  0.0275318052 0.6025724
## 10       0 0.51649067 -0.0006921831 0.6025724
## 8        6 0.66479933 -0.0015378089 0.6647993
## 3       10        NaN  0.0000000000       NaN
## 5       17        NaN  0.0000000000       NaN
## 6       12        NaN  0.0000000000       NaN
## 7       14        NaN  0.0000000000       NaN
## 9       20        NaN  0.0000000000       NaN
## 11      22        NaN  0.0000000000       NaN
## 13      13        NaN  0.0000000000       NaN
## 14      16        NaN  0.0000000000       NaN
## 15      11        NaN  0.0000000000       NaN
## 16      15        NaN  0.0000000000       NaN
## 17       3        NaN  0.0000000000       NaN
## 18       5        NaN  0.0000000000       NaN
## 20       1        NaN  0.0000000000       NaN
## 21      19        NaN  0.0000000000       NaN
## 22       9        NaN  0.0000000000       NaN
## 23       2        NaN  0.0000000000       NaN
# Identify the clusters with the lowest p-values
best_theta_cluster <- theta_p_values$cluster[1]
best_gamma_cluster <- gamma_p_values$cluster[1]

# Create new columns in metadata to highlight the best clusters
seurat_obj$theta_highlight <- ifelse(seurat_obj$seurat_clusters == best_theta_cluster, 
                                   "Top Theta Cluster", "Other Clusters")
seurat_obj$gamma_highlight <- ifelse(seurat_obj$seurat_clusters == best_gamma_cluster, 
                                   "Top Gamma Cluster", "Other Clusters")

# Create a combined highlighting scheme
seurat_obj$oscillation_clusters <- "Other Clusters"
seurat_obj$oscillation_clusters[seurat_obj$seurat_clusters == best_theta_cluster] <- "Top Theta Cluster"
seurat_obj$oscillation_clusters[seurat_obj$seurat_clusters == best_gamma_cluster] <- "Top Gamma Cluster"

# Plot UMAP with the best theta cluster highlighted
DimPlot(seurat_obj, 
              reduction = "umap", 
              group.by = "theta_highlight",
              cols = c("Top Theta Cluster" = "red", "Other Clusters" = "grey"),
              pt.size = 0.5) +
  ggtitle(paste0("Top Theta Cluster: ", best_theta_cluster)) +
  theme(plot.title = element_text(size = 10))

# Plot UMAP with the best gamma cluster highlighted
DimPlot(seurat_obj, 
              reduction = "umap", 
              group.by = "gamma_highlight",
              cols = c("Top Gamma Cluster" = "blue", "Other Clusters" = "grey"),
              pt.size = 0.5) +
  ggtitle(paste0("Top Gamma Cluster: ", best_gamma_cluster)) +
  theme(plot.title = element_text(size = 10))

# Plot with both clusters highlighted
DimPlot(seurat_obj, 
              reduction = "umap", 
              group.by = "oscillation_clusters",
              cols = c("Top Theta Cluster" = "red", 
                      "Top Gamma Cluster" = "blue", 
                      "Other Clusters" = "grey"),
              pt.size = 0.5) +
  ggtitle("Top Oscillation Marker Clusters") +
  theme(plot.title = element_text(size = 10))

  1. Use marker genes to determine which cell types theta and gamma wave transcription is most differentially expressed in
# Narrowed down, most informative CNS cell type markers
canonical_markers <- c(
  # Neurons (general)
  "Rbfox3", "Snap25", "Syt1", "Eno2",
  
  # Glutamatergic neurons
  "Slc17a7", "Slc17a6", "Grin1",
  
  # GABAergic neurons
  "Gad1", "Gad2",
  
  # Astrocytes
  "Gfap", "Aqp4", "Slc1a2", "Slc1a3"
)

# Keep only markers present in the dataset
canonical_markers <- canonical_markers[canonical_markers %in% rownames(seurat_obj)]

# DotPlot across all clusters
DotPlot(seurat_obj, features = canonical_markers, group.by = "seurat_clusters") +
  RotatedAxis() +
  ggtitle("Canonical Cell Type Marker Expression by Cluster") +
  theme(plot.title = element_text(hjust = 0.5))

# Define cluster IDs for theta and gamma oscillation neuronal sub types
theta_cluster <- 4
gamma_cluster <- 7

# Subset Seurat object for theta and gamma clusters
theta_gamma_cells <- WhichCells(seurat_obj, idents = c(theta_cluster, gamma_cluster))
seurat_theta_gamma <- subset(seurat_obj, cells = theta_gamma_cells)

# Add a new cluster label: "theta" or "gamma"
seurat_theta_gamma$theta_gamma_label <- ifelse(
  Idents(seurat_theta_gamma) == theta_cluster, "theta", "gamma"
)

# Set new identities for plotting
Idents(seurat_theta_gamma) <- "theta_gamma_label"

# Plot canonical marker expression in theta vs gamma clusters
DotPlot(seurat_theta_gamma, features = canonical_markers) +
  RotatedAxis() +
  ggtitle("Canonical Marker Expression: Theta vs Gamma Clusters") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Scaling data with a low number of groups may produce misleading
## results