- Use scanpy to normalize the spots in sample GSM6939158_Sample5HC and then cluster based on transcription, such that brain subregions separate, then visualize on the tissue section.
In [17]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
In [18]:
# Load Visium data for the specified sample
adata = sc.read_visium("/Users/landonwellendorf/Desktop/STAT530/Final/")
# Preprocessing
sc.pp.filter_cells(adata, min_counts=200) # Filter low-quality spots
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4) # Normalize counts per spot
sc.pp.log1p(adata) # Log transformation
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
adata = adata[:, adata.var.highly_variable] # Keep only variable genes
# Dimensionality reduction
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=10) # Build nearest-neighbor graph
# Clustering using Leiden algorithm
sc.tl.leiden(adata, resolution=0.8) # Adjust resolution to get subregion separation
# Visualize clusters on UMAP
sc.tl.umap(adata)
sc.pl.umap(adata, color="leiden", title="UMAP of Clusters")
# Spatial visualization of clusters
sc.pl.spatial(adata, color="leiden", title="Spatial Clustering of Hippocampus")
plt.show()
/var/folders/3_/kq6fv1_d4gl_n25bl_c016gc0000gn/T/ipykernel_1548/297046353.py:2: FutureWarning: Use `squidpy.read.visium` instead. adata = sc.read_visium("/Users/landonwellendorf/Desktop/STAT530/Final/") /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/anndata/_core/anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/anndata/_core/anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/anndata/_core/anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/anndata/_core/anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/scanpy/preprocessing/_pca/__init__.py:383: ImplicitModificationWarning: Setting element `.obsm['X_pca']` of view, initializing view as actual. adata.obsm[key_obsm] = X_pca /opt/anaconda3/envs/stat530hw5/lib/python3.12/site-packages/anndata/_core/anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /var/folders/3_/kq6fv1_d4gl_n25bl_c016gc0000gn/T/ipykernel_1548/297046353.py:17: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg. To achieve the future defaults please pass: flavor="igraph" and n_iterations=2. directed must also be False to work with igraph's implementation. sc.tl.leiden(adata, resolution=0.8) # Adjust resolution to get subregion separation
/var/folders/3_/kq6fv1_d4gl_n25bl_c016gc0000gn/T/ipykernel_1548/297046353.py:24: FutureWarning: Use `squidpy.pl.spatial_scatter` instead. sc.pl.spatial(adata, color="leiden", title="Spatial Clustering of Hippocampus")
In [15]:
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
# Load Visium data
visium_path = "/Users/landonwellendorf/Desktop/STAT530/Final/"
adata = sc.read_visium(visium_path)
# Basic preprocessing
sc.pp.filter_cells(adata, min_counts=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Key markers identified from scRNA-seq analysis
# Primary markers (single gene approach)
theta_marker = "Hcn1" # Primary theta marker
gamma_marker = "Pvalb" # Primary gamma marker
# Define marker sets (combined markers approach)
theta_markers = ["Hcn1", "Slc17a7", "Grin1", "Rbfox3", "Snap25"]
gamma_markers = ["Pvalb", "Gad1", "Gad2"]
# Filter marker sets to include only available genes
theta_markers = [gene for gene in theta_markers if gene in adata.var_names]
gamma_markers = [gene for gene in gamma_markers if gene in adata.var_names]
print(f"Found theta markers: {theta_markers}")
print(f"Found gamma markers: {gamma_markers}")
# Approach 1: Single marker analysis
if theta_marker in adata.var_names and gamma_marker in adata.var_names:
# Calculate ratio of primary markers
adata.obs['theta_gamma_ratio'] = adata[:, theta_marker].X.toarray().flatten() / \
(adata[:, gamma_marker].X.toarray().flatten() + 0.1)
# Z-score transformation for better visualization
adata.obs['theta_gamma_ratio_z'] = zscore(adata.obs['theta_gamma_ratio'])
# Identify spots with high differential expression
# High theta spots (top 10%)
theta_threshold = np.percentile(adata.obs['theta_gamma_ratio'], 90)
adata.obs['high_theta_spots'] = adata.obs['theta_gamma_ratio'] > theta_threshold
# High gamma spots (bottom 10%)
gamma_threshold = np.percentile(adata.obs['theta_gamma_ratio'], 10)
adata.obs['high_gamma_spots'] = adata.obs['theta_gamma_ratio'] < gamma_threshold
# Visualize primary marker expression
# Plot theta marker expression
plt.figure(figsize=(8, 8))
sc.pl.spatial(adata, color=theta_marker, title=f'Expression of {theta_marker}')
plt.tight_layout()
plt.close()
hcn1_values = adata[:, 'Hcn1'].X.toarray().flatten()
max_idx = np.argmax(hcn1_values)
outlier_spot = adata.obs_names[max_idx]
print(f"Removing outlier spot {outlier_spot} with Hcn1 expression value: {hcn1_values[max_idx]}")
# Create a filtered dataset excluding the outlier
adata_filtered = adata[adata.obs_names != outlier_spot].copy()
# Replot the theta marker expression without the outlier
plt.figure(figsize=(8, 8))
sc.pl.spatial(adata_filtered, color='Hcn1', title='Expression of Hcn1 (Outlier Removed)')
plt.tight_layout()
# Visualize gamma marker expression
plt.figure(figsize=(8, 8))
sc.pl.spatial(adata, color=gamma_marker, title=f'Expression of {gamma_marker}')
plt.tight_layout()
plt.close()
Found theta markers: ['Hcn1', 'Slc17a7', 'Grin1', 'Rbfox3', 'Snap25'] Found gamma markers: ['Pvalb', 'Gad1', 'Gad2']
<Figure size 800x800 with 0 Axes>
Removing outlier spot CACAGTCCCGCTTCGC-1 with Hcn1 expression value: 2.341661214828491
<Figure size 800x800 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 800x800 with 0 Axes>
In [16]:
# Approach 2: Combined marker analysis (generally more robust)
# Define marker sets from your scRNA-seq analysis
theta_markers = ["Hcn1", "Slc17a7", "Grin1", "Rbfox3", "Snap25", "Syt1"]
gamma_markers = ["Pvalb", "Gad1", "Gad2", "Kcnc1"]
# Function to get expression for a gene, summing gene expression for each spot
def get_gene_expression(adata, gene_name):
# Find all occurrences of the gene (exact match)
matching_indices = [i for i, name in enumerate(adata.var_names) if name == gene_name]
if not matching_indices:
print(f"Warning: Gene {gene_name} not found")
return np.zeros(adata.shape[0])
# Sum expression across all matching occurrences
expr = np.zeros(adata.shape[0])
for idx in matching_indices:
expr += adata.X[:, idx].toarray().flatten()
return expr
# Calculate marker scores by manual averaging
print("Calculating multi-marker scores with manual approach...")
# Theta score calculation
theta_expr = np.zeros(adata_filtered.shape[0])
theta_count = 0
for gene in theta_markers:
gene_expr = get_gene_expression(adata_filtered, gene)
if np.sum(gene_expr) > 0: # Only include genes that were found
theta_expr += gene_expr
theta_count += 1
if theta_count > 0:
theta_expr /= theta_count
print(f"Averaged {theta_count} theta markers")
else:
print("Warning: No theta markers found")
# Gamma score calculation
gamma_expr = np.zeros(adata_filtered.shape[0])
gamma_count = 0
for gene in gamma_markers:
gene_expr = get_gene_expression(adata_filtered, gene)
if np.sum(gene_expr) > 0: # Only include genes that were found
gamma_expr += gene_expr
gamma_count += 1
if gamma_count > 0:
gamma_expr /= gamma_count
print(f"Averaged {gamma_count} gamma markers")
else:
print("Warning: No gamma markers found")
# Store the scores
adata_filtered.obs['theta_score'] = theta_expr
adata_filtered.obs['gamma_score'] = gamma_expr
# Calculate theta/gamma ratio
adata_filtered.obs['theta_gamma_score_ratio'] = theta_expr / (gamma_expr + 0.1) # Add small constant to avoid division by zero
adata_filtered.obs['theta_gamma_score_ratio_z'] = zscore(adata_filtered.obs['theta_gamma_score_ratio'])
# Generate visualizations
# Visualize the multi-marker theta score
plt.figure(figsize=(8, 8))
sc.pl.spatial(adata_filtered, color='theta_score', title='Theta Score (Multi-marker)')
plt.tight_layout()
plt.close()
# Visualize the multi-marker gamma score
plt.figure(figsize=(8, 8))
sc.pl.spatial(adata_filtered, color='gamma_score', title='Gamma Score (Multi-marker)')
plt.tight_layout()
plt.close()
print("Multi-marker analysis completed")
Calculating multi-marker scores with manual approach... Averaged 6 theta markers Averaged 4 gamma markers
<Figure size 800x800 with 0 Axes>
<Figure size 800x800 with 0 Axes>
Multi-marker analysis completed