Extract cell-specific cosine similarity matrix from latent representations for downstream analysis
import warnings
warnings.filterwarnings("ignore")
from model.train import *
from model.utils import *
Load the input data and build the dataset.
# --- Run configuration ------------------------------------------------------
# Every input file below is read from f'{data_path}{dataset}/'.
data_path = './SVC/'
dataset = 'data/xenium_mouse_brain'
device = 'cuda:1'  # torch device string, reused for the checkpoint and the model

# --- Low-resolution cell-by-gene maps ---------------------------------------
# Axis 1 is reported as the number of genes further down; the remaining
# trailing axes are presumably the spatial map of each cell (TODO confirm).
train_image = np.load(
    f'{data_path}{dataset}/cell_gene_map_low_res_rep1.npz'
)["image"]
print("shape of total map of low resolution:", train_image.shape)

# Median over cells of each cell's total signal (sum over axes 1, 2 and 3).
cell_median = np.median(train_image.sum(axis=(1, 2, 3)))

# --- Per-cell covariates ----------------------------------------------------
location = np.load(f"{data_path}{dataset}/cell_location_rep1.npy")
cell_morphology = np.load(
    f"{data_path}{dataset}/cell_morphology_rep1.npz"
)["image"]
nuclear_morphology = np.load(
    f"{data_path}{dataset}/nuclear_morphology_rep1.npz"
)["image"]
print(cell_morphology.shape)
print(nuclear_morphology.shape)

# --- Cell-type labels, one-hot encoded --------------------------------------
cell_type = np.load(f'{data_path}{dataset}/pred_cell_type_rep1.npy')
print("shape of cell_type:", cell_type.shape)
# np.unique returns the sorted distinct labels plus, for every cell, the index
# of its label in that sorted array; indexing an identity matrix with those
# indices yields one one-hot row per cell.
cell_type_uni, cell_type_indices = np.unique(cell_type, return_inverse=True)
cell_type_label = np.eye(cell_type_uni.size, dtype=int)[cell_type_indices]
print("shape of cell_type_label:", cell_type_label.shape)

# --- Dataset ----------------------------------------------------------------
train_dataset = SVC_Dataset(
    data_ori=train_image,
    location=location,
    cell_morphology_vec=cell_morphology,
    nuclear_morphology_vec=nuclear_morphology,
    identity_vec=cell_type_label,
)
print(
    "number of training cells:", len(train_dataset),
    ', number of genes:', train_image.shape[1],
)

# Same expression as `cell_median`, evaluated again after the dataset has been
# constructed.
cell_median_train = np.median(train_image.sum(axis=(1, 2, 3)))

# --- Gene metadata ----------------------------------------------------------
# NOTE: despite its name, `read_dir` is the path of the gene2vec weight *file*.
read_dir = f'{data_path}{dataset}/gene2vec_weight_xenium_mouse_brain.npy'
gene_names = np.loadtxt(f'{data_path}{dataset}/gene_names.txt', dtype=str).tolist()
# np.save(f'{data_path}output/xenium_mouse_brain/gene_count_sum_rep1.npy',train_image.sum((2,3)))
shape of total map of low resolution: (93447, 228, 12, 12)
(93447, 48, 48)
(93447, 48, 48)
shape of cell_type: (93447,)
shape of cell_type_label: (93447, 42)
number of training cells: 93447 , number of genes: 228
Load the trained model.
# --- Restore the trained SVC model ------------------------------------------
# Absolute, machine-specific checkpoint directory.
ckpt_dir = "/net/mulan/home/huiwann/spatial/SVC/checkpoints/"
# NOTE(review): depending on the installed torch version, torch.load may
# unpickle arbitrary objects -- only load checkpoints from a trusted source.
ckpt = torch.load(f"{ckpt_dir}SVC_xenium_mouse_brain.pth", map_location=device)

# Remove every 'module.' occurrence from the parameter names (presumably the
# prefix added by a DataParallel-style wrapper at training time -- TODO
# confirm) so that they match the bare model built below.
new_ckpt = {
    name.replace('module.', ''): tensor
    for name, tensor in ckpt['model_state_dict'].items()
}

# Gene embedding matrix, one row per gene.
gene2vec_weight = torch.from_numpy(np.load(read_dir)).float()  ##n_gene * 200
print("shape of gene2vec_weight", gene2vec_weight.shape)

model = SVC(
    gene2vec_weight=gene2vec_weight,
    cell_identity_dim=cell_type_label.shape[1],
).to(device)

# Overwrite each entry of the model's own state dict with the checkpoint
# value. Extra checkpoint keys are ignored; a key the model expects but the
# checkpoint lacks raises KeyError here.
model_dict = model.state_dict()
for param_name in model_dict:
    model_dict[param_name] = new_ckpt[param_name]
model.load_state_dict(model_dict)
model.eval()

# shuffle=False keeps the batches in dataset order, so a given batch index
# always refers to the same cells.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4,
)
shape of gene2vec_weight torch.Size([228, 200])
For storage efficiency, cosine similarity values are quantized and saved as uint16 rather than as floating-point numbers. Specifically, values are linearly mapped from approximately [-1, 1] to [0, 20000] using $$ \mathrm{scaled} = \mathrm{cosine\ similarity} \times 10000 + 10000 $$ and then cast to uint16. To recover the similarity downstream, invert the mapping: $$ \mathrm{cosine\ similarity} = (\mathrm{scaled} - 10000) / 10000 $$
Because tissue-level subcellular spatial transcriptomics (ST) data are large, we run the extraction in chunks of batches, saving each chunk to its own file, to avoid running out of memory.
# Extract the cosine-similarity output in three chunks of DataLoader batches,
# writing each chunk to its own .npz file.
# Each entry is (part number, batch_start, batch_end); batch_end=None is passed
# for the last part, which presumably means "run to the final batch".
# NOTE(review): these ranges are gap-free only if batch_end is inclusive. If
# save_cosine_similarity_subset treats batch_end as exclusive, batches 240 and
# 480 are never written -- confirm against its implementation.
chunk_ranges = (
    (1, 0, 240),
    (2, 241, 480),
    (3, 481, None),
)
for part_id, first_batch, last_batch in chunk_ranges:
    save_cosine_similarity_subset(
        model=model,
        train_loader=train_loader,
        device=device,
        cell_median=cell_median,
        out_file=f"{data_path}/output/xenium_mouse_brain/cosine_simi_scaling_rep1_part_{part_id}.npz",
        batch_start=first_batch,
        batch_end=last_batch,
    )