Extract the cell-specific cosine similarity matrices from the latent representations for downstream analysis.

import warnings
warnings.filterwarnings("ignore")
from model.train import *
from model.utils import *

Load the input data and build the dataset.

# ---------------------------------------------------------------------------
# Paths and compute device used by the rest of the notebook.
# ---------------------------------------------------------------------------
data_path = './SVC/'
dataset = 'data/xenium_mouse_brain'
device = 'cuda:1'

# Low-resolution gene maps, stored under the "image" key of the .npz archive.
# The prints below treat axis 0 as cells and axis 1 as genes.
train_image = np.load(f'{data_path}{dataset}/cell_gene_map_low_res_rep1.npz')["image"]
print("shape of total map of low resolution:", train_image.shape)

# Median over axis 0 of the per-entry totals (sum over axes 1, 2 and 3).
per_cell_total = train_image.sum(axis=(1, 2, 3))
cell_median = np.median(per_cell_total)

# Cell locations.
location = np.load(f"{data_path}{dataset}/cell_location_rep1.npy")

# Cell and nuclear morphology images, also stored under the "image" key.
cell_morphology = np.load(f"{data_path}{dataset}/cell_morphology_rep1.npz")["image"]
nuclear_morphology = np.load(f"{data_path}{dataset}/nuclear_morphology_rep1.npz")["image"]
print(cell_morphology.shape)
print(nuclear_morphology.shape)

# Predicted cell-type label for each cell.
cell_type = np.load(f'{data_path}{dataset}/pred_cell_type_rep1.npy')
print("shape of cell_type:", cell_type.shape)

# One-hot encode the labels: look up each cell's row in an identity matrix
# whose size is the number of distinct cell types.
cell_type_uni, cell_type_indices = np.unique(cell_type, return_inverse=True)
n_cell_types = len(cell_type_uni)
one_hot_basis = np.eye(n_cell_types, dtype=int)
cell_type_label = one_hot_basis[cell_type_indices]
print("shape of cell_type_label:", cell_type_label.shape)

# Bundle every per-cell input into the dataset consumed by the model.
train_dataset = SVC_Dataset(
    data_ori=train_image,
    location=location,
    cell_morphology_vec=cell_morphology,
    nuclear_morphology_vec=nuclear_morphology,
    identity_vec=cell_type_label,
)
print("number of training cells:", len(train_dataset),', number of genes:', train_image.shape[1])

# Same reduction as `cell_median`, evaluated again after the dataset is built.
cell_median_train = np.median(train_image.sum(axis=(1, 2, 3)))

# Path of the gene2vec weight file (loaded later) and the gene names as a list.
read_dir = f'{data_path}{dataset}/gene2vec_weight_xenium_mouse_brain.npy'
gene_names = np.loadtxt(f'{data_path}{dataset}/gene_names.txt', dtype=str).tolist()
# Optional: persist the per-cell, per-gene count sums.
# np.save(f'{data_path}output/xenium_mouse_brain/gene_count_sum_rep1.npy',train_image.sum((2,3)))

shape of total map of low resolution: (93447, 228, 12, 12)
(93447, 48, 48)
(93447, 48, 48)
shape of cell_type: (93447,)
shape of cell_type_label: (93447, 42)
number of training cells: 93447 , number of genes: 228

Load the trained model.

# Directory holding the trained checkpoint.
ckpt_dir = "/net/mulan/home/huiwann/spatial/SVC/checkpoints/"
# NOTE(security): torch.load unpickles the file, so only load checkpoints
# that come from a trusted source.
ckpt = torch.load(ckpt_dir +'SVC_xenium_mouse_brain.pth', map_location=device)

# Strip the wrapper prefix from the saved parameter names. A 'module.' prefix
# is what nn.DataParallel / DistributedDataParallel add to state-dict keys.
# Only a *leading* prefix is removed: str.replace would also delete 'module.'
# from the middle of a key (e.g. 'submodule.weight' -> 'subweight').
wrapper_prefix = 'module.'
new_ckpt = {
    (k[len(wrapper_prefix):] if k.startswith(wrapper_prefix) else k): v
    for k, v in ckpt['model_state_dict'].items()
}

# Pretrained gene embedding matrix, one row per gene.
gene2vec_weight = torch.from_numpy(np.load(read_dir)).float() ##n_gene * 200
print("shape of gene2vec_weight", gene2vec_weight.shape)

# Build the model; the identity dimension is the width of the one-hot labels.
model = SVC(
    gene2vec_weight = gene2vec_weight,
    cell_identity_dim = cell_type_label.shape[1],
).to(device)

# Copy exactly the entries the model expects out of the checkpoint; extra
# checkpoint entries are ignored. Report every missing entry at once rather
# than failing on the first one with an uninformative KeyError.
model_dict = model.state_dict()
missing_keys = [key for key in model_dict if key not in new_ckpt]
if missing_keys:
    raise KeyError(
        f"checkpoint is missing {len(missing_keys)} model state entries, "
        f"e.g. {missing_keys[:5]}"
    )
for key in model_dict:
    model_dict[key] = new_ckpt[key]
model.load_state_dict(model_dict)
model.eval()

# shuffle=False keeps the batch order fixed, so batch indices passed to the
# chunked extraction refer to the same cells on every run.
train_loader = DataLoader(train_dataset, batch_size = 128, shuffle = False, num_workers = 4)

shape of gene2vec_weight torch.Size([228, 200])

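For storage efficiency, cosine similarity values are quantized and saved as uint16 rather than as floating-point numbers. Specifically, values are linearly mapped from approximately [-1, 1] to [0, 20000] using $$ \mathrm{scaled} = \mathrm{cosine\ similarity} \times 10000 + 10000 $$ and then cast to uint16. To recover the original values after loading, convert to a floating-point type and invert the mapping: $$ \mathrm{cosine\ similarity} = (\mathrm{scaled} - 10000) / 10000 $$ (subtracting directly in uint16 would wrap around for scaled values below 10000).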

Because tissue-level subcellular spatial transcriptomics (ST) data are large, we run the extraction in chunks to avoid running out of memory.

# Keyword arguments that are identical for every chunk.
shared_args = dict(
    model=model,
    train_loader=train_loader,
    device=device,
    cell_median=cell_median,
)

# NOTE(review): consecutive chunks use batch_end=240 / batch_start=241 and
# batch_end=480 / batch_start=481. This covers every batch only if batch_end
# is inclusive; if save_cosine_similarity_subset treats it as exclusive,
# batches 240 and 480 are never written. Confirm against the helper.

# Chunk 1: batches from 0 up to batch_end=240.
save_cosine_similarity_subset(
    out_file=f"{data_path}/output/xenium_mouse_brain/cosine_simi_scaling_rep1_part_1.npz",
    batch_start=0,
    batch_end=240,
    **shared_args,
)

# Chunk 2: batches from 241 up to batch_end=480.
save_cosine_similarity_subset(
    out_file=f"{data_path}/output/xenium_mouse_brain/cosine_simi_scaling_rep1_part_2.npz",
    batch_start=241,
    batch_end=480,
    **shared_args,
)

# Chunk 3: batches from 481 onward (batch_end=None).
save_cosine_similarity_subset(
    out_file=f"{data_path}/output/xenium_mouse_brain/cosine_simi_scaling_rep1_part_3.npz",
    batch_start=481,
    batch_end=None,
    **shared_args,
)