{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract cell-specific cosine similarity matrix from latent representations for downstream analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import os\n",
    "\n",
    "# Explicit imports for the third-party names used below. They are placed\n",
    "# before the wildcard imports so that any same-named object exported by the\n",
    "# project modules still takes precedence, exactly as it did before.\n",
    "import numpy as np\n",
    "import torch\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "# Project-local names used in this notebook (SVC, SVC_Dataset,\n",
    "# save_cosine_similarity_subset) are provided by these wildcard imports.\n",
    "from model.train import *\n",
    "from model.utils import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**load input data and build the dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shape of total map of low resolution: (93447, 228, 12, 12)\n",
      "(93447, 48, 48)\n",
      "(93447, 48, 48)\n",
      "shape of cell_type: (93447,)\n",
      "shape of cell_type_label: (93447, 42)\n",
      "number of training cells: 93447 , number of genes: 228\n"
     ]
    }
   ],
   "source": [
    "# Configuration: data root (must end with '/'), dataset sub-directory, device.\n",
    "data_path = './SVC/'\n",
    "dataset = 'data/xenium_mouse_brain'\n",
    "device = 'cuda:1'\n",
    "\n",
    "# Low-resolution gene maps; axis 0 indexes cells and axis 1 indexes genes.\n",
    "train_image = np.load(f'{data_path}{dataset}/cell_gene_map_low_res_rep1.npz')[\"image\"]\n",
    "print(\"shape of total map of low resolution:\",train_image.shape)\n",
    "# Median over cells of the per-cell total count; passed to the extraction step.\n",
    "cell_median = np.median(train_image.sum((1,2,3)))\n",
    "\n",
    "location = np.load(f\"{data_path}{dataset}/cell_location_rep1.npy\")\n",
    "\n",
    "cell_morphology = np.load(f\"{data_path}{dataset}/cell_morphology_rep1.npz\")[\"image\"]\n",
    "nuclear_morphology = np.load(f\"{data_path}{dataset}/nuclear_morphology_rep1.npz\")[\"image\"]\n",
    "print(cell_morphology.shape)\n",
    "print(nuclear_morphology.shape)\n",
    "\n",
    "cell_type = np.load(f'{data_path}{dataset}/pred_cell_type_rep1.npy')\n",
    "print(\"shape of cell_type:\", cell_type.shape)\n",
    "\n",
    "# One-hot encode the cell types: row i of the identity matrix for class index i.\n",
    "cell_type_uni, cell_type_indices = np.unique(cell_type, return_inverse=True)\n",
    "cell_type_label = np.eye(len(cell_type_uni), dtype=int)[cell_type_indices]\n",
    "print(\"shape of cell_type_label:\", cell_type_label.shape)\n",
    "\n",
    "train_dataset = SVC_Dataset(\n",
    "    data_ori=train_image,\n",
    "    location=location,\n",
    "    cell_morphology_vec=cell_morphology,\n",
    "    nuclear_morphology_vec=nuclear_morphology,\n",
    "    identity_vec=cell_type_label,\n",
    ")\n",
    "print(\"number of training cells:\", len(train_dataset),', number of genes:', train_image.shape[1])\n",
    "# Same statistic as cell_median; reuse it instead of re-summing the full array.\n",
    "cell_median_train = cell_median\n",
    "\n",
    "# Path of the gene2vec weight matrix (loaded in the model cell below).\n",
    "read_dir =f'{data_path}{dataset}/gene2vec_weight_xenium_mouse_brain.npy'\n",
    "gene_names = np.loadtxt(f'{data_path}{dataset}/gene_names.txt', dtype=str)\n",
    "gene_names = gene_names.tolist()\n",
    "# Optional export of per-cell, per-gene count sums:\n",
    "# np.save(f'{data_path}output/xenium_mouse_brain/gene_count_sum_rep1.npy',train_image.sum((2,3)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**load the trained model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shape of gene2vec_weight torch.Size([228, 200])\n"
     ]
    }
   ],
   "source": [
    "# Checkpoint directory. Set the SVC_CKPT_DIR environment variable to point at\n",
    "# your own copy; the default is the location used when this notebook was written.\n",
    "ckpt_dir = os.environ.get(\"SVC_CKPT_DIR\", \"/net/mulan/home/huiwann/spatial/SVC/checkpoints/\")\n",
    "ckpt = torch.load(os.path.join(ckpt_dir, 'SVC_xenium_mouse_brain.pth'), map_location=device)\n",
    "\n",
    "# Strip the 'module.' prefix from parameter names\n",
    "# (presumably left by a DataParallel/DDP wrapper at training time - confirm).\n",
    "new_ckpt = {k.replace('module.', ''): v for k, v in ckpt['model_state_dict'].items()}\n",
    "\n",
    "gene2vec_weight = torch.from_numpy(np.load(read_dir)).float() ##n_gene * 200\n",
    "print(\"shape of gene2vec_weight\", gene2vec_weight.shape)\n",
    "\n",
    "model = SVC(\n",
    "    gene2vec_weight = gene2vec_weight,\n",
    "    cell_identity_dim = cell_type_label.shape[1],\n",
    ").to(device)\n",
    "\n",
    "# Copy exactly the entries the current model defines; extra checkpoint keys are\n",
    "# ignored and a key missing from the checkpoint raises KeyError.\n",
    "model_dict = model.state_dict()\n",
    "for key in model_dict:\n",
    "    model_dict[key] = new_ckpt[key]\n",
    "model.load_state_dict(model_dict)\n",
    "model.eval()\n",
    "\n",
    "# shuffle=False keeps batch indices stable so the chunked extraction below\n",
    "# can address batches by position.\n",
    "train_loader = DataLoader(train_dataset, batch_size = 128, shuffle = False, num_workers = 4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For storage efficiency, cosine similarity values were quantized and saved as uint16 rather than floating-point numbers. Specifically, values were linearly transformed from approximately [-1, 1] to [0, 20000] using\n",
    "$$\n",
    "\\mathrm{scaled} = \\mathrm{cosine\\ similarity} \\times 10000 + 10000\n",
    "$$\n",
    "and then cast to uint16. To recover the similarity downstream, invert the transform: $\\mathrm{cosine\\ similarity} \\approx (\\mathrm{scaled} - 10000) / 10000$."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Due to the large size of tissue-level subcellular ST data, we run the extraction in chunks of batches to avoid running out of memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per output file: (part number, batch_start, batch_end).\n",
    "# batch_end=None is passed through unchanged for the final chunk.\n",
    "# NOTE(review): these ranges assume batch_end is inclusive in\n",
    "# save_cosine_similarity_subset. If it is exclusive, batches 240 and 480 are\n",
    "# never written and the starts should be 240 and 480 - confirm in model/utils.py.\n",
    "chunks = [\n",
    "    (1, 0, 240),\n",
    "    (2, 241, 480),\n",
    "    (3, 481, None),\n",
    "]\n",
    "\n",
    "# data_path already ends with '/', so no extra separator is inserted here.\n",
    "out_dir = f\"{data_path}output/xenium_mouse_brain\"\n",
    "# Create the output directory up front so a missing folder cannot fail the\n",
    "# save after the expensive extraction has already run.\n",
    "os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "for part, batch_start, batch_end in chunks:\n",
    "    save_cosine_similarity_subset(\n",
    "        model=model,\n",
    "        train_loader=train_loader,\n",
    "        device=device,\n",
    "        cell_median=cell_median,\n",
    "        out_file=f\"{out_dir}/cosine_simi_scaling_rep1_part_{part}.npz\",\n",
    "        batch_start=batch_start,\n",
    "        batch_end=batch_end,\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "SVC",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}