{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract cell-specific cosine similarity matrix from latent representations for downstream analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import warnings\n",
    "\n",
    "# Silence warnings before the imports below so their import-time warnings are hidden too.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "\n",
    "from model.train import *\n",
    "from model.utils import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**load input data and build the dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shape of total map of low resolution: (93447, 228, 12, 12)\n",
      "(93447, 48, 48)\n",
      "(93447, 48, 48)\n",
      "shape of cell_type: (93447,)\n",
      "shape of cell_type_label: (93447, 42)\n",
      "number of training cells: 93447 , number of genes: 228\n"
     ]
    }
   ],
   "source": [
    "data_path = './SVC/'\n",
    "dataset = 'data/xenium_mouse_brain'\n",
    "input_dir = f'{data_path}{dataset}'\n",
    "\n",
    "# Prefer the second GPU, but fall back so the notebook still runs on hosts with fewer devices.\n",
    "if not torch.cuda.is_available():\n",
    "    device = 'cpu'\n",
    "elif torch.cuda.device_count() > 1:\n",
    "    device = 'cuda:1'\n",
    "else:\n",
    "    device = 'cuda:0'\n",
    "\n",
    "\n",
    "def load_npz_image(path):\n",
    "    \"\"\"Return the array stored under the \"image\" key of an .npz archive.\n",
    "\n",
    "    The archive is opened in a ``with`` block so its file handle is closed\n",
    "    once the array has been read, instead of being left open.\n",
    "    \"\"\"\n",
    "    with np.load(path) as archive:\n",
    "        return archive[\"image\"]\n",
    "\n",
    "\n",
    "train_image = load_npz_image(f'{input_dir}/cell_gene_map_low_res_rep1.npz')\n",
    "print(\"shape of total map of low resolution:\",train_image.shape)\n",
    "\n",
    "# Median over cells of the per-cell total (sum over axes 1, 2 and 3).\n",
    "# Computed before the dataset is built, exactly where the original computed `cell_median`.\n",
    "cell_median = np.median(train_image.sum((1,2,3)))\n",
    "\n",
    "location = np.load(f\"{input_dir}/cell_location_rep1.npy\")\n",
    "\n",
    "cell_morphology = load_npz_image(f\"{input_dir}/cell_morphology_rep1.npz\")\n",
    "nuclear_morphology = load_npz_image(f\"{input_dir}/nuclear_morphology_rep1.npz\")\n",
    "print(cell_morphology.shape)\n",
    "print(nuclear_morphology.shape)\n",
    "\n",
    "cell_type = np.load(f'{input_dir}/pred_cell_type_rep1.npy')\n",
    "print(\"shape of cell_type:\", cell_type.shape)\n",
    "\n",
    "# One-hot encode the labels: row i of the identity matrix is the code for the i-th unique label.\n",
    "cell_type_uni, cell_type_indices = np.unique(cell_type, return_inverse=True)\n",
    "cell_type_label = np.eye(len(cell_type_uni), dtype=int)[cell_type_indices]\n",
    "print(\"shape of cell_type_label:\", cell_type_label.shape)\n",
    "\n",
    "train_dataset = SVC_Dataset(\n",
    "    data_ori=train_image,\n",
    "    location=location,\n",
    "    cell_morphology_vec=cell_morphology,\n",
    "    nuclear_morphology_vec=nuclear_morphology,\n",
    "    identity_vec=cell_type_label,\n",
    ")\n",
    "print(\"number of training cells:\", len(train_dataset),', number of genes:', train_image.shape[1])\n",
    "\n",
    "# Same statistic as `cell_median`; aliased instead of re-summing the full array a second time.\n",
    "# NOTE(review): assumes SVC_Dataset does not modify `train_image` in place - confirm.\n",
    "cell_median_train = cell_median\n",
    "\n",
    "# Path of the gene2vec weight matrix; it is loaded in the model cell below.\n",
    "read_dir = f'{input_dir}/gene2vec_weight_xenium_mouse_brain.npy'\n",
    "gene_names = np.loadtxt(f'{input_dir}/gene_names.txt', dtype=str).tolist()\n",
    "\n",
    "# Optional: also save the per-cell, per-gene count sums.\n",
    "# np.save(f'{data_path}output/xenium_mouse_brain/gene_count_sum_rep1.npy',train_image.sum((2,3)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**load the trained model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shape of gene2vec_weight torch.Size([228, 200])\n"
     ]
    }
   ],
   "source": [
    "# Checkpoint directory. Set the SVC_CKPT_DIR environment variable to point elsewhere;\n",
    "# the default is the original location used by this notebook.\n",
    "ckpt_dir = os.environ.get(\"SVC_CKPT_DIR\", \"/net/mulan/home/huiwann/spatial/SVC/checkpoints/\")\n",
    "ckpt = torch.load(os.path.join(ckpt_dir, 'SVC_xenium_mouse_brain.pth'), map_location=device)\n",
    "\n",
    "# Remove \"module.\" only where it is a whole path component of a parameter name\n",
    "# (presumably added by a DataParallel-style wrapper at training time - confirm).\n",
    "# A plain str.replace would also corrupt names such as \"submodule.weight\".\n",
    "wrapper_component = re.compile(r'(?:^|(?<=\\.))module\\.')\n",
    "new_ckpt = {wrapper_component.sub('', k): v for k, v in ckpt['model_state_dict'].items()}\n",
    "\n",
    "gene2vec_weight = torch.from_numpy(np.load(read_dir)).float() ##n_gene * 200\n",
    "print(\"shape of gene2vec_weight\", gene2vec_weight.shape)\n",
    "\n",
    "model = SVC(\n",
    "    gene2vec_weight = gene2vec_weight,\n",
    "    cell_identity_dim = cell_type_label.shape[1],\n",
    ").to(device)\n",
    "\n",
    "# Load only the entries the model defines; extra checkpoint entries are ignored.\n",
    "# Missing entries are reported together instead of failing on the first one.\n",
    "model_dict = model.state_dict()\n",
    "missing_keys = [key for key in model_dict if key not in new_ckpt]\n",
    "if missing_keys:\n",
    "    raise KeyError(f\"checkpoint is missing {len(missing_keys)} model entries, e.g. {missing_keys[:5]}\")\n",
    "model.load_state_dict({key: new_ckpt[key] for key in model_dict})\n",
    "model.eval()\n",
    "\n",
    "# shuffle=False keeps batches in dataset order, so batch indices map to fixed cell ranges.\n",
    "train_loader = DataLoader(train_dataset, batch_size = 128, shuffle = False, num_workers = 4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
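    "For storage efficiency, cosine similarity values are quantized and saved as `uint16` rather than as floating-point numbers. Specifically, values are linearly mapped from approximately [-1, 1] to [0, 20000] using\n",
    "$$\n",
    "\\mathrm{scaled} = \\mathrm{cosine\\ similarity} \\times 10000 + 10000\n",
    "$$\n",
    "and then cast to `uint16`. To recover the similarities for downstream analysis, invert the mapping: $\\mathrm{cosine\\ similarity} = (\\mathrm{scaled} - 10000) / 10000$, which is accurate to about $10^{-4}$."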
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because tissue-level subcellular ST data are large, the extraction is run in chunks of batches, each saved to its own file, to avoid running out of memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch-index range for each output part; an end of None is passed through unchanged for the last part.\n",
    "# NOTE(review): the bounds are kept exactly as before (0-240, 241-480, 481-end). They cover\n",
    "# every batch only if `batch_end` is inclusive - confirm in save_cosine_similarity_subset;\n",
    "# if it is exclusive, batches 240 and 480 are never written.\n",
    "batch_chunks = [(0, 240), (241, 480), (481, None)]\n",
    "\n",
    "# Create the output directory up front so the first save cannot fail on a missing folder.\n",
    "output_dir = f\"{data_path}/output/xenium_mouse_brain\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "for part, (batch_start, batch_end) in enumerate(batch_chunks, start=1):\n",
    "    save_cosine_similarity_subset(\n",
    "        model=model,\n",
    "        train_loader=train_loader,\n",
    "        device=device,\n",
    "        cell_median=cell_median,\n",
    "        out_file=f\"{output_dir}/cosine_simi_scaling_rep1_part_{part}.npz\",\n",
    "        batch_start=batch_start,\n",
    "        batch_end=batch_end,\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "SVC",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}