Data preprocessing

In this tutorial, we demonstrate how to prepare the required inputs for training the SVC model, using seqFISH+ data from Eng et al. (2019) as an example.

Preprocessed datasets used in our project that are ready to use can be downloaded from: https://drive.google.com/drive/folders/1G4Nl-As6hXsAh7vHaRJ3vkYv44A-mU5f

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import scanpy as sc
from model.utils import *
import natsort
import matplotlib.pyplot as plt

load subcellular ST data

  • x y: Cartesian coordinates of a measured transcript within the cell

  • gene: Gene name of the transcript

  • cell: Cell ID that the transcript belongs to

  • centerX  centerY: Nuclear center coordinates for the corresponding cell

data_path = './SVC/' 
dataset = 'data/seqfish' 
data = pd.read_pickle(f"{data_path}{dataset}/seqfish_data_dict.pkl")
seqfish_data = data['data_df']
print(seqfish_data.head())
             x           y           gene cell nucleus batch  umi  centerX  \
0  1217.437557  557.583252  4933401b06rik  5-0      -1     0    1     1003   
1  1096.190309  394.835294  4933401b06rik  5-0       5     0    1     1003   
2  1093.189494  572.832405  4933401b06rik  5-0      -1     0    1     1003   
3  1005.120220  297.196271  4933401b06rik  5-0      -1     0    1     1003   
4  1142.815026  378.376491  4933401b06rik  5-0      -1     0    1     1003   

   centerY        type  sc_total  
0      425  fibroblast     32224  
1      425  fibroblast     32224  
2      425  fibroblast     32224  
3      425  fibroblast     32224  
4      425  fibroblast     32224  

Gene2vec ortholog mapping and gene filtering

For mouse data, we mapped the 16,906 Gene2vec genes to mouse orthologs and preserve the common genes.

ref_gene_names = np.loadtxt(f"{data_path}/data/gene2vec/convert_mmu_gene2vec_names.txt", dtype=str).tolist() ## for human data, use gene2vec/gene2vec_names.txt instead
common= list(set(np.array(list(data['gene_list_dict'].values()))[0]).intersection(set(ref_gene_names)))
print("common genes:", len(common))
common genes: 2659

We then selected the top 1,000 most highly expressed genes for model training and downstream analyses.

seqfish_data = seqfish_data[seqfish_data.gene.isin(common)]
high_exp_gene = seqfish_data.groupby(['gene']).count()['x'].sort_values(ascending=False)[:1000]
high_exp_gene_id = np.array(high_exp_gene.index)
gene_names = natsort.natsorted(high_exp_gene_id)
seqfish_data = seqfish_data[seqfish_data.gene.isin(gene_names)]
print("cell number:",seqfish_data.cell.unique().shape[0]) 
print("gene number:",seqfish_data.gene.unique().shape[0])
# np.savetxt(f"{data_path}{dataset}/gene_names.txt", gene_names, fmt='%s')
cell_names = natsort.natsorted(seqfish_data['cell'].unique().tolist())
# np.savetxt(f"{data_path}{dataset}/cell_names.txt", cell_names, fmt='%s')
cell number: 171
gene number: 1000

save the corresponding Gene2Vec vectors

chosen_indices_ref = []
for i in range(len(gene_names)):
    if gene_names[i] in ref_gene_names:
        index_position = ref_gene_names.index(gene_names[i])
        chosen_indices_ref.append(index_position)
gene2vec_weight = np.load(f'{data_path}/data/gene2vec/gene2vec_16906.npy')
print("gene2vec_weight:", gene2vec_weight.shape)
gene2vec_weight_filtered = gene2vec_weight[chosen_indices_ref]
print("filtered gene2vec_weight:", gene2vec_weight_filtered.shape)
# np.save(f"{data_path}{dataset}/gene2vec_weight_seqfish.npy", gene2vec_weight_filtered)
gene2vec_weight: (16906, 200)
filtered gene2vec_weight: (1000, 200)

load cell boundary information for each cell

  • direction_vec: the angle of transcript relative to the positive x-axis, with the nuclear center as the origin.

  • distance_to_center: Euclidean distance between the cell boundary point and nuclear center

cell_mask_contour = pd.read_pickle(f"{data_path}{dataset}/cell_mask_contour_preprocessed.pkl")
print(cell_mask_contour.head())
  cell    x    y  centerX  centerY  direction_vec  distance_to_center
0  0-0  521  496     1079      724         -158.0          602.783543
1  0-0  519  496     1079      724         -158.0          604.635427
2  0-0  517  494     1079      724         -157.5          607.242950
3  0-0  515  494     1079      724         -158.0          609.094410
4  0-0  513  492     1079      724         -157.5          611.702542

Conduct cell registration procedure

seqfish_data = register_original_data(seqfish_data, cell_mask_contour)
100%|██████████| 171/171 [07:26<00:00,  2.61s/it]
              x           y  gene cell nucleus batch  umi  centerX  centerY  \
10   828.109167  457.736354  Aatf  5-0      -1     0    1     1003      425   
11  1128.101035  488.990636  Aatf  5-0      -1     0    1     1003      425   
12  1039.856556  294.494083  Aatf  5-0      -1     0    1     1003      425   
13  1182.850103  461.492268  Aatf  5-0      -1     0    1     1003      425   
14   891.607843  440.987851  Aatf  5-0       5     0    1     1003      425   

          type  sc_total  distance_to_center  direction_vec     ratio  \
10  fibroblast     32224          177.928278          169.5  0.667541   
11  fibroblast     32224          140.517153           27.0  0.425739   
12  fibroblast     32224          135.610472          -74.0  0.802344   
13  fibroblast     32224          183.514973           11.5  0.668383   
14  fibroblast     32224          112.533657          172.0  0.409745   

    angle_radians    x_norm    y_norm  
10       2.958333 -0.656363  0.121650  
11       0.471239  0.379336  0.193281  
12      -1.291544  0.221156 -0.771263  
13       0.200713  0.654965  0.133254  
14       3.001966 -0.405758  0.057026  
fig, axs = plt.subplots(1, 2, figsize=(12, 6)) 
cell_mask_contour_i = cell_mask_contour[(cell_mask_contour.cell == '5-0')]
seqfish_data_i = seqfish_data[(seqfish_data.cell == '5-0')]

axs[0].scatter(cell_mask_contour_i['x'], cell_mask_contour_i['y'], alpha=0.6, s=1, color='black')
axs[0].scatter(seqfish_data_i[(seqfish_data_i.gene == 'Aamp')]['x'], seqfish_data_i[(seqfish_data_i.gene == 'Aamp')]['y'], label='Original Points', alpha=0.6, s=5, color='red')
axs[0].scatter(cell_mask_contour_i['centerX'][0],cell_mask_contour_i['centerY'][0],color='green',s=100,marker='+')
axs[0].axis('equal')
axs[0].set_title('Original') 

axs[1].scatter(seqfish_data_i[(seqfish_data_i.gene == 'Aamp')]['x_norm'], seqfish_data_i[(seqfish_data_i.gene == 'Aamp')]['y_norm'], label='Registered Points', alpha=0.6, s=5, color='red')
circle = plt.Circle((0, 0), 1, fill=False)  
axs[1].add_artist(circle) 
axs[1].set_title('Registered') 
axs[1].scatter(0,0,color='green',s=100,marker='+')
axs[1].axis('equal')
axs[1].set_xlim(-1.5, 1.5)
axs[1].set_ylim(-1.5, 1.5)
plt.tight_layout()

plt.show()
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed y limits to fulfill fixed data aspect with adjustable data limits.
../_images/aeb4365cd63e45a521e67e732d2cc3cfa2a8c18400f2269c3be2129cc48b046f.png

Represent the subcellular spatial expression of each cell and gene as a 48×48-pixel square image, and conduct resolution reduction

map_all_genes = get_gene_map(seqfish_data, cell_names, gene_names)
======> Loading transcripts file
1000 unique genes
171 unique cells
Converting to expression maps
100%|██████████| 171/171 [17:25<00:00,  6.12s/it]
print("map_all_genes shape:", map_all_genes.shape) 
# np.savez_compressed(f"{data_path}{dataset}/cell_gene_map.npz", image = map_all_genes)

map_all_genes_low_res = map_all_genes.reshape(map_all_genes.shape[0], 12, 4, 12, 4,map_all_genes.shape[-1]).sum(axis=(2, 4))
print(map_all_genes_low_res.shape)
map_all_genes_low_res = np.transpose(map_all_genes_low_res, (0,3,1,2))
print("map_all_genes_low_res shape:", map_all_genes_low_res.shape)
# np.savez_compressed("f"{data_path}{dataset}/cell_gene_map_low_res.npz", image = map_all_genes_low_res)
map_all_genes shape: (171, 48, 48, 1000)
(171, 12, 12, 1000)
map_all_genes shape: (171, 1000, 12, 12)
fig, ax = plt.subplots(3, 3, figsize=(10, 5))
cell_id = [0,1,2]
i = 0
for cell in cell_id:
    cell_gene = map_all_genes_low_res[cell]
    cell_gene_sum = cell_gene.sum(axis = 0)
    counts = np.sum(cell_gene, axis=(0, 1))
    ax[i,0].imshow(cell_gene_sum, cmap='gray')
    ax[i,0].set_title("Total gene Map: Cell " + str(cell))
    ax[i,0].invert_yaxis()
    ax[i,1].imshow(cell_gene[gene_names.index('Bgn'),:,:,], cmap='gray')
    ax[i,1].set_title("Cell: " + cell_names[cell] + ", Gene: " + 'Bgn')
    ax[i,1].invert_yaxis()
    ax[i,2].imshow(cell_gene[gene_names.index('Col1a1'),:,:,], cmap='gray')
    ax[i,2].set_title("Cell: " + cell_names[cell] + ", Gene: " + 'Col1a1')
    ax[i,2].invert_yaxis()
    i+=1
plt.tight_layout()

plt.show()
plt.close()
../_images/54a231eebf1fa9a9a8b07431f55fa6e8227fd316470bcf43f4d44fbadd48c5a1.png