Trimodal Integration

Trimodal Integration

[ ]:
%%bash
# Fetch the multiHIVE sources; skip the clone when the checkout already exists
# so that re-running this cell does not fail on "destination path already exists".
[ -d multiHIVE ] || git clone https://github.com/Zafar-Lab/multiHIVE.git
# scvi-tools is pinned to the version this notebook was last run with (1.3.0);
# the AnnData-based setup used further down is announced for removal in scvi-tools 1.4.
pip install scvi-tools==1.3.0 scanpy anndata scikit-misc
[2]:
import sys
from pathlib import Path

import scanpy as sc
import scvi
import torch

# multiHIVE is not pip-installed: import it from the repository that the setup
# cell clones into the working directory, rather than from a machine-specific
# absolute path that only exists on the original author's machine.
MULTIHIVE_SRC = Path("multiHIVE") / "src"
sys.path.append(str(MULTIHIVE_SRC.resolve()))
from multiHIVE.model import multiHIVE
[3]:
# Fix the global scvi-tools seed before any stochastic step runs.
scvi.settings.seed = 0
# Let PyTorch use faster, slightly lower-precision float32 matrix multiplications.
torch.set_float32_matmul_precision("high")
# Record the library version alongside the outputs.
print("Last run with scvi-tools version:", scvi.__version__)
Seed set to 0
Last run with scvi-tools version: 1.3.0
[6]:
# NOTE(review): machine-specific absolute path — point this at your own copy of the TEA-seq file.
tea_seq_h5ad = "/home/anirudhn/Krushna/Data/TEA-seq/TEA-seq.h5ad"
adata = sc.read_h5ad(tea_seq_h5ad)
# Make the feature (var) names unique in place.
adata.var_names_make_unique()
adata
[6]:
AnnData object with n_obs × n_vars = 25517 × 165454
    obs: 'cell_type', 'batch'
    var: 'modality'
    obsm: 'protein_expression'
[7]:
# Reorder the features by their "modality" label; the model cell further down
# counts the "Gene Expression" and "Peaks" entries of adata.var["modality"].
modality_order = adata.var["modality"].argsort()
adata = adata[:, modality_order].copy()

# Drop features detected in fewer than 1% of the cells.
min_cells_per_feature = int(adata.n_obs * 0.01)
sc.pp.filter_genes(adata, min_cells=min_cells_per_feature)

# Register the batch column and the protein matrix stored in .obsm with the model.
multiHIVE.setup_anndata(
    adata,
    batch_key="batch",
    protein_expression_obsm_key="protein_expression",
)
adata
INFO     Using column names from columns of adata.obsm['protein_expression']
/tmp/ipykernel_554711/4167032945.py:4: DeprecationWarning: multiHIVE is supposed to work with MuData. the use of anndata is deprecated and will be removed in scvi-tools 1.4. Please use setup_mudata
  multiHIVE.setup_anndata(adata, batch_key="batch", protein_expression_obsm_key = "protein_expression")
[7]:
AnnData object with n_obs × n_vars = 25517 × 48872
    obs: 'cell_type', 'batch', '_indices', '_scvi_batch', '_scvi_labels'
    var: 'modality', 'n_cells'
    uns: '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'protein_expression'
[8]:
# Display the per-cell protein table that was registered via protein_expression_obsm_key.
adata.obsm["protein_expression"]
[8]:
CD10 CD11b CD11c CD123 CD127 CD14 CD141 CD16 CD172a CD185 ... FceRI HLA.DR IgD IgG1.K.Isotype.Control IgM KLRG1 TCR.Va24.Ja18 TCR.Va7.2 TCR.a.b TCR.g.d
CAATGTCAGTGAACGA-6 2 10 250 32 2 51 1 11 143 1 ... 0 56 4 0 2 6 0 1 16 43
GGATATTGTTTGTGGA-6 1 31 511 19 3 149 13 4 254 0 ... 2 122 3 1 21 7 1 0 21 78
AGGCCCAGTGTCCAAA-6 2 11 403 9 0 70 34 5 279 1 ... 13 189 9 1 4 18 1 1 15 118
CTCCTGAGTTGCACAA-6 1 10 372 11 1 84 16 4 165 2 ... 2 62 4 0 1 12 1 0 13 34
ATTAGCGGTTAGGCTA-6 5 8 207 43 4 57 13 14 133 7 ... 8 211 2 10 2 65 5 1 21 203
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
GGAGCGATCGGTCATG-3 1 0 6 23 0 10 3 3 1 3 ... 3 131 35 0 1 8 2 0 6 22
CCGTTACTCATTTAGG-3 1 1 22 1 4 7 1 75 4 1 ... 0 5 4 0 1 40 0 0 14 21
CAGGATGCACTCGCTC-3 1 3 636 76 2 10 32 193 283 2 ... 0 61 2 0 3 14 6 0 13 40
GGTCAAGCAGCTAATT-3 1 4 84 0 3 3 5 271 10 0 ... 2 13 3 0 1 8 1 0 16 21
CATTGTAAGCGCTAAT-3 5 1 8 34 3 6 1 10 2 3 ... 1 370 79 0 17 15 35 0 23 87

25517 rows × 46 columns

[ ]:
# Derive every modality size from the data instead of hard-coding it, so the
# model stays consistent with adata if the filtering or the protein panel changes.
modality = adata.var["modality"]
n_genes = int((modality == "Gene Expression").sum())
n_regions = int((modality == "Peaks").sum())
n_proteins = adata.obsm["protein_expression"].shape[1]  # 46 columns for this dataset

vae = multiHIVE(
    adata,
    latent_distribution="normal",
    n_genes=n_genes,
    n_regions=n_regions,
    n_proteins=n_proteins,
)
vae.train()

# generated_data = vae.posterior_predictive_sample(adata, swap_latent=False)
# rna_sample = generated_data[:,:hvg].copy()
# proteins_sample = pd.DataFrame(generated_data[:,hvg:],  index= adata.obs_names, columns = adata.obsm['protein_counts'].columns)
# adata.obsm['RNA_Z1_denoised'] = rna_sample
# adata.obsm['protein_Z1_denoised'] = proteins_sample

# generated_data = vae.posterior_predictive_sample(adata, swap_latent=True)
# rna_sample = generated_data[:,:hvg].copy()
# proteins_sample = pd.DataFrame(generated_data[:,hvg:],  index= adata.obs_names, columns = adata.obsm['protein_counts'].columns)
# adata.obsm['RNA_Z2_denoised'] = rna_sample
# adata.obsm['protein_Z2_denoised'] = proteins_sample
[ ]:
# Store the latent embedding on adata: the neighbour-graph/UMAP cell reads it
# from adata.obsm["Z_multiHIVE"], which nothing else in this notebook populates.
# NOTE(review): assumes get_latent_representation() returns a single
# (n_cells, n_latent) array — confirm against the multiHIVE API.
adata.obsm["Z_multiHIVE"] = vae.get_latent_representation()
adata.obsm["Z_multiHIVE"]
[ ]:
# Persist the trained model together with its AnnData, replacing any earlier save.
vae.save(
    "./outputs/tea-seq/saved_model/",
    overwrite=True,
    save_anndata=True,
)
[ ]:
# Build the neighbour graph on the embedding stored under .obsm["Z_multiHIVE"],
# compute the UMAP layout, and colour it by cell type and by batch.
sc.pp.neighbors(adata, use_rep="Z_multiHIVE")
sc.tl.umap(adata)
sc.pl.umap(adata, color=["cell_type", "batch"])