Trimodal Integration
[ ]:
%%bash
# Fail fast: a %%bash cell only reports the exit status of its last command,
# so without this a failed clone or install would go unnoticed.
set -euo pipefail

# Clone only when the checkout is missing so the cell can be re-run safely
# (a second `git clone` into an existing directory fails).
if [ ! -d multiHIVE ]; then
    git clone https://github.com/Zafar-Lab/multiHIVE.git
fi

# scvi-tools is pinned to the version this notebook was last run with (1.3.0);
# the AnnData-based setup used further down is announced for removal in 1.4.
pip install "scvi-tools==1.3.0" scanpy anndata scikit-misc
[2]:
import os
import sys
from pathlib import Path

import scanpy as sc
import scvi
import torch

# multiHIVE is not pip-installed; it is imported straight from the repository's
# `src/` folder. Default to the checkout created by the `git clone` cell
# (./multiHIVE) instead of a machine-specific absolute path; set the
# MULTIHIVE_SRC environment variable to use a checkout located elsewhere.
MULTIHIVE_SRC = Path(os.environ.get("MULTIHIVE_SRC", "multiHIVE/src")).resolve()
if not MULTIHIVE_SRC.is_dir():
    raise FileNotFoundError(
        f"multiHIVE sources not found at {MULTIHIVE_SRC}; "
        "clone the repository or set MULTIHIVE_SRC."
    )
# Guard against duplicate entries when the cell is re-run.
if str(MULTIHIVE_SRC) not in sys.path:
    sys.path.append(str(MULTIHIVE_SRC))

from multiHIVE.model import multiHIVE
[3]:
# Global run configuration: matmul precision, RNG seed, and a version stamp so
# readers can see which scvi-tools release produced the saved outputs.
SEED = 0
torch.set_float32_matmul_precision("high")
scvi.settings.seed = SEED
print("Last run with scvi-tools version:", scvi.__version__)
Seed set to 0
Last run with scvi-tools version: 1.3.0
[6]:
# Load the TEA-seq AnnData from disk and de-duplicate feature names in place,
# then display the object summary.
tea_seq_path = "/home/anirudhn/Krushna/Data/TEA-seq/TEA-seq.h5ad"
adata = sc.read_h5ad(tea_seq_path)
adata.var_names_make_unique()
adata
[6]:
AnnData object with n_obs × n_vars = 25517 × 165454
obs: 'cell_type', 'batch'
var: 'modality'
obsm: 'protein_expression'
[7]:
# (Alternative, disabled) adata = scvi.data.organize_multiome_anndatas(adata)

# Reorder the feature axis so that features of the same modality are contiguous.
# NOTE(review): presumably the model splits features positionally using the
# per-modality counts passed to its constructor — confirm against multiHIVE.
modality_order = adata.var["modality"].argsort()
adata = adata[:, modality_order].copy()

# Keep only features detected in at least 1% of the cells.
min_cells = int(adata.n_obs * 0.01)
sc.pp.filter_genes(adata, min_cells=min_cells)

# Register the batch column and the protein table with multiHIVE.
multiHIVE.setup_anndata(
    adata,
    batch_key="batch",
    protein_expression_obsm_key="protein_expression",
)
adata
INFO Using column names from columns of adata.obsm['protein_expression']
/tmp/ipykernel_554711/4167032945.py:4: DeprecationWarning: multiHIVE is supposed to work with MuData. the use of anndata is deprecated and will be removed in scvi-tools 1.4. Please use setup_mudata
multiHIVE.setup_anndata(adata, batch_key="batch", protein_expression_obsm_key = "protein_expression")
[7]:
AnnData object with n_obs × n_vars = 25517 × 48872
obs: 'cell_type', 'batch', '_indices', '_scvi_batch', '_scvi_labels'
var: 'modality', 'n_cells'
uns: '_scvi_uuid', '_scvi_manager_uuid'
obsm: 'protein_expression'
[8]:
# Inspect the per-cell protein table that was registered with the model.
protein_table = adata.obsm["protein_expression"]
protein_table
[8]:
| CD10 | CD11b | CD11c | CD123 | CD127 | CD14 | CD141 | CD16 | CD172a | CD185 | ... | FceRI | HLA.DR | IgD | IgG1.K.Isotype.Control | IgM | KLRG1 | TCR.Va24.Ja18 | TCR.Va7.2 | TCR.a.b | TCR.g.d | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CAATGTCAGTGAACGA-6 | 2 | 10 | 250 | 32 | 2 | 51 | 1 | 11 | 143 | 1 | ... | 0 | 56 | 4 | 0 | 2 | 6 | 0 | 1 | 16 | 43 |
| GGATATTGTTTGTGGA-6 | 1 | 31 | 511 | 19 | 3 | 149 | 13 | 4 | 254 | 0 | ... | 2 | 122 | 3 | 1 | 21 | 7 | 1 | 0 | 21 | 78 |
| AGGCCCAGTGTCCAAA-6 | 2 | 11 | 403 | 9 | 0 | 70 | 34 | 5 | 279 | 1 | ... | 13 | 189 | 9 | 1 | 4 | 18 | 1 | 1 | 15 | 118 |
| CTCCTGAGTTGCACAA-6 | 1 | 10 | 372 | 11 | 1 | 84 | 16 | 4 | 165 | 2 | ... | 2 | 62 | 4 | 0 | 1 | 12 | 1 | 0 | 13 | 34 |
| ATTAGCGGTTAGGCTA-6 | 5 | 8 | 207 | 43 | 4 | 57 | 13 | 14 | 133 | 7 | ... | 8 | 211 | 2 | 10 | 2 | 65 | 5 | 1 | 21 | 203 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| GGAGCGATCGGTCATG-3 | 1 | 0 | 6 | 23 | 0 | 10 | 3 | 3 | 1 | 3 | ... | 3 | 131 | 35 | 0 | 1 | 8 | 2 | 0 | 6 | 22 |
| CCGTTACTCATTTAGG-3 | 1 | 1 | 22 | 1 | 4 | 7 | 1 | 75 | 4 | 1 | ... | 0 | 5 | 4 | 0 | 1 | 40 | 0 | 0 | 14 | 21 |
| CAGGATGCACTCGCTC-3 | 1 | 3 | 636 | 76 | 2 | 10 | 32 | 193 | 283 | 2 | ... | 0 | 61 | 2 | 0 | 3 | 14 | 6 | 0 | 13 | 40 |
| GGTCAAGCAGCTAATT-3 | 1 | 4 | 84 | 0 | 3 | 3 | 5 | 271 | 10 | 0 | ... | 2 | 13 | 3 | 0 | 1 | 8 | 1 | 0 | 16 | 21 |
| CATTGTAAGCGCTAAT-3 | 5 | 1 | 8 | 34 | 3 | 6 | 1 | 10 | 2 | 3 | ... | 1 | 370 | 79 | 0 | 17 | 15 | 35 | 0 | 23 | 87 |
25517 rows × 46 columns
[ ]:
# Build and train multiHIVE on the joint RNA + peak + protein data.
# All three feature counts are derived from `adata` instead of being hard-coded,
# so the cell stays correct if the filtering threshold or the protein panel changes.
is_gene = adata.var["modality"] == "Gene Expression"
is_peak = adata.var["modality"] == "Peaks"
vae = multiHIVE(
    adata,
    latent_distribution="normal",
    n_genes=is_gene.sum(),
    n_regions=is_peak.sum(),
    # Number of columns in the protein table (46 for this dataset).
    n_proteins=adata.obsm["protein_expression"].shape[1],
)
vae.train()

# Disabled example: posterior-predictive sampling with and without swapped latents.
# NOTE(review): not runnable as written — it references `hvg`, `pd` and the obsm key
# 'protein_counts', none of which are defined in the visible cells of this notebook
# (the protein table here is stored under 'protein_expression').
# generated_data = vae.posterior_predictive_sample(adata, swap_latent=False)
# rna_sample = generated_data[:,:hvg].copy()
# proteins_sample = pd.DataFrame(generated_data[:,hvg:], index= adata.obs_names, columns = adata.obsm['protein_counts'].columns)
# adata.obsm['RNA_Z1_denoised'] = rna_sample
# adata.obsm['protein_Z1_denoised'] = proteins_sample
# generated_data = vae.posterior_predictive_sample(adata, swap_latent=True)
# rna_sample = generated_data[:,:hvg].copy()
# proteins_sample = pd.DataFrame(generated_data[:,hvg:], index= adata.obs_names, columns = adata.obsm['protein_counts'].columns)
# adata.obsm['RNA_Z2_denoised'] = rna_sample
# adata.obsm['protein_Z2_denoised'] = proteins_sample
[ ]:
# Store the learned latent embedding on `adata` under "Z_multiHIVE", the key the
# neighbour-graph cell passes as `use_rep`. Merely displaying the result would
# leave that key undefined and make `sc.pp.neighbors` fail.
# NOTE(review): assumes get_latent_representation() returns a single
# (n_cells, n_latent) array — confirm against the multiHIVE implementation.
adata.obsm["Z_multiHIVE"] = vae.get_latent_representation()
adata.obsm["Z_multiHIVE"]
[ ]:
# Persist the trained model together with its AnnData; anything already saved
# at this location is overwritten.
model_dir = "./outputs/tea-seq/saved_model/"
vae.save(model_dir, overwrite=True, save_anndata=True)
[ ]:
# Build the neighbour graph on the multiHIVE embedding (read from
# adata.obsm["Z_multiHIVE"]), compute a UMAP layout from it, and plot the cells
# coloured by annotated cell type and by batch.
embedding_key = "Z_multiHIVE"
sc.pp.neighbors(adata, use_rep=embedding_key)
sc.tl.umap(adata)
sc.pl.umap(adata, color=["cell_type", "batch"])