import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib as mpl
import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white', color_map='viridis')
sc.logging.print_header()

results_file = 'pbmc10k_scanpy.h5ad'

scanpy==1.7.1 anndata==0.7.5 umap==0.4.6 numpy==1.20.1 scipy==1.6.1 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3


adata = sc.read_10x_mtx('../data/soupX_pbmc10k_filt', cache = True)
adata

... writing an h5ad cache file to speedup reading next time

AnnData object with n_obs × n_vars = 10194 × 36601
    var: 'gene_ids', 'feature_types'


adata.var_names_make_unique()
adata
adata.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.5', 'AL627309.4', 'AP006222.2', 'AL732372.1',
       ...
       'AC133551.1', 'AC136612.1', 'AC136616.1', 'AC136616.3', 'AC136616.2',
       'AC141272.1', 'AC023491.2', 'AC007325.1', 'AC007325.4', 'AC007325.2'],
      dtype='object', length=36601)


adata.obs.head()


print(adata.obs_names[10189:10194].tolist())
print()
print(adata.obs_names[-5:].tolist())
print()
print(adata.var_names[2000:2010].tolist())

['TTTGTTGGTCCGTACG-1', 'TTTGTTGGTCGGAACA-1', 'TTTGTTGGTTTACGAC-1', 'TTTGTTGTCGTTATCT-1', 'TTTGTTGTCTTTGCTA-1']

['TTTGTTGGTCCGTACG-1', 'TTTGTTGGTCGGAACA-1', 'TTTGTTGGTTTACGAC-1', 'TTTGTTGTCGTTATCT-1', 'TTTGTTGTCTTTGCTA-1']

['THEM5', 'THEM4', 'AL450992.2', 'AL450992.3', 'S100A10', 'AL450992.1', 'S100A11', 'TCHHL1', 'TCHH', 'AL589986.1']


sc.pp.filter_genes(adata, min_cells=3)
adata

filtered out 14591 genes that are detected in less than 3 cells

AnnData object with n_obs × n_vars = 10194 × 22010
    var: 'gene_ids', 'feature_types', 'n_cells'


sc.pl.highest_expr_genes(adata, n_top=20, )

normalizing counts per cell
    finished (0:00:00)


adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
adata.var['rb'] = adata.var_names.str.startswith(('RPS','RPL'))  # annotate the group of ribosomal proteins as 'rb'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(adata, qc_vars=['rb'], percent_top=None, log1p=False, inplace=True)


sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_rb'], 
             jitter=0.4, multi_panel=True)

... storing 'feature_types' as categorical


adata.obs.head()


doublets = pd.read_csv("../data/scrublet_calls.tsv", delimiter = '\t', 
                       header = None, index_col = 0,names = ['Doublet_score','Is_doublet'])
doublets.shape

(10194, 2)


doublets.head()


adata.obs = adata.obs.merge(doublets, left_index=True, right_index=True)
adata.obs.tail()


sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')


sc.pl.scatter(adata, x='total_counts', y='pct_counts_rb')
sc.pl.scatter(adata, x='pct_counts_rb', y='pct_counts_mt')


sc.pl.scatter(adata, x='n_genes_by_counts', y='Doublet_score')


adata.obs.head()


adata.obs.dtypes

n_genes_by_counts      int32
total_counts         float32
total_counts_mt      float32
pct_counts_mt        float32
total_counts_rb      float32
pct_counts_rb        float32
Doublet_score        float64
Is_doublet              bool
dtype: object


conditions = [
    (adata.obs['Is_doublet'] == True),
    (adata.obs['n_genes_by_counts'] < 500),
    (adata.obs['pct_counts_mt'] > 15),
    (adata.obs['pct_counts_mt'] <= 15) & (adata.obs['n_genes_by_counts'] >= 500) & (adata.obs['Is_doublet'] != True)
]

values = ['Doublet', 'Low_nFeature', 'High_MT', 'Pass']
adata.obs['QC'] = np.select(conditions, values)
adata.obs['QC'] = adata.obs['QC'].astype('category')
adata.obs[['Is_doublet','n_genes_by_counts','pct_counts_mt','QC']].head(10)


adata.obs['QC'].value_counts()

Pass            8824
High_MT          548
Doublet          547
Low_nFeature     275
Name: QC, dtype: int64


sc.pl.violin(adata[adata.obs['QC'] == 'Pass'], ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_rb'], 
             jitter=0.4, multi_panel=True)


sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata

normalizing counts per cell
    finished (0:00:00)

AnnData object with n_obs × n_vars = 10194 × 22010
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'log1p'


adata.var.head()


sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


sc.pl.highly_variable_genes(adata)


adata.var.highly_variable.value_counts()

False    19140
True      2870
Name: highly_variable, dtype: int64


adata.raw = adata


adata = adata[:, adata.var.highly_variable]
adata

View of AnnData object with n_obs × n_vars = 10194 × 2870
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'


sc.pp.scale(adata, max_value=10)

/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_simple.py:810: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption


sc.tl.pca(adata, svd_solver='arpack')

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:12)


sc.pl.pca_scatter(adata, color='CST3')


sc.pl.pca_variance_ratio(adata, log=True)


sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50)

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:15)


sc.tl.umap(adata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)


sc.pl.umap(adata, color=['LYZ','MS4A1','CD3D'])


sc.tl.leiden(adata)

running Leiden clustering
    finished: found 20 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:01)


sc.pl.umap(adata, color=['leiden'])


s_genes = ['MCM5','PCNA','TYMS','FEN1','MCM7','MCM4','RRM1','UNG','GINS2','MCM6','CDCA7','DTL','PRIM1',
           'UHRF1','CENPU','HELLS','RFC2','POLR1B','NASP','RAD51AP1','GMNN','WDR76','SLBP','CCNE2','UBR7',
           'POLD3','MSH2','ATAD2','RAD51','RRM2','CDC45','CDC6','EXO1','TIPIN','DSCC1','BLM','CASP8AP2',
           'USP1','CLSPN','POLA1','CHAF1B','MRPL36','E2F8']
g2m_genes = ['HMGB2','CDK1','NUSAP1','UBE2C','BIRC5','TPX2','TOP2A','NDC80','CKS2','NUF2','CKS1B',
             'MKI67','TMPO','CENPF','TACC3','PIMREG','SMC4','CCNB2','CKAP2L','CKAP2','AURKB','BUB1',
             'KIF11','ANP32E','TUBB4B','GTSE1','KIF20B','HJURP','CDCA3','JPT1','CDC20','TTK','CDC25C',
             'KIF2C','RANGAP1','NCAPD2','DLGAP5','CDCA2','CDCA8','ECT2','KIF23','HMMR','AURKA','PSRC1',
             'ANLN','LBR','CKAP5','CENPE','CTCF','NEK2','G2E3','GAS2L3','CBX5','CENPA']
cell_cycle_genes = s_genes + g2m_genes
display(len(s_genes))
display(len(g2m_genes))
display(len(cell_cycle_genes))

43

54

97


adata = adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 10194 × 22010
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors'
    obsm: 'X_pca', 'X_umap'


cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
len(cell_cycle_genes)

97


sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

calculating cell cycle phase
computing score 'S_score'
    finished: added
    'S_score', score of gene set (adata.obs).
    727 total control genes are used. (0:00:00)
computing score 'G2M_score'
    finished: added
    'G2M_score', score of gene set (adata.obs).
    815 total control genes are used. (0:00:00)
-->     'phase', cell cycle phase (adata.obs)


sc.pl.umap(adata, color=['Doublet_score','pct_counts_mt','n_genes_by_counts'])

... storing 'phase' as categorical


adata = adata[adata.obs['QC'] == 'Pass']
adata

View of AnnData object with n_obs × n_vars = 8824 × 22010
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors'
    obsm: 'X_pca', 'X_umap'


sc.pl.umap(adata, color=['Doublet_score','pct_counts_mt','n_genes_by_counts'])


sc.pl.umap(adata, color=['pct_counts_rb','pct_counts_mt','leiden'])


with mpl.rc_context({'figure.figsize': (5.5, 4)}):
    sc.pl.violin(adata, ['pct_counts_rb', 'pct_counts_mt'], groupby='leiden', stripplot = False )


sc.pl.umap(adata, color=['S_score','G2M_score','leiden'])


with mpl.rc_context({'figure.figsize': (5.5, 4)}):
    sc.pl.violin(adata, ['S_score','G2M_score'], groupby = 'leiden', stripplot = False)


sc.pl.umap(adata, color=['n_genes_by_counts','total_counts','leiden'])


with mpl.rc_context({'figure.figsize': (5.5, 4)}):
    sc.pl.violin(adata, ['n_genes_by_counts','total_counts'], groupby = 'leiden', stripplot = False)


adata

View of AnnData object with n_obs × n_vars = 8824 × 22010
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors'
    obsm: 'X_pca', 'X_umap'


sc.pp.highly_variable_genes(adata, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
adata.var.highly_variable.value_counts()

extracting highly variable genes
    finished (0:00:00)
Trying to set attribute `.uns` of view, copying.
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

False    20005
True      2005
Name: highly_variable, dtype: int64


adata.raw = adata
adata = adata[:, adata.var.highly_variable]
adata

View of AnnData object with n_obs × n_vars = 8824 × 2005
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors'
    obsm: 'X_pca', 'X_umap'


sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt', 'S_score', 'G2M_score'])

regressing out ['total_counts', 'pct_counts_mt', 'S_score', 'G2M_score']
    sparse input is densified and may lead to high memory use
    finished (0:03:46)


sc.pp.scale(adata, max_value=10)


sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color='CST3')

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:07)


sc.pl.pca_variance_ratio(adata, log=True)


sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata)
sc.tl.leiden(adata)

computing neighbors
    using 'X_pca' with n_pcs = 30
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)
running Leiden clustering
    finished: found 17 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)


sc.tl.leiden(adata,resolution = 0.8)

running Leiden clustering
    finished: found 16 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)


with mpl.rc_context({'figure.figsize': (8, 8)}):
    sc.pl.umap(adata, color=['leiden'],legend_loc='on data')


sc.tl.paga(adata)
sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(adata, init_pos='paga')

running PAGA
    finished: added
    'paga/connectivities', connectivities adjacency (adata.uns)
    'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:00)
--> added 'pos', the PAGA positions (adata.uns['paga'])
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:06)


with mpl.rc_context({'figure.figsize': (8, 8)}):
    sc.pl.umap(adata, color=['leiden'],legend_loc='on data')


sc.pl.umap(adata, color=['LYZ','MS4A1','CD8B','CCR7','IL32','NKG7','LILRA4','FCER1A','PPBP'], ncols = 3)


sc.tl.leiden(adata, resolution = 1.2, key_added = "leiden_1.2")
sc.tl.leiden(adata, resolution = 1.5, key_added = "leiden_1.5")
sc.tl.leiden(adata, resolution = 2.0, key_added = "leiden_2.0")

running Leiden clustering
    finished: found 18 clusters and added
    'leiden_1.2', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
    finished: found 22 clusters and added
    'leiden_1.5', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
    finished: found 28 clusters and added
    'leiden_2.0', the cluster labels (adata.obs, categorical) (0:00:00)


sc.pl.umap(adata, color=['leiden_1.2', 'leiden_1.5', 'leiden_2.0'], legend_loc='on data', wspace = 0.25, legend_fontsize=10)


from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
X_pca = adata.obsm['X_pca'] 

kmeans = KMeans(n_clusters=15, random_state=0).fit(X_pca) 
adata.obs['kmeans15'] = kmeans.labels_.astype(str)
kmeans = KMeans(n_clusters=20, random_state=0).fit(X_pca) 
adata.obs['kmeans20'] = kmeans.labels_.astype(str)
kmeans = KMeans(n_clusters=25, random_state=0).fit(X_pca) 
adata.obs['kmeans25'] = kmeans.labels_.astype(str)

sc.pl.umap(adata, color=['kmeans15', 'kmeans20', 'kmeans25'], legend_loc='on data', wspace = 0.25, legend_fontsize=10)

... storing 'kmeans15' as categorical
... storing 'kmeans20' as categorical
... storing 'kmeans25' as categorical


from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=15, affinity='euclidean', linkage='ward')
adata.obs['hclust15'] = cluster.fit_predict(X_pca).astype(str)

cluster = AgglomerativeClustering(n_clusters=20, affinity='euclidean', linkage='ward')
adata.obs['hclust20'] = cluster.fit_predict(X_pca).astype(str)

cluster = AgglomerativeClustering(n_clusters=25, affinity='euclidean', linkage='ward')
adata.obs['hclust25'] = cluster.fit_predict(X_pca).astype(str)

sc.pl.umap(adata, color=['hclust15', 'hclust20', 'hclust25'], legend_loc='on data', wspace = 0.25, legend_fontsize=10)

... storing 'hclust15' as categorical
... storing 'hclust20' as categorical
... storing 'hclust25' as categorical


adata.obs['leiden'].value_counts()

0     1438
1     1221
2     1155
3     1033
4      900
5      754
6      529
7      379
8      358
9      327
10     267
11     134
12     120
13     105
14      87
15      17
Name: leiden, dtype: int64


sc.pl.umap(adata, color=['PPBP','FCER1A','leiden'])


platelets = adata[(adata[:,'GP1BB'].X > 1) & (adata[:,'PPBP'].X > 1) & (adata[:,'PF4'].X > 1), :]
platelets

View of AnnData object with n_obs × n_vars = 31 × 2005
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase', 'leiden_1.2', 'leiden_1.5', 'leiden_2.0', 'kmeans15', 'kmeans20', 'kmeans25', 'hclust15', 'hclust20', 'hclust25'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'paga', 'leiden_sizes', 'leiden_1.2_colors', 'leiden_1.5_colors', 'leiden_2.0_colors', 'kmeans15_colors', 'kmeans20_colors', 'kmeans25_colors', 'hclust15_colors', 'hclust20_colors', 'hclust25_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'


platelets.obs_names

Index(['AACAGGGAGTACAGCG-1', 'ACCTACCCACATACTG-1', 'ACTGTCCTCGCAGTGC-1',
       'AGCCAATTCCGGGACT-1', 'AGGATCTTCACATACG-1', 'AGTCATGCATTGCCGG-1',
       'ATACTTCAGCAGTAAT-1', 'CACACAATCTCTCAAT-1', 'CACGAATAGGGCATGT-1',
       'CACGTGGGTATAATGG-1', 'CAGATTGGTCAGATTC-1', 'CATCAAGTCGTTCATT-1',
       'CATCCCAGTAAGTCAA-1', 'CATGCGGAGTCCGCCA-1', 'CCGGACATCAATCCAG-1',
       'CCGTGAGTCTGCGATA-1', 'CGGACACAGTGGTGAC-1', 'GACTCTCCATTAGGAA-1',
       'GATCCCTAGCGGTAGT-1', 'GTCACGGGTCGAATTC-1', 'GTCAGCGTCGCTACAA-1',
       'GTGAGTTCAATTCTCT-1', 'GTTACGAGTTCAGTAC-1', 'GTTAGTGTCCCATTTA-1',
       'TACGTCCTCTGGGCGT-1', 'TAGATCGTCCGCATAA-1', 'TCCGATCGTACTCGAT-1',
       'TCGAACAAGCTGCCAC-1', 'TCTGTCGGTCTGGTTA-1', 'TGTAAGCAGAAACTCA-1',
       'TTGTGTTTCCCGTGTT-1'],
      dtype='object')


adata.obs['leiden'] = adata.obs['leiden'].cat.add_categories('16')
adata.obs.loc[lambda df: df.index.isin(platelets.obs.index),'leiden'] = '16'


with mpl.rc_context({'figure.figsize': (8, 8)}):
    sc.pl.umap(adata, color=['leiden'],legend_loc='on data')


adata

AnnData object with n_obs × n_vars = 8824 × 2005
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase', 'leiden_1.2', 'leiden_1.5', 'leiden_2.0', 'kmeans15', 'kmeans20', 'kmeans25', 'hclust15', 'hclust20', 'hclust25'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'paga', 'leiden_sizes', 'leiden_1.2_colors', 'leiden_1.5_colors', 'leiden_2.0_colors', 'kmeans15_colors', 'kmeans20_colors', 'kmeans25_colors', 'hclust15_colors', 'hclust20_colors', 'hclust25_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'


sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:14)


marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
                'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', 'IGLC1','IL32',
                'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP','LILRA4']


pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)


with mpl.rc_context({'figure.figsize': (12, 4)}):
    sc.pl.rank_genes_groups_violin(adata, groups='16', n_genes=20)


sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden', stripplot = False )


sc.pl.dotplot(adata, marker_genes, groupby='leiden')


sc.pl.stacked_violin(adata, marker_genes, groupby='leiden', rotation=90)


! sudo apt-get -y install libgraphviz-dev
! sudo apt-get -y install graphviz
! pip install --no-input -U cello-classify --no-cache

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libcdt5 libcgraph6 libgts-0.7-5 libgts-bin libgvc6 libgvc6-plugins-gtk
  libgvpr2 liblab-gamut1 libpathplan4 libxdot4
The following NEW packages will be installed:
  libcdt5 libcgraph6 libgraphviz-dev libgts-0.7-5 libgts-bin libgvc6
  libgvc6-plugins-gtk libgvpr2 liblab-gamut1 libpathplan4 libxdot4
0 upgraded, 11 newly installed, 0 to remove and 6 not upgraded.
Need to get 1,357 kB of archives.
After this operation, 6,400 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libcdt5 amd64 2.42.2-3build2 [18.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 libcgraph6 amd64 2.42.2-3build2 [41.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgts-0.7-5 amd64 0.7.6+darcs121130-4 [150 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal/universe amd64 libpathplan4 amd64 2.42.2-3build2 [21.9 kB]
Get:5 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgvc6 amd64 2.42.2-3build2 [647 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgvpr2 amd64 2.42.2-3build2 [167 kB]
Get:7 http://archive.ubuntu.com/ubuntu focal/universe amd64 libxdot4 amd64 2.42.2-3build2 [15.4 kB]
Get:8 http://archive.ubuntu.com/ubuntu focal/universe amd64 liblab-gamut1 amd64 2.42.2-3build2 [177 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgvc6-plugins-gtk amd64 2.42.2-3build2 [20.6 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgraphviz-dev amd64 2.42.2-3build2 [57.2 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal/universe amd64 libgts-bin amd64 0.7.6+darcs121130-4 [41.3 kB]
Fetched 1,357 kB in 0s (23.1 MB/s)      
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package libcdt5:amd64.
(Reading database ... 80162 files and directories currently installed.)
Preparing to unpack .../00-libcdt5_2.42.2-3build2_amd64.deb ...
Unpacking libcdt5:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libcgraph6:amd64.
Preparing to unpack .../01-libcgraph6_2.42.2-3build2_amd64.deb ...
Unpacking libcgraph6:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libgts-0.7-5:amd64.
Preparing to unpack .../02-libgts-0.7-5_0.7.6+darcs121130-4_amd64.deb ...
Unpacking libgts-0.7-5:amd64 (0.7.6+darcs121130-4) ...
Selecting previously unselected package libpathplan4:amd64.
Preparing to unpack .../03-libpathplan4_2.42.2-3build2_amd64.deb ...
Unpacking libpathplan4:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libgvc6.
Preparing to unpack .../04-libgvc6_2.42.2-3build2_amd64.deb ...
Unpacking libgvc6 (2.42.2-3build2) ...
Selecting previously unselected package libgvpr2:amd64.
Preparing to unpack .../05-libgvpr2_2.42.2-3build2_amd64.deb ...
Unpacking libgvpr2:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libxdot4:amd64.
Preparing to unpack .../06-libxdot4_2.42.2-3build2_amd64.deb ...
Unpacking libxdot4:amd64 (2.42.2-3build2) ...
Selecting previously unselected package liblab-gamut1:amd64.
Preparing to unpack .../07-liblab-gamut1_2.42.2-3build2_amd64.deb ...
Unpacking liblab-gamut1:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libgvc6-plugins-gtk.
Preparing to unpack .../08-libgvc6-plugins-gtk_2.42.2-3build2_amd64.deb ...
Unpacking libgvc6-plugins-gtk (2.42.2-3build2) ...
Selecting previously unselected package libgraphviz-dev:amd64.
Preparing to unpack .../09-libgraphviz-dev_2.42.2-3build2_amd64.deb ...
Unpacking libgraphviz-dev:amd64 (2.42.2-3build2) ...
Selecting previously unselected package libgts-bin.
Preparing to unpack .../10-libgts-bin_0.7.6+darcs121130-4_amd64.deb ...
Unpacking libgts-bin (0.7.6+darcs121130-4) ...
Setting up libxdot4:amd64 (2.42.2-3build2) ...
Setting up liblab-gamut1:amd64 (2.42.2-3build2) ...
Setting up libgts-0.7-5:amd64 (0.7.6+darcs121130-4) ...
Setting up libpathplan4:amd64 (2.42.2-3build2) ...
Setting up libcdt5:amd64 (2.42.2-3build2) ...
Setting up libcgraph6:amd64 (2.42.2-3build2) ...
Setting up libgts-bin (0.7.6+darcs121130-4) ...
Setting up libgvc6 (2.42.2-3build2) ...
Setting up libgvc6-plugins-gtk (2.42.2-3build2) ...
Setting up libgvpr2:amd64 (2.42.2-3build2) ...
Setting up libgraphviz-dev:amd64 (2.42.2-3build2) ...
Processing triggers for libc-bin (2.31-0ubuntu9.1) ...
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libann0
Suggested packages:
  gsfonts graphviz-doc
The following NEW packages will be installed:
  graphviz libann0
0 upgraded, 2 newly installed, 0 to remove and 6 not upgraded.
Need to get 616 kB of archives.
After this operation, 3,255 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libann0 amd64 1.1.2+doc-7build1 [26.0 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 graphviz amd64 2.42.2-3build2 [590 kB]
Fetched 616 kB in 0s (13.2 MB/s)  
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package libann0.
(Reading database ... 80320 files and directories currently installed.)
Preparing to unpack .../libann0_1.1.2+doc-7build1_amd64.deb ...
Unpacking libann0 (1.1.2+doc-7build1) ...
Selecting previously unselected package graphviz.
Preparing to unpack .../graphviz_2.42.2-3build2_amd64.deb ...
Unpacking graphviz (2.42.2-3build2) ...
Setting up libann0 (1.1.2+doc-7build1) ...
Setting up graphviz (2.42.2-3build2) ...
Processing triggers for libc-bin (2.31-0ubuntu9.1) ...
Collecting cello-classify
  Downloading cello_classify-2.0.3-py3-none-any.whl (11.2 MB)
     |████████████████████████████████| 11.2 MB 3.7 MB/s eta 0:00:01
Requirement already satisfied: pandas>=0.23.4 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (1.2.3)
Collecting quadprog>=0.1.6
  Downloading quadprog-0.1.10.tar.gz (121 kB)
     |████████████████████████████████| 121 kB 98.0 MB/s eta 0:00:01
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
    Preparing wheel metadata ... done
Requirement already satisfied: h5py>=2.10.0 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (3.1.0)
Requirement already satisfied: dill>=0.3.1.1 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (0.3.3)
Requirement already satisfied: scipy>=1.3.1 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (1.6.1)
Requirement already satisfied: Cython>=0.29.17 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (0.29.22)
Collecting pygraphviz
  Downloading pygraphviz-1.7.zip (118 kB)
     |████████████████████████████████| 118 kB 124.5 MB/s eta 0:00:01
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.8/site-packages (from cello-classify) (3.3.4)
Requirement already satisfied: anndata>=0.7.1 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (0.7.5)
Requirement already satisfied: numpy>=1.17.1 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (1.20.1)
Requirement already satisfied: scikit-learn>=0.22.2.post1 in /opt/conda/lib/python3.8/site-packages (from cello-classify) (0.24.1)
Requirement already satisfied: natsort in /opt/conda/lib/python3.8/site-packages (from anndata>=0.7.1->cello-classify) (7.1.1)
Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from anndata>=0.7.1->cello-classify) (20.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.8/site-packages (from pandas>=0.23.4->cello-classify) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.8/site-packages (from pandas>=0.23.4->cello-classify) (2021.1)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas>=0.23.4->cello-classify) (1.15.0)
Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.22.2.post1->cello-classify) (1.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.22.2.post1->cello-classify) (2.1.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.8/site-packages (from matplotlib->cello-classify) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/conda/lib/python3.8/site-packages (from matplotlib->cello-classify) (2.4.7)
Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.8/site-packages (from matplotlib->cello-classify) (8.1.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.8/site-packages (from matplotlib->cello-classify) (1.3.1)
Building wheels for collected packages: quadprog, pygraphviz
  Building wheel for quadprog (PEP 517) ... done
  Created wheel for quadprog: filename=quadprog-0.1.10-cp38-cp38-linux_x86_64.whl size=473848 sha256=39bd78d3042c3ae05752835fed52bd8b6c61605b2f285410b0cfd7a7e95b10ae
  Stored in directory: /tmp/pip-ephem-wheel-cache-grm91jzx/wheels/1a/73/9e/643ca13f12dc01433ccd2e40c4167b8b0fdf69b53038ae80d0
  Building wheel for pygraphviz (setup.py) ... done
  Created wheel for pygraphviz: filename=pygraphviz-1.7-cp38-cp38-linux_x86_64.whl size=230529 sha256=88545541dc6d734ee0e5632813b7dbd8ab267b65a7f447cc0d539d50a24f9671
  Stored in directory: /tmp/pip-ephem-wheel-cache-grm91jzx/wheels/77/c6/8b/566e1c26d0f9d41093e28d8e1768f1412d54846e3296114feb
Successfully built quadprog pygraphviz
Installing collected packages: quadprog, pygraphviz, cello-classify
Successfully installed cello-classify-2.0.3 pygraphviz-1.7 quadprog-0.1.10


import os
import cello


adata = adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 8824 × 22010
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Doublet_score', 'Is_doublet', 'QC', 'leiden', 'S_score', 'G2M_score', 'phase', 'leiden_1.2', 'leiden_1.5', 'leiden_2.0', 'kmeans15', 'kmeans20', 'kmeans25', 'hclust15', 'hclust20', 'hclust25'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'paga', 'leiden_sizes', 'leiden_1.2_colors', 'leiden_1.5_colors', 'leiden_2.0_colors', 'kmeans15_colors', 'kmeans20_colors', 'kmeans25_colors', 'hclust15_colors', 'hclust20_colors', 'hclust25_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'


new_adata = sc.read_10x_mtx('../data/soupX_pbmc10k_filt', cache = True)
new_adata

... reading from cache file cache/..-data-soupX_pbmc10k_filt-matrix.h5ad

AnnData object with n_obs × n_vars = 10194 × 36601
    var: 'gene_ids', 'feature_types'


new_adata = new_adata[adata.obs_names,adata.var_names]
new_adata

View of AnnData object with n_obs × n_vars = 8824 × 22010
    var: 'gene_ids', 'feature_types'


sc.pp.normalize_total(new_adata, target_sum=1e6)
sc.pp.log1p(new_adata)
sc.pp.highly_variable_genes(new_adata, n_top_genes=10000)
new_adata

/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:138: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

AnnData object with n_obs × n_vars = 8824 × 22010
    var: 'gene_ids', 'feature_types', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'


sc.pp.pca(new_adata, n_comps=50, use_highly_variable=True)
sc.pp.neighbors(new_adata, n_neighbors=15)
sc.tl.leiden(new_adata, resolution=2.0)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:01:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Leiden clustering
    finished: found 26 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)


new_adata.X = new_adata.X.todense()


cello_resource_loc = os.getcwd()


! wget https://www.dropbox.com/s/x8n6jxa9ygw3s1w/10k_pbmc.model.dill

--2021-11-19 20:10:55--  https://www.dropbox.com/s/x8n6jxa9ygw3s1w/10k_pbmc.model.dill
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6020:18::a27d:4012
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/x8n6jxa9ygw3s1w/10k_pbmc.model.dill [following]
--2021-11-19 20:10:55--  https://www.dropbox.com/s/raw/x8n6jxa9ygw3s1w/10k_pbmc.model.dill
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com/cd/0/inline/BaTYnz0epf0zvlUjvirJQ91TUFQKN7QJ9pJIm2aSbdCGhthK5UwRjpP_oPqghFvPfdM5LjuugI-sZNZpIG-ZrHDkwlEE2GDMYwPLXoZRQlSaaUl25e5BTrKEb9XEhKC-k8558MIEh6fH3qbBB4KDoz5y/file# [following]
--2021-11-19 20:10:56--  https://uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com/cd/0/inline/BaTYnz0epf0zvlUjvirJQ91TUFQKN7QJ9pJIm2aSbdCGhthK5UwRjpP_oPqghFvPfdM5LjuugI-sZNZpIG-ZrHDkwlEE2GDMYwPLXoZRQlSaaUl25e5BTrKEb9XEhKC-k8558MIEh6fH3qbBB4KDoz5y/file
Resolving uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com (uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com)... 162.125.65.15, 2620:100:6020:15::a27d:400f
Connecting to uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com (uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com)|162.125.65.15|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /cd/0/inline2/BaShvVgLOGe7Ev-2jyvw4c8XfYyWpJotR2MRUTF9PDtvW-PgZdfje4RL1SbnpD4bD5Q2mzDJieHsxMnNRYJbUZgtS1SlP4zR1grtLgL4DT6wqWXQZNdgtDPZBCdD33i0ONyKtFhngQAQLmn7NtJD53T96YrIVQ7zkMSz--9jlJAhJ7YdAZubNeFxZqV21HkPWODoIkCxQCzkCkhWA-CoIxiFVU1GbZfNtyIONtmHEmifEVfB5sR4nQpvaNm5U6vFa3PAfqUhPdSwqbWheWvMCBbRNg1ANMl5WPbavAyxM8RC1zkoVXqhkt-jz82iWYOeGxmcVLWSjBEjbZk6c3SElXwWZS8f1A3tRbyFwcSedaZZteuQxdFv1SRdgocUrdsgAX8/file [following]
--2021-11-19 20:10:56--  https://uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com/cd/0/inline2/BaShvVgLOGe7Ev-2jyvw4c8XfYyWpJotR2MRUTF9PDtvW-PgZdfje4RL1SbnpD4bD5Q2mzDJieHsxMnNRYJbUZgtS1SlP4zR1grtLgL4DT6wqWXQZNdgtDPZBCdD33i0ONyKtFhngQAQLmn7NtJD53T96YrIVQ7zkMSz--9jlJAhJ7YdAZubNeFxZqV21HkPWODoIkCxQCzkCkhWA-CoIxiFVU1GbZfNtyIONtmHEmifEVfB5sR4nQpvaNm5U6vFa3PAfqUhPdSwqbWheWvMCBbRNg1ANMl5WPbavAyxM8RC1zkoVXqhkt-jz82iWYOeGxmcVLWSjBEjbZk6c3SElXwWZS8f1A3tRbyFwcSedaZZteuQxdFv1SRdgocUrdsgAX8/file
Reusing existing connection to uc5253187eac9ade511b213500c4.dl.dropboxusercontent.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 418052370 (399M) [application/octet-stream]
Saving to: ‘10k_pbmc.model.dill’

10k_pbmc.model.dill 100%[===================>] 398.69M  28.4MB/s    in 17s     

2021-11-19 20:11:14 (23.9 MB/s) - ‘10k_pbmc.model.dill’ saved [418052370/418052370]


model_prefix = "10k_pbmc"

cello.scanpy_cello(
    new_adata, 
    clust_key='leiden',
    rsrc_loc=cello_resource_loc,  
    model_file=f'{model_prefix}.model.dill'
)

Loading model from 10k_pbmc.model.dill...

/opt/conda/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator PCA from version 1.0.1 when using version 0.24.1. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator LogisticRegression from version 1.0.1 when using version 0.24.1. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(

        Could not find the CellO resources directory called
        'resources' in '/home/jovyan/notebooks/notebooks'. Will download resources to current 
        directory.
        
Running command: curl http://deweylab.biostat.wisc.edu/cell_type_classification/resources_v2.0.0.tar.gz > /home/jovyan/notebooks/notebooks/resources_v2.0.0.tar.gz
Running command: tar -C /home/jovyan/notebooks/notebooks -zxf resources_v2.0.0.tar.gz
Running command: rm /home/jovyan/notebooks/notebooks/resources_v2.0.0.tar.gz

Variable names are not unique. To make them unique, call `.var_names_make_unique`.

Transforming with PCA...
done.
Making predictions for each classifier...
Running solver on item 1/26...
Running solver on item 2/26...
Running solver on item 3/26...
Running solver on item 4/26...
Running solver on item 5/26...
Running solver on item 6/26...
Running solver on item 7/26...
Running solver on item 8/26...
Running solver on item 9/26...
Running solver on item 10/26...
Running solver on item 11/26...
Running solver on item 12/26...
Running solver on item 13/26...
Running solver on item 14/26...
Running solver on item 15/26...
Running solver on item 16/26...
Running solver on item 17/26...
Running solver on item 18/26...
Running solver on item 19/26...
Running solver on item 20/26...
Running solver on item 21/26...
Running solver on item 22/26...
Running solver on item 23/26...
Running solver on item 24/26...
Running solver on item 25/26...
Running solver on item 26/26...
Checking if any pre-trained model is compatible with this input dataset...

/opt/conda/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator PCA from version 0.22.2.post1 when using version 0.24.1. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator LogisticRegression from version 0.22.2.post1 when using version 0.24.1. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(

Of 22010 genes in the input file, 16803 were found in the training set of 58243 genes.
Of 22010 genes in the input file, 16115 were found in the training set of 31283 genes.
Using thresholds stored in /home/jovyan/notebooks/notebooks/resources/trained_models/ir.10x_genes_thresholds.tsv
Binarizing classifications...
Mapping each sample to its predicted labels...
Computing the most-specific predicted labels...
Loading ontology...
Item 0 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 1 predicted to be "central memory CD4-positive, alpha-beta T cell (CL:0000904)"
Item 10 predicted to be "naive B cell (CL:0000788)"
Item 11 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 12 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 13 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 14 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 15 predicted to be "natural killer cell (CL:0000623)"
Item 16 predicted to be "dendritic cell (CL:0000451)"
Item 17 predicted to be "central memory CD4-positive, alpha-beta T cell (CL:0000904)"
Item 18 predicted to be "natural killer cell (CL:0000623)"
Item 19 predicted to be "effector memory CD4-positive, alpha-beta T cell (CL:0000905)"
Item 2 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 20 predicted to be "CD4-positive, alpha-beta memory T cell (CL:0000897)"
Item 21 predicted to be "mononuclear cell (CL:0000842)"
Item 22 predicted to be "effector memory CD4-positive, alpha-beta T cell (CL:0000905)"
Item 23 predicted to be "mononuclear cell (CL:0000842)"
Item 24 predicted to be "plasmablast (CL:0000980)"
Item 25 predicted to be "platelet (CL:0000233)"
Item 3 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 4 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 5 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 6 predicted to be "naive thymus-derived CD4-positive, alpha-beta T cell (CL:0000895)"
Item 7 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 8 predicted to be "CD14-positive, CD16-negative classical monocyte (CL:0002057)"
Item 9 predicted to be "memory B cell (CL:0000787)"


sc.tl.umap(new_adata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:07)


fig = sc.pl.umap(new_adata, color='leiden', title='Clusters', return_fig=True)

... storing 'Most specific cell type' as categorical
... storing 'feature_types' as categorical


out_file = '10k_pbmc_clusters_res2.0.pdf' # <-- Name of the output file

fig.savefig(out_file, bbox_inches='tight', format='pdf')


import matplotlib as mpl

with mpl.rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(new_adata, color='Most specific cell type',palette="Paired")


sc.pl.umap(new_adata, color=["FCGR3A","FCER1A","LILRA4"])


with mpl.rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(new_adata, color='natural killer cell (probability)', vmin=0.0, vmax=1.0, return_fig=True)


with mpl.rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(new_adata, color='nucleate cell (probability)', vmin=0.0, vmax=1.0, return_fig=True)


with mpl.rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(new_adata, color='natural killer cell (binary)', vmin=0.0, vmax=1.0, return_fig=True)


fig, ax = cello.cello_probs(new_adata, '11', cello_resource_loc, 0.5, clust_key='leiden');

	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt	total_counts_rb	pct_counts_rb
AAACCCACATAACTCG-1	4734	22196.0	1171.0	5.275725	5564.0	25.067579
AAACCCACATGTAACC-1	2190	7629.0	674.0	8.834710	2583.0	33.857651
AAACCCAGTGAGTCAG-1	4246	21358.0	1342.0	6.283360	4117.0	19.276150
AAACCCAGTGCTTATG-1	342	857.0	269.0	31.388565	15.0	1.750292
AAACGAACAGTCAGTT-1	4074	15006.0	1188.0	7.916833	2249.0	14.987339

	Doublet_score	Is_doublet
AAACCCACATAACTCG-1	0.305424	True
AAACCCACATGTAACC-1	0.019761	False
AAACCCAGTGAGTCAG-1	0.031399	False
AAACCCAGTGCTTATG-1	0.027550	False
AAACGAACAGTCAGTT-1	0.368680	True

	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt	total_counts_rb	pct_counts_rb	Doublet_score	Is_doublet
TTTGTTGGTCCGTACG-1	3927	16565.0	1146.0	6.918201	2569.0	15.508602	0.029041	False
TTTGTTGGTCGGAACA-1	1650	6364.0	331.0	5.201131	2928.0	46.008797	0.019189	False
TTTGTTGGTTTACGAC-1	1387	4943.0	158.0	3.196440	2189.0	44.284847	0.109827	False
TTTGTTGTCGTTATCT-1	2601	8080.0	534.0	6.608911	1285.0	15.903465	0.040512	False
TTTGTTGTCTTTGCTA-1	3590	16095.0	1085.0	6.741224	2883.0	17.912395	0.054908	False

	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt	total_counts_rb	pct_counts_rb	Doublet_score	Is_doublet
AAACCCACATAACTCG-1	4734	22196.0	1171.0	5.275725	5564.0	25.067579	0.305424	True
AAACCCACATGTAACC-1	2190	7629.0	674.0	8.834710	2583.0	33.857651	0.019761	False
AAACCCAGTGAGTCAG-1	4246	21358.0	1342.0	6.283360	4117.0	19.276150	0.031399	False
AAACCCAGTGCTTATG-1	342	857.0	269.0	31.388565	15.0	1.750292	0.027550	False
AAACGAACAGTCAGTT-1	4074	15006.0	1188.0	7.916833	2249.0	14.987339	0.368680	True

	Is_doublet	n_genes_by_counts	pct_counts_mt	QC
AAACCCACATAACTCG-1	True	4734	5.275725	Doublet
AAACCCACATGTAACC-1	False	2190	8.834710	Pass
AAACCCAGTGAGTCAG-1	False	4246	6.283360	Pass
AAACCCAGTGCTTATG-1	False	342	31.388565	Low_nFeature
AAACGAACAGTCAGTT-1	True	4074	7.916833	Doublet
AAACGAACATTCGGGC-1	False	2284	7.763345	Pass
AAACGAAGTGATCATC-1	True	2167	6.348788	Doublet
AAACGAAGTTCGGGTC-1	False	2151	14.321314	Pass
AAACGAAGTTGAATCC-1	False	5133	8.643090	Pass
AAACGAATCCGTGGTG-1	True	3037	6.800667	Doublet

Scanpy tutorial using 10k PBMCs dataset¶

PART 1. Basic quality control and filtering.¶

PART 2. Normalization and dimensionality reduction.¶

PART 3. Normalization and careful clustering of the cleaned dataset.¶

PART 4. Differential expression and marker selection.¶

PART 5. Cell ontology-based automatic cell type prediction with `CellO`¶

	gene_ids	feature_types	n_cells	mt	rb	n_cells_by_counts	mean_counts	pct_dropout_by_counts	total_counts
AL627309.1	AL627309.1	Gene Expression	40	False	False	40	0.003924	99.607612	40.0
AL627309.5	AL627309.5	Gene Expression	522	False	False	522	0.056013	94.879341	571.0
AL627309.4	AL627309.4	Gene Expression	12	False	False	12	0.001177	99.882284	12.0
AL669831.2	AL669831.2	Gene Expression	7	False	False	7	0.000687	99.931332	7.0
LINC01409	LINC01409	Gene Expression	671	False	False	671	0.071807	93.417697	732.0

Marker	Cell type
LYZ	monocytes
MS4A1	B cells
CD8B	CD8 T cells
CCR7	CD4 naive T cells
IL32	CD4 memory T cells
NKG7	natural killer cells
LILRA4	plasmacytoid dendritic cells
FCER1A	myeloid dendritic cells
PPBP	platelets

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	RPL30	CD8B	IL32	S100A8	TNFSF10	LYZ	CCL5	MS4A1	IGHD	FCGR3A	GNLY	HLA-DPB1	TCF7	KLRB1	LILRA4	TXNDC5	TUBB1
1	RPS27	CD8A	LTB	S100A9	FGL2	CD36	NKG7	CD79A	IGHM	LST1	NKG7	FCER1A	RPL4	GZMK	PLD4	TNFRSF17	GP1BB
2	RPS13	RPS12	IL7R	S100A12	FTH1	VCAN	CTSW	BANK1	CD79A	AIF1	PRF1	HLA-DQA1	EEF1G	KLRG1	ITM2C	MZB1	PF4
3	RPL11	RPS3A	TRAC	VCAN	LAP3	S100A9	GZMA	CD79B	CD79B	COTL1	KLRD1	HLA-DRB1	BCL11B	GZMA	IRF8	SEC11C	PPBP
4	RPS15A	RPS6	CD2	MNDA	IFITM3	TYMP	CST7	CD37	CD74	SMIM25	KLRF1	HLA-DRA	LDHB	NKG7	UGCG	JCHAIN	NRGN

Scanpy tutorial using 10k PBMCs dataset¶

PART 1. Basic quality control and filtering.¶

PART 2. Normalization and dimensionality reduction.¶

PART 3. Normalization and careful clustering of the cleaned dataset.¶

PART 4. Differential expression and marker selection.¶

PART 5. Cell ontology-based automatic cell type prediction with CellO¶

PART 5. Cell ontology-based automatic cell type prediction with `CellO`¶