Scanpy是非常流行的scRNA-seq处理软件之一。本文档复制了Pachter实验室的分析示例及官方文档,请参照原文获取更多信息。
wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz
import anndata
import matplotlib
import matplotlib.pyplot as pyplot
import numpy as np
import pandas as pd
import scanpy as sc
import os
import warnings
adata = sc.read_10x_mtx('scRNAseq/HumanPbmc3k/filtered_gene_bc_matrices/hg19', var_names='gene_symbols')
adata.var_names_make_unique()
matplotlib.rcParams.update({'font.size': 16})
warnings.filterwarnings('ignore')
expected_num_cells = min(adata.obs.shape[0] - 1, 3000)
knee = np.sort((np.array(adata.X.sum(axis=1))).flatten())[::-1]
fig = pyplot.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.loglog(knee, range(len(knee)), linewidth=5, color="g")
ax.axvline(x=knee[expected_num_cells], linewidth=3, color="k")
ax.axhline(y=expected_num_cells, linewidth=3, color="k")
ax.set_xlabel("UMI Counts")
ax.set_ylabel("Set of Barcodes")
pyplot.grid(True, which="both")
fig = pyplot.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
x = np.asarray(adata.X.sum(axis=1))[:,0]
y = np.asarray(np.sum(adata.X>0, axis=1))[:,0]
ax.scatter(x, y, color="green", alpha=0.25)
ax.set_xlabel("UMI Counts")
ax.set_ylabel("Genes Detected")
ax.set_xscale('log')
ax.set_yscale('log', nonposy='clip')
ax.set_xlim((0.5, 10000))
ax.set_ylim((0.5, 10000))
pyplot.grid(True, which="both")
sc.set_figure_params(dpi_save=300, fontsize = 14)
sc.pl.highest_expr_genes(adata, n_top=20, )
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
adata.obs
sc.pl.violin(adata, keys = ['pct_counts_mt'], multi_panel=False, rotation=60)
sc.pp.filter_cells(adata, min_genes = 200)
sc.pp.filter_genes(adata, min_cells = 3)
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs['pct_counts_mt'] < 5, :]
adata
sc.pl.violin(adata, ['pct_counts_mt'], jitter=0.4, multi_panel=False, rotation=60)
adata.counts = adata.X
sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True)
sc.pl.pca(adata, color='CST3')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'])
sc.tl.leiden(adata)
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
sc.tl.rank_genes_groups(adata, 'leiden', method='logreg')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')
sc.pl.rank_genes_groups_heatmap(adata, n_genes=3, show_gene_labels=True, cmap='bwr')
sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False)
marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']
sc.pl.dotplot(adata, marker_genes, groupby='leiden', dendrogram = False)
old_to_new = {
'0': 'CD4 T',
'1': 'CD4 T',
'2': 'CD14 Monocytes',
'3': 'B',
'4': 'CD8 T',
'5': 'FCGR3A Monocytes',
'6': 'NK',
'7': 'Dendritic',
'8': 'Megakaryocytes'}
adata.obs['new_clusters'] = adata.obs['leiden'].map(old_to_new).astype('category')
sc.pl.umap(adata, color='new_clusters', legend_loc='on data', title='', frameon=False)
sc.pl.dotplot(adata, marker_genes, groupby='new_clusters', dendrogram = True)
adata
adata.write('scRNAseq/HumanPbmc3k/HumanPbmc3k.h5ad', compression='gzip')