Leveraging cell-cell similarity from gene expression data for high-performance spatial and temporal cellular mappings.
Project description
ccsf
Leveraging cell-cell similarity for high-performance spatial and temporal cellular mappings from gene expression data (Cell Patterns, 2023)
CCSF is a replacement for PCA in the analysis of gene expression and other tabular data.
CCSF is a cell-cell similarity-driven framework of genomic data analysis for high-fidelity dimensionality reduction, clustering, visualization, and spatial and temporal cellular mappings. The approach exploits the similarity features of the cells for the discovery of discriminative patterns in the data. For a wide variety of datasets, the proposed approach drastically improves the accuracies of visualization and spatial and temporal mapping analyses as compared to PCA and state-of-the-art techniques. Computationally, the method is about 15 times faster than the existing ones and thus provides an urgently needed technique for reliable and efficient analysis of genomic data.
Installation
The simplest way to install ccsf is to use
pip install ccsf scikit-learn==1.1.1
Sample data
To run the example code below, you will need to download the data files from here.
Example codes
Example 1 - Learning the best metric and number of data classes
# Import all the necessary Python packages
from ccsf import CCSF
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from scipy import stats
import numpy as np
import novosparc
import random
rngState = 0  # Seed shared by Python's `random` module and CCSF, for reproducibility

# Load the TCGA data (comma-separated, no header row).
# `delim_whitespace=False` is omitted: it is the default behaviour and the
# keyword is deprecated in pandas >= 2.2.
data = pd.read_csv('tcga_data.csv', header=None).to_numpy()
label = pd.read_csv('tcga_label.csv', header=None).to_numpy()  # not used in this example

# Learn the distance metric on a random subset of at most 2000 points.
# Capping at the number of rows keeps `random.sample` from raising
# ValueError on data sets with fewer than 2000 points.
n = min(2000, data.shape[0])
manifolder = CCSF(random_state=rngState)
random.seed(a=rngState)
sampleix = random.sample(range(data.shape[0]), n)

# Standardise every column of the sampled rows (sample standard deviation, ddof=1)
dataSampled = stats.zscore(data[sampleix], axis=0, ddof=1)

metric, numClass = manifolder.metric_learning(dataSampled, verbose=False)
print('metric for cMAP is:', metric)
# You are supposed to see 'correlation' as the result.
Example 2 - Dimensionality reduction, visualization and clustering by CCSF
# Import all the necessary Python packages
from ccsf import CCSF
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from scipy import stats
import numpy as np
import novosparc
import random
rngState = 0  # Seed shared by Python's `random` module and CCSF, for reproducibility

# Load the TCGA data (comma-separated, no header row).
# `delim_whitespace=False` is omitted: it is the default behaviour and the
# keyword is deprecated in pandas >= 2.2.
data = pd.read_csv('tcga_data.csv', header=None).to_numpy()
label = pd.read_csv('tcga_label.csv', header=None).to_numpy()
# NOTE(review): the embedding below is computed on the raw data, while only
# the sampled subset is z-scored -- confirm this is intended.
feed_data = {'data': data}

# Draw a reproducible random subset (at most 2000 points) for optional metric learning.
manifolder = CCSF(random_state=rngState)
n = min(2000, data.shape[0])  # cap so random.sample cannot raise on small data sets
random.seed(a=rngState)  # without seeding, the sample differs on every run
sampleix = random.sample(range(data.shape[0]), n)
# Standardise every column of the sampled rows (sample standard deviation, ddof=1)
dataSampled = stats.zscore(data[sampleix], axis=0, ddof=1)

metric = 'correlation'
numClass = 33  # Choice of the numClass is similar to choosing the number of principal components in PCA
# If you want to learn the metric and number of data classes, please use the following line
# metric, numClass = manifolder.metric_learning(dataSampled, verbose=False)

# The number of CDM components to be used to compute the cMAP
num_comp = np.array([numClass - 1])
manifolder = CCSF(n_clusters=numClass, num_comp=num_comp, metric=metric,
                  random_state=rngState)
embedding_CMAP = manifolder.cMap(data=feed_data)
embedding = embedding_CMAP[0]  # first element of the cMap result holds the coordinates plotted below

# Scatter plot of the first two cMAP coordinates, coloured by label
plt.figure()
plt.title('cMAP visualization ')
plt.scatter(embedding[:, 0], embedding[:, 1], c=label.T, s=0.5)
plt.xlabel("cMAP1")
plt.ylabel("cMAP2")
plt.show()
Example 3 - Temporal mapping by CCSF
# Import all the necessary Python packages
from ccsf import CCSF
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from scipy import stats
import numpy as np
import novosparc
import random
rngState = 0  # Seed shared by Python's `random` module and CCSF, for reproducibility

# Load the organoid data (comma-separated, no header row).
# `delim_whitespace=False` is omitted: it is the default behaviour and the
# keyword is deprecated in pandas >= 2.2.
data = pd.read_csv('organoidData.csv', header=None).to_numpy()
label = pd.read_csv('organoidDataLabel.csv', header=None).to_numpy()

# Learning the metric for cPHATE: draw a reproducible random subset of at most 2000 points
n = min(2000, data.shape[0])  # cap so random.sample cannot raise on small data sets
manifolder = CCSF(random_state=rngState)
random.seed(a=rngState)
sampleix = random.sample(range(data.shape[0]), n)
# Standardise every column of the sampled rows (sample standard deviation, ddof=1)
dataSampled = stats.zscore(data[sampleix], axis=0, ddof=1)

metric = 'correlation'
numClass = 33  # Choice of the numClass is similar to choosing the number of principal components in PCA
# If you want to learn the metric and number of data classes, please use the following line
# metric, numClass = manifolder.metric_learning(dataSampled, verbose=False)

# Standardise the full data set before computing the embedding
data = stats.zscore(data, axis=0, ddof=1)
num_comp = np.array([numClass - 1])  # Number of CCSF components
manifolder = CCSF(n_clusters=numClass, num_comp=num_comp, metric=metric,
                  random_state=rngState)

# cPHATE
feed_data = {'data': data}
embedding_CPHATE = manifolder.cPHATE(data=feed_data)
embedding = embedding_CPHATE[0]  # first element of the cPHATE result holds the coordinates plotted below

# Scatter plot of the first two cPHATE coordinates, coloured by label.
# The title is derived from numClass so it stays correct if numClass changes.
plt.figure()
plt.title(f'cPHATE for {numClass - 1} CCSF components ')
plt.scatter(embedding[:, 0], embedding[:, 1], c=label.T, s=0.5)
plt.xlabel("cPHATE1")
plt.ylabel("cPHATE2")
plt.show()
Example 4 - Spatial mapping by CCSF
# Import all the necessary Python packages
from ccsf import CCSF
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from scipy import stats
import numpy as np
import novosparc
import random
rngState = 0  # Seed shared by Python's `random` module and CCSF, for reproducibility

# Read the BDTNP database: the first row of dge.txt holds the 84 gene names,
# the remaining rows hold the expression values.
print('Loading data ... ', end='', flush=True)
gene_names = np.genfromtxt('dge.txt', usecols=range(84), dtype='str', max_rows=1)
dge = np.loadtxt('dge.txt', usecols=range(84), skiprows=1)
# Optional: downsample number of cells
cells_selected, dge = novosparc.pp.subsample_dge(dge, 3039, 3040)
print('done')  # terminate the progress line started above

data = dge
feed_data = {'data': data}

# Learning the metric for cSPARC: draw a reproducible random subset of at most 2000 cells
n = min(2000, data.shape[0])  # cap so random.sample cannot raise on small data sets
random.seed(a=rngState)
sampleix = random.sample(range(data.shape[0]), n)
manifolder = CCSF(random_state=rngState)
# Standardise every column of the sampled rows (sample standard deviation, ddof=1)
dataSampled = stats.zscore(data[sampleix], axis=0, ddof=1)

metric = 'correlation'
numClass = 7  # Choice of the numClass is similar to choosing the number of principal components in PCA
# If you want to learn the metric and number of data classes, use the following line
# metric, numClass = manifolder.metric_learning(dataSampled, verbose=False)

# Read and use the bdtnp geometry
print('Reading the target space ... ', end='', flush=True)
locations = np.loadtxt('geometry.txt', usecols=range(3), skiprows=1)
locations = locations[:, [0, 2]]  # keep the first and third coordinate columns
locations = locations[cells_selected, :]  # downsample to the cells selected above
print('done')  # terminate the progress line started above

# Compute the spatial maps of the genes.
# NOTE(review): num_comp is passed as a plain int here, whereas the other
# examples pass np.array([numClass - 1]) -- confirm which form cSpaRc expects.
manifolder = CCSF(n_clusters=numClass, num_comp=numClass - 1, metric=metric,
                  random_state=rngState)
embedding_CSPARC = manifolder.cSpaRc(data=feed_data, locations=locations)

gene = 'ftz'
# gene = 'sna'  # Try another gene in place of ftz

# Locate the gene by its position in the header row; fail with a clear message
# instead of passing an empty colour array to the plot.
gene_rows = np.argwhere(gene_names == gene)
if gene_rows.size == 0:
    raise ValueError(f"gene {gene!r} not found in the header of dge.txt")
d = embedding_CSPARC[gene_rows, :].flatten()

# Plot the reconstructed expression of the chosen gene over the cell locations.
# The title follows `gene`, so it stays correct when another gene is selected.
plt.figure()
plt.title(f'cSpaRc spatial reconstruction of {gene} gene')
plt.scatter(locations[:, 0], locations[:, 1], c=d)
plt.show()
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
File details
Details for the file ccsf-1.0.8.tar.gz
.
File metadata
- Download URL: ccsf-1.0.8.tar.gz
- Upload date:
- Size: 24.3 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.11.4
File hashes
Algorithm | Hash digest | |
---|---|---|
SHA256 | 732816c89d9f3635644571c63105f27fa249abf8620ae03abe79037252fce4ce |
|
MD5 | f040bc921ec86d88b5f6c8e1950d2867 |
|
BLAKE2b-256 | ae360d7e134547a28e290c65e6a12c052ed1a220255592ad201bcc0658226749 |
File details
Details for the file ccsf-1.0.8-py3-none-any.whl
.
File metadata
- Download URL: ccsf-1.0.8-py3-none-any.whl
- Upload date:
- Size: 23.6 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.11.4
File hashes
Algorithm | Hash digest | |
---|---|---|
SHA256 | cddf61d2f28f619b2b205c0c43b04663cf1e628595ec9d0d685d367b3036649d |
|
MD5 | 84e7d05eec04414b2eadb8ddd65409b0 |
|
BLAKE2b-256 | 8d77c73e00ec73406c9b632e10116047183359f0ca1df1b38a7c309d697e5015 |