Skip to content

Commit

Permalink
export onestop function
Browse files Browse the repository at this point in the history
  • Loading branch information
lmw123 committed Dec 17, 2021
0 parents commit e485a65
Show file tree
Hide file tree
Showing 15 changed files with 744 additions and 0 deletions.
26 changes: 26 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Package: FastIntegration
Type: Package
Title: Fast Integration of Single-cell Data
Version: 0.1.0
Author: c(
person(given = "Mengwei", family = "Li", email = "[email protected]", role = "aut"),
person(given = "Xiaomeng", family = "Zhang", email = "[email protected]", role = "aut"),
person(given = "Jinmiao", family = "Chen", email = "[email protected]", role = "aut")
)
Description: FastIntegrate integrates thousands of scRNA-seq datasets and outputs batch-corrected values for downstream analysis
License: GPL-3
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
Depends:
R (>= 4.0.0),
Seurat (>= 4.0.0),
SeuratObject (>= 4.0.0),
data.table
Imports:
Matrix,
tictoc,
dplyr,
parallel,
pbmcapply
LinkingTo: Rcpp (>= 0.11.0), RcppEigen
20 changes: 20 additions & 0 deletions FastIntegration.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
18 changes: 18 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by roxygen2: do not edit by hand

export(BuildIntegrationFile)
export(FastFindAnchors)
export(FastIntegration)
export(OneStopIntegration)
import(Matrix)
import(Rcpp)
import(RcppEigen)
import(tictoc)
importFrom(dplyr,`%>%`)
importFrom(dplyr,arrange)
importFrom(dplyr,desc)
importFrom(dplyr,group_by)
importFrom(dplyr,n)
importFrom(dplyr,slice)
importFrom(dplyr,summarise)
useDynLib(FastIntegration)
266 changes: 266 additions & 0 deletions R/FastFindAnchors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
#' @import Matrix
#' @importFrom dplyr summarise
#' @importFrom dplyr n
#' @export
FastFindAnchors = function(
tmp.dir = NULL,
nCores = NULL,
k.filter = 100,
k.anchor = 5
) {

nCores = nCores %||% parallel::detectCores()

object.ncells = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/others/object_ncells.rds"))
offsets = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/others/offsets.rds"))
features = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/others/features.rds"))
nSample = length(object.ncells)

combinations = expand.grid(1:nSample, 1:nSample)
combinations = combinations[combinations$Var1 < combinations$Var2, , drop = FALSE]

message("Finding all pairwise anchors")

if (nSample > 100) {
rna.list = pbmcapply::pbmclapply(
1:nSample, function(i) {
rna = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", i,".rds"))
rna = rna@assays$RNA@data[features,]
return(rna)
}
)

ncell = c()
for (i in 1:length(rna.list)) {
ncell = append(ncell, ncol(rna.list[[i]]))
}

rna.average = pbmcapply::pbmclapply(
rna.list, function(rna) {
rna = rowMeans(rna)
return(rna)
}, mc.cores = 20
)
rna.average = do.call(cbind, rna.average)
rna.cor = cor(rna.average)

sample.tree = Seurat:::BuildSampleTree(rna.cor)

paired = c(1, 2)
for (j in 1:nrow(sample.tree)) {
idx = Seurat:::ParseMergePair(sample.tree, j)
if (sum(ncell[idx$object2]) > sum(ncell[idx$object1])) {
tmp = idx$object2
idx$object2 = idx$object1
idx$object1 = tmp
}
p = lapply(idx$object2, function(n) {
a = idx$object1[which(rank(-rna.cor[n, idx$object1]) <= 5)]
return(cbind(n, a))
})
paired = rbind(paired, do.call(rbind, p))
}
paired = paired[-1,]
paired = data.frame(paired)
colnames(paired) = c("Var1", "Var2")
combinations = paired[!duplicated(paired),]
}

all.anchors = pbmcapply::pbmclapply(
X = 1:nrow(x = combinations),
FUN = function(row) {
FindAnchorsPair(
combinations = combinations,
row = row,
offsets = offsets,
k.filter = k.filter,
k.anchor = k.anchor,
tmp.dir = tmp.dir,
features = features
)
},
mc.cores = nCores
)

all.anchors = do.call(what = 'rbind', args = all.anchors)
all.anchors = rbind(all.anchors, all.anchors[, c(2, 1, 3)])
# AddDatasetID = getFromNamespace("AddDatasetID", "Seurat")
all.anchors = Seurat:::AddDatasetID(anchor.df = all.anchors, offsets = offsets, obj.lengths = object.ncells)
all.anchors = data.table::as.data.table(all.anchors)
data.table::setindex(all.anchors, dataset2)
data.table::setindex(all.anchors, dataset1)

message("Merging data")
if (nSample <= 100) {
anchor.group <- all.anchors %>% group_by(dataset1, dataset2) %>% summarise(n = n())
similarity.matrix = matrix(data = 0, ncol = nSample, nrow = nSample)
similarity.matrix[lower.tri(similarity.matrix, diag=FALSE) | upper.tri(similarity.matrix, diag=FALSE)] = anchor.group$n


ncell.a = matrix(data = NA, ncol = nSample, nrow = nSample)
for (i in 1:nSample) {
ncell.a[i,] = object.ncells[i]
}

ncell.b = matrix(data = NA, ncol = nSample, nrow = nSample)
for (i in 1:nSample) {
ncell.b[,i] = object.ncells[i]
}
ncel.min = pmin(ncell.a, ncell.b)
similarity.matrix = similarity.matrix/ncel.min
similarity.matrix[upper.tri(x = similarity.matrix, diag = TRUE)] = NA
# BuildSampleTree = getFromNamespace("BuildSampleTree", "Seurat")
sample.tree = Seurat:::BuildSampleTree(similarity.matrix)

rna.bind = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/1.rds"))
rna.bind = rna.bind@assays$RNA@data[features,]
for (i in 2:nSample) {
rna = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", i,".rds"))
rna.bind = cbind(rna.bind, rna@assays$RNA@data[features,])
}
} else {
n = ceiling(nSample/100)
rna.list = pbmcapply::pbmclapply(
1:n, function(i) {
start = (i-1)*100 + 1
end = min(nSample, i*100)
rna.bind = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", start,".rds"))
rna.bind = rna.bind@assays$RNA@data[features,]
for (j in (start+1):end) {
rna = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", j,".rds"))
rna.bind = cbind(rna.bind, rna@assays$RNA@data[features,])
}
return(rna.bind)
}, mc.cores = n
)

rna.bind = rna.list[[1]]
for (i in 2:n) {
rna.bind = cbind(rna.bind, rna.list[[i]])
}
}

rna.bind = ScaleData(rna.bind, verbose = F)
pca = RunPCA(rna.bind, verbose = F)

saveRDS(pca, paste0(tmp.dir, "/FastIntegrationTmp/others/raw_pca.rds"), compress = F)
saveRDS(sample.tree, paste0(tmp.dir, "/FastIntegrationTmp/others/sample_tree.rds"), compress = F)
saveRDS(all.anchors, paste0(tmp.dir, "/FastIntegrationTmp/anchors/anchors.rds"), compress = F)
return(NULL)
}

FindAnchorsPair = function(
tmp.dir = NULL,
combinations,
row,
offsets,
k.filter,
k.anchor,
features
) {

i = combinations[row, 1]
j = combinations[row, 2]

object.list.1 = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", i, ".rds"))
object.list.2 = readRDS(paste0(tmp.dir, "/FastIntegrationTmp/raw/", j, ".rds"))

object.list.1@assays$RNA@counts = object.list.1@assays$RNA@data
object.list.2@assays$RNA@counts = object.list.2@assays$RNA@data

features = c(Seurat::VariableFeatures(object.list.1),Seurat::VariableFeatures(object.list.2))
features = names(sort(-table(features)))[1:2000]
object.1 = subset(x = object.list.1, features = features)
object.2 = subset(x = object.list.2, features = features)

ncell1 = ncol(object.1)
ncell2 = ncol(object.2)

object.1 = Seurat::ScaleData(object.1, features = features)
object.2 = Seurat::ScaleData(object.2, features = features)

object.1[["ToIntegrate"]] = object.1[["RNA"]]
DefaultAssay(object = object.1) = "ToIntegrate"

object.2[["ToIntegrate"]] = object.2[["RNA"]]
DefaultAssay(object = object.2) = "ToIntegrate"


object.pair = Seurat::RunCCA(
object1 = object.1,
object2 = object.2,
assay1 = "ToIntegrate",
assay2 = "ToIntegrate",
features = features,
renormalize = FALSE,
rescale = FALSE,
num.cc = 30
)

object.pair = Seurat::L2Dim(object = object.pair, reduction = "cca")

anchors = Seurat:::FindAnchors(
object.pair = object.pair,
assay = c("ToIntegrate", "ToIntegrate"),
slot = "data",
cells1 = colnames(x = object.1),
cells2 = colnames(x = object.2),
internal.neighbors = list(NULL, NULL),
reduction = "cca.l2",
nn.reduction = "cca",
k.filter = k.filter,
k.anchor = k.anchor,
nn.method = "rann",
dims = 1:30
)
anchors[, 1] = anchors[, 1] + offsets[i]
anchors[, 2] = anchors[, 2] + offsets[j]
return(anchors)
}

#' @export
BuildIntegrationFile = function(
rna.list,
tmp.dir = NULL,
nCores = NULL,
nfeatures = 2000
) {
message("Checking the rna list")
rna.list = SeuratObject:::CheckDuplicateCellNames(rna.list)
nCores = nCores %||% parallel::detectCores()

if (is.null(tmp.dir)) {
message("Not specify the temporary directory, will create one in current directory")
tmp.dir = "./"
}

if(file.exists(tmp.dir) == F) {
stop("The temporary directory not exist")
} else {
dir.create(paste0(tmp.dir, "/FastIntegrationTmp"))
dir.create(paste0(tmp.dir, "/FastIntegrationTmp/raw"))
dir.create(paste0(tmp.dir, "/FastIntegrationTmp/anchors"))
dir.create(paste0(tmp.dir, "/FastIntegrationTmp/others"))
dir.create(paste0(tmp.dir, "/FastIntegrationTmp/inte"))
}

message("Splitting and saving rna list in tmp dir")
pbmcapply::pbmclapply(
1:length(rna.list), function(i) {
rna = rna.list[[i]]
rna@assays$RNA@counts = matrix(0)
saveRDS(rna, paste0(tmp.dir, "/FastIntegrationTmp/raw/", i, ".rds"), compress = F)
return(NULL)
}, mc.cores = nCores
)

features = Seurat::SelectIntegrationFeatures(rna.list, nfeatures = nfeatures, verbose = F)
object.ncells = sapply(X = rna.list, FUN = function(x) dim(x = x)[2])
offsets = as.vector(x = cumsum(x = c(0, object.ncells)))[1:length(x = rna.list)]

saveRDS(features, paste0(tmp.dir, "/FastIntegrationTmp/others/features.rds"))
saveRDS(object.ncells, paste0(tmp.dir, "/FastIntegrationTmp/others/object_ncells.rds"))
saveRDS(offsets, paste0(tmp.dir, "/FastIntegrationTmp/others/offsets.rds"))
return(NULL)
}

Loading

0 comments on commit e485a65

Please sign in to comment.