niklasmueboe · niklasmueboe · Jun 16, 2024 · Jun 15, 2024 · Jun 16, 2024
diff --git a/Project.toml b/Project.toml
@@ -8,7 +8,6 @@ BlockArrays       = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 CSV               = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames        = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-DimensionalData   = "0703355e-b756-11e9-17c0-8b28908087d0"
 ImageFiltering    = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
 LinearAlgebra     = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays      = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -33,7 +32,6 @@ BlockArrays       = "0.16,1"
 CSV               = "0.10"
 CategoricalArrays = "0.9,0.10"
 DataFrames        = "1"
-DimensionalData   = "0.25"
 ImageFiltering    = "0.7"
 LinearAlgebra     = "1"
 Muon              = "0.2"

diff --git a/docs/make.jl b/docs/make.jl
@@ -4,7 +4,8 @@ using StereoSSAM
 
 links = InterLinks(
     # "CategoricalArrays" => "https://categoricalarrays.juliadata.org/stable/",
-    # "DimensionalData" => "https://rafaqz.github.io/DimensionalData.jl/dev/",
+    # "DataFrames" => "https://dataframes.juliadata.org/stable/",
+    # "OffsetArrays" => "https://juliaarrays.github.io/OffsetArrays.jl/stable/",
     "Muon" => "https://scverse.org/Muon.jl/dev/",
     "SparseArrays" => (
         "https://docs.julialang.org/en/v1/stdlib/SparseArrays/",

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -12,6 +12,7 @@ Pages = ["api.md"]
 
 ```@autodocs
 Modules = [
+    StereoSSAM.GridCount,
     StereoSSAM.IO,
     StereoSSAM.Utils,
     StereoSSAM.KDE,

diff --git a/docs/src/examples/ExampleAnalysis.jl b/docs/src/examples/ExampleAnalysis.jl
@@ -26,29 +26,28 @@ color_file = joinpath(data_path, "celltype_colors.json");
 
 We read the data from file using [`readstereoseq`](@ref). The input will 
 usually be a GEM file of a StereoSeq experiment. The output is a 
-`DimensionalData.DimArray` of gene counts stored as 
+[`GridCounts`](@ref) of gene counts stored as 
 [`SparseArrays.SparseMatrixCSC`](@extref). The input data stems from the
 original [Stereo-seq publication](https://doi.org/10.1016/j.cell.2022.04.003) and can
 be downloaded [here](https://db.cngb.org/stomics/mosta/download/).
 =#
 
-counts = readstereoseq(stereoseq_file);
+counts = readstereoseq(stereoseq_file)
 
 #=
 We also load pre-defined cell-type signatures
 =#
 
 using CSV
 using DataFrames
-using DimensionalData
 
 ## read pre-determined cell-type signatures
 signatures = CSV.read(signature_file, DataFrame);
 celltypes = signatures.Celltype;
 select!(signatures, Not(:Celltype));
 
 ## remove genes that are not detected in Stereo-seq experiment
-signatures = signatures[:, names(signatures) .∈ [dims(counts, 1)]];
+signatures = signatures[:, names(signatures) .∈ [keys(counts)]];
 
 print(signatures[1:5, 1:8])
 
@@ -104,7 +103,7 @@ as `Muon.AnnData` object.
 
 using Muon
 
-ad = getlocalmaxima(AnnData, counts, lm, kernel; genes=names(signatures))
+adata = getlocalmaxima(AnnData, counts, lm, kernel; genes=names(signatures))
 
 #=
 ### Cell-type map
@@ -188,4 +187,4 @@ highly variable genes that can be used for analysis. We again here demonstrate i
 `Muon.AnnData`.
 =#
 
-ad = StereoSSAM.IO.readstereoseqbinned(AnnData, stereoseq_file, 50)
+adata = readstereoseqbinned(AnnData, stereoseq_file, 50)
diff --git a/docs/src/examples/ExampleAnalysis.md b/docs/src/examples/ExampleAnalysis.md
diff --git a/docs/src/examples/Project.toml b/docs/src/examples/Project.toml
@@ -2,7 +2,6 @@
 CSV                  = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Colors               = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataFrames           = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-DimensionalData      = "0703355e-b756-11e9-17c0-8b28908087d0"
 ImageIO              = "82e4d734-157c-48bb-816b-45c225c6df19"
 ImageMagick          = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageShow            = "4e3cecfd-b093-5904-9786-8bbb286a6a31"

diff --git a/src/GridCount.jl b/src/GridCount.jl
@@ -0,0 +1,160 @@
+module GridCount
+
+export GridCounts, crop!, mask!, gridsize
+
+using Base.Threads: @threads
+using DataFrames: DataFrame, groupby, transform!
+using SparseArrays: SparseMatrixCSC, dropzeros!, findnz, nonzeros, sparse
+
+"""
+    AbstractGridCounts
+
+A stack of genes each consisting of a 2D count array of the same size.
+"""
+abstract type AbstractGridCounts end
+
+"""
+    GridCounts{G,T<:Number} <: AbstractGridCounts
+
+A stack of genes`::G` each consisting of a 2D count`::T` array of the same size.
+"""
+mutable struct GridCounts{G,T<:Number} <: AbstractGridCounts
+    counts::Dict{G,SparseMatrixCSC{T}}
+    shape::Tuple{Int,Int}
+    @doc """
+        GridCounts(counts::Dict)
+
+    All values of the dict must be arrays of the same size.
+    """
+    function GridCounts{G,T}(counts::Dict{G,SparseMatrixCSC{T}}) where {G,T<:Number}
+        shape = size(first(values(counts)))
+        for (_, c) in counts
+            if size(c) != shape
+                throw(DimensionMismatch(counts, "all `counts` must have the same size"))
+            end
+        end
+        return new{G,T}(counts, shape)
+    end
+end
+
+function GridCounts(counts::Dict{G,SparseMatrixCSC{T}}) where {G,T<:Number}
+    return GridCounts{G,T}(counts)
+end
+
+"""
+    GridCounts(df::DataFrame)
+
+Construct from a DataFrame with columns 'x' (`Integer`), 'y' (`Integer`), 'count', and 
+'geneID' (`Pool`).
+"""
+function GridCounts(df::DataFrame)
+    transform!(df, [:x, :y] .=> (x -> x .- (minimum(x) - one(eltype(x)))) .=> [:x, :y])
+
+    rows = maximum(df.x)
+    cols = maximum(df.y)
+
+    n = length(df.geneID.pool)
+    genes = Vector{eltype(df.geneID)}(undef, n)
+    counts = Vector{SparseMatrixCSC{eltype(df.count)}}(undef, n)
+    @threads for (i, (key, subdf)) in collect(enumerate(pairs(groupby(df, :geneID))))
+        genes[i] = key.geneID
+        counts[i] = sparse(subdf.x, subdf.y, subdf.count, rows, cols)
+    end
+
+    return GridCounts(Dict(zip(genes, counts)))
+end
+
+"""
+    GridCounts(df::DataFrame, binsize::Integer)
+
+Construct from a DataFrame with columns 'x' (`Real`), 'y' (`Real`), 'count', and 
+'geneID' (`Pool`) binning the data by `binsize`.
+"""
+function GridCounts(df::DataFrame, binsize::Integer)
+    transform!(df, [:x, :y] .=> (x -> x .- minimum(x)) .=> [:x, :y])
+    transform!(df, @. [:x, :y] => (x -> div(x, binsize) + 1) => [:x, :y])
+
+    return GridCounts(df)
+end
+
+Base.iterate(iter::GridCounts) = iterate(iter.counts)
+Base.iterate(iter::GridCounts, state) = iterate(iter.counts, state)
+Base.length(collection::GridCounts) = length(collection.counts)
+Base.eltype(type::GridCounts) = eltype(type.counts)
+Base.haskey(collection::GridCounts, key) = haskey(collection.counts, key)
+Base.getindex(collection::GridCounts, key) = getindex(collection.counts, key)
+Base.get(collection::GridCounts, key, default) = get(collection.counts, key, default)
+Base.delete!(collection::GridCounts, key) = delete!(collection.counts, key)
+Base.pop!(collection::GridCounts) = pop!(collection.counts)
+Base.keys(iterator::GridCounts) = keys(iterator.counts)
+Base.values(iterator::GridCounts) = values(iterator.counts)
+Base.pairs(collection::GridCounts) = pairs(collection.counts)
+Base.keytype(type::GridCounts) = keytype(type.counts)
+Base.valtype(type::GridCounts) = valtype(type.counts)
+
+function Base.setindex!(collection::GridCounts, value, key)
+    if size(value) == collection.shape
+        setindex!(collection.counts, value, key)
+    else
+        throw(DimensionMismatch(value, "all `counts` must have the same size"))
+    end
+end
+
+function Base.show(io::IO, x::GridCounts)
+    type = "GridCounts{" * string(keytype(x)) * ", " * string(eltype(valtype(x))) * "}"
+    return print(io, type, " ", x.shape, " with ", length(x), " genes")
+end
+# Base.show(io::IO, m::MIME"text/plain", x::GridCounts) = print(io, x)
+
+"""
+    gridsize(counts)
+
+The size of the grid of each layer.
+"""
+gridsize(counts) = counts.shape
+
+"""
+    crop!(counts, slice)
+
+Crop each gene layer in counts by indexing with `slice`.
+"""
+function crop!(counts, slice)
+    for (g, c) in counts
+        counts.counts[g] = c[slice...]
+    end
+    counts.shape = size(first(values(counts)))
+    return nothing
+end
+
+"""
+    mask!(counts, mask::AbstractMatrix{Bool})
+
+Remove all counts in each gene layer for which `mask` is `false`.
+"""
+
+function mask!(counts::GridCounts{<:Any,T}, mask::AbstractMatrix{Bool}) where {T<:Any}
+    inv_mask = .!mask
+    z = zero(T)
+    @threads for c in collect(values(counts))
+        x, y, _ = findnz(c)
+        nonzeros(c)[inv_mask[[CartesianIndex(i) for i in zip(x, y)]]] .= z
+        dropzeros!(c)
+    end
+end
+
+"""
+    totalrna(counts)
+
+Caclulate the totalrna as sum of all genes for each pixel.
+"""
+function totalrna(counts::GridCounts)
+    n = length(counts)
+    x, y, v = Vector{Vector}(undef, n), Vector{Vector}(undef, n), Vector{Vector}(undef, n)
+
+    @threads for (i, c) in collect(enumerate(values(counts)))
+        x[i], y[i], v[i] = findnz(c)
+    end
+    return sparse((reduce(vcat, i) for i in (x, y, v))..., gridsize(counts)...)
+end
+
+end # module GridCount
diff --git a/src/IO.jl b/src/IO.jl
@@ -1,17 +1,22 @@
 module IO
 
-export readstereoseq, readstereoseqbinned
+export readstereoseq, readstereoseqbinned, readGEMfile
 
 import ..Utils: categoricalcoordinates, stringcoordinates
 
+using ..GridCount: GridCounts
+
 using Base.Broadcast: @__dot__
-using Base.Threads: @threads
 using CSV: read as readcsv
-using DataFrames: DataFrame, Not, groupby, rename!, select!, transform!
-using DimensionalData: DimArray, Dim
-using SparseArrays: SparseMatrixCSC, sparse
+using DataFrames: DataFrame, Not, rename!, select!, transform!
+using SparseArrays: sparse
+
+"""
+    readGEMfile(file)
 
-function loadstereoseqfile(file)
+Read `file` in GEM format as `DataFrames.DataFrame`.
+"""
+function readGEMfile(file)
     countcol_name = ["MIDCounts", "MIDCount", "UMICount"]
     countcol_type = Dict(zip(countcol_name, Iterators.repeated(Int16)))
 
@@ -27,42 +32,29 @@ function loadstereoseqfile(file)
         end
     end
 
-    transform!(df, [:x, :y] .=> (x -> x .- (minimum(x) - one(eltype(x)))) .=> [:x, :y])
-
     return df
 end
 
 """
     readstereoseq(file)
 
-Read StereoSeq `file` as vector of SparseMatrixCSC.
+Read StereoSeq `file` as [`GridCounts`](@ref).
 """
 function readstereoseq(file)
-    df = loadstereoseqfile(file)
-
-    rows = maximum(df.x)
-    cols = maximum(df.y)
-
-    n = length(df.geneID.pool)
-    genes = Vector{eltype(df.geneID)}(undef, n)
-    counts = Vector{SparseMatrixCSC{eltype(df.count)}}(undef, n)
-    @threads for (i, (key, subdf)) in collect(enumerate(pairs(groupby(df, :geneID))))
-        genes[i] = key.geneID
-        counts[i] = sparse(subdf.x, subdf.y, subdf.count, rows, cols)
-    end
-
-    return DimArray(counts, (Dim{:gene}(genes)))
+    df = readGEMfile(file)
+    return GridCounts(df)
 end
 
 """
-    readstereoseqbinned(file, s::Integer)
+    readstereoseqbinned(file, binsize::Integer)
 
-Read StereoSeq `file` and aggregate locations with bin size `s`.
+Read StereoSeq `file` and aggregate into bins.
 """
-function readstereoseqbinned(file, s::Integer)
-    df = loadstereoseqfile(file)
+function readstereoseqbinned(file, binsize::Integer)
+    df = readGEMfile(file)
 
-    transform!(df, @. [:x, :y] => (x -> div(x - 1, s) + 1) => [:x, :y])
+    transform!(df, [:x, :y] .=> (x -> x .- minimum(x)) .=> [:x, :y])
+    transform!(df, @. [:x, :y] => (x -> div(x, binsize) + 1) => [:x, :y])
 
     cat_coord, (x, y) = categoricalcoordinates(df.x, df.y)
     select!(df, Not([:x, :y]))