Skip to content

Commit

Permalink
Merge pull request #31 from biocore/improved_position_plots
Browse files Browse the repository at this point in the history
Allow region slicing
  • Loading branch information
wasade authored Feb 13, 2025
2 parents fc7dc16 + 03455a4 commit ba9cd5b
Show file tree
Hide file tree
Showing 16 changed files with 2,200 additions and 953 deletions.
1 change: 0 additions & 1 deletion ci/conda_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
tqdm
scipy
pyarrow<16.0.0
polars-u64-idx>=1.21.0
Expand Down
10 changes: 9 additions & 1 deletion micov/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
"""micov: microbiome coverage."""

from . import _version
__version__ = _version.get_versions()['version']

__version__ = _version.get_versions()["version"]
# note: currently for use with duckdb. we cannot easily enforce threads for polars
# as a specific environment variable must be set prior to the first import. it's
# doable but will need some engineering to do it correctly. And, polars does not
# currently have a way to limit memory use.
THREADS = 1
MEMORY = 8 # gb
119 changes: 76 additions & 43 deletions micov/_constants.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,57 @@
import polars as pl

COLUMN_GENOME_ID = 'genome_id'
COLUMN_GENOME_ID = "genome_id"
COLUMN_GENOME_ID_DTYPE = str
COLUMN_SAMPLE_ID = 'sample_id'
COLUMN_SAMPLE_ID = "sample_id"
COLUMN_SAMPLE_ID_DTYPE = str
COLUMN_START = 'start'
COLUMN_START = "start"
COLUMN_START_DTYPE = pl.UInt32
COLUMN_STOP = 'stop'
COLUMN_STOP = "stop"
COLUMN_STOP_DTYPE = pl.UInt32
COLUMN_READ_ID = 'read'
COLUMN_READ_ID = "read"
COLUMN_READ_ID_DTYPE = str
COLUMN_FLAG = 'flag'
COLUMN_FLAG = "flag"
COLUMN_FLAG_DTYPE = int
COLUMN_CIGAR = 'cigar'
COLUMN_CIGAR = "cigar"
COLUMN_CIGAR_DTYPE = str
COLUMN_LENGTH = 'length'
COLUMN_LENGTH_DTYPE = int
COLUMN_TAXONOMY = 'taxonomy'
COLUMN_LENGTH = "length"
COLUMN_LENGTH_DTYPE = pl.UInt32
COLUMN_TAXONOMY = "taxonomy"
COLUMN_TAXONOMY_DTYPE = str
COLUMN_COVERED = 'covered'
COLUMN_COVERED = "covered"
COLUMN_COVERED_DTYPE = pl.UInt32
COLUMN_PERCENT_COVERED = 'percent_covered'
COLUMN_PERCENT_COVERED = "percent_covered"
COLUMN_PERCENT_COVERED_DTYPE = float

### should really probably just use a dataclass, and type annotations?


class _SCHEMA:
def __init__(self):
self.dtypes_dict = dict(self.dtypes_flat)
self.columns = tuple([c for c, _ in self.dtypes_flat])


class _BED_COV_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
]


BED_COV_SCHEMA = _BED_COV_SCHEMA()


class _BED_COV_WITH_SAMPLEID_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE),
]


BED_COV_SAMPLEID_SCHEMA = _BED_COV_WITH_SAMPLEID_SCHEMA()


Expand All @@ -52,48 +61,72 @@ class _SAM_SUBSET_SCHEMA(_SCHEMA):
# for binary coverage, we don't care about the flag, but we're parsing it
# now so we can care in the future.
column_indices = [0, 1, 2, 3, 5]
dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE)]
dtypes_flat = [
(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
]


SAM_SUBSET_SCHEMA = _SAM_SUBSET_SCHEMA()


class _SAM_SUBSET_SCHEMA_PARSED(_SCHEMA):
dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE)]
dtypes_flat = [
(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
]


SAM_SUBSET_SCHEMA_PARSED = _SAM_SUBSET_SCHEMA_PARSED()


class _GENOME_LENGTH_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
]


GENOME_LENGTH_SCHEMA = _GENOME_LENGTH_SCHEMA()


class _GENOME_TAXONOMY_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE),
]


GENOME_TAXNOMY_SCHEMA = _GENOME_TAXONOMY_SCHEMA()


class _GENOME_COVERAGE_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
]


GENOME_COVERAGE_SCHEMA = _GENOME_COVERAGE_SCHEMA()


class _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE),
]


GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA = _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA()
Loading

0 comments on commit ba9cd5b

Please sign in to comment.