Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved position plots #29

Merged
merged 4 commits into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 48 additions & 13 deletions micov/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def coverage_curve(metadata_full, coverage_full, positions, target, variable, ou
ax.set_ylim(0, 100)
ax.set_title((f'{tag}: {target_name}({target}) '
f'({length}bp)'), fontsize=16)
ax.legend(labels, fontsize=14)
ax.legend(labels, fontsize=14, loc='center left')

plt.tight_layout()

Expand Down Expand Up @@ -410,18 +410,44 @@ def position_plot(metadata, coverage, positions, target, variable, output,

target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy()

value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
samples_with_positions = (target_positions.select(pl.col(COLUMN_SAMPLE_ID))
.unique()
.collect())[COLUMN_SAMPLE_ID]
metadata = metadata.filter(pl.col(COLUMN_SAMPLE_ID)
.is_in(samples_with_positions))

# TODO: expose to allow ordering by a variable rather than coverage
custom_yorder = None

group_order = metadata.group_by(variable).len().sort(by='len')
max_x = group_order['len'].sum()

color_order = (metadata.select(pl.col(variable)
.unique()
.sort())
.with_row_index(name='color')
.select([pl.col(variable), pl.col('color')]))
order = group_order.join(color_order, on=variable).sort(by='len')


x_offset = 0
boundaries = []
tsv_x = []
tsv_y = []
tsv_group = []

for name, color in zip(value_order, range(0, 10)):
for name, count, color in order[variable, 'len', 'color'].iter_rows():
grp = metadata.filter(pl.col(variable) == name)
color = f'C{color}'
grp_coverage = ordered_coverage(coverage, grp, target, length)

if custom_yorder is not None:
grp_coverage = (grp.filter(pl.col(custom_yorder).is_not_null())
.sort(by=custom_yorder)
.with_row_index(name='x_unscaled', offset=x_offset)
.with_columns(pl.lit(target).alias(COLUMN_GENOME_ID)))
else:
grp_coverage = ordered_coverage(coverage, grp, target, length)
grp_coverage = grp_coverage.with_columns(pl.col('x_unscaled') + x_offset)

if len(grp_coverage) == 0:
continue
Expand All @@ -432,12 +458,12 @@ def position_plot(metadata, coverage, positions, target, variable, output,
hist_x = []
hist_y = []

col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, 'x']
col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, 'x_unscaled']
for sid, gid, x in grp_coverage[col_selection].rows():
cur_positions = (target_positions
.filter(pl.col(COLUMN_SAMPLE_ID) == sid)
.join(grp_coverage.lazy(), on=COLUMN_SAMPLE_ID)
.select(pl.col('x'),
.select(pl.col('x_unscaled'),
pl.col(COLUMN_START) / length,
pl.col(COLUMN_STOP) / length))

Expand Down Expand Up @@ -465,9 +491,15 @@ def position_plot(metadata, coverage, positions, target, variable, output,
tsv_y += hist_y
tsv_group += [name] * len(hist_x)

ax.set_xlim(-0.01, 1.0)
x_offset += count
boundaries.append(x_offset)

ax.set_xlim(0, max_x)
ax.set_ylim(0, 1.0)

for x in boundaries[:-1]:
ax.plot([x, x], [0, 1], color='k', ls='--', alpha=0.6)

if scale is None:
ax.set_title(f'Position plot: {target} ({length}bp)', fontsize=20)
ax.set_ylabel('Unit normalized genome position', fontsize=20)
Expand All @@ -487,16 +519,19 @@ def position_plot(metadata, coverage, positions, target, variable, output,
ax.set_ylabel(f'Coverage (1/{scale})th scale', fontsize=20)
scaletag = f"-1_{scale}th-scale"

ax.set_xlabel('Proportion of samples', fontsize=20)
ax.set_xlabel('Within group sample rank by coverage', fontsize=16)

plt.legend(labels, fontsize=20)
plt.legend(labels, fontsize=20, loc='center left')
leg = ax.get_legend()
for i, lh in enumerate(leg.legend_handles):
lh.set_color(colors[i])
lh._sizes = [5.0, ]

ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.tick_params(axis='y', which='major', labelsize=16)
ax.tick_params(axis='y', which='minor', labelsize=16)
ax.set_xticks([])
ax.grid(axis='y', ls='--', alpha=1, color='k')

plt.tight_layout()
plt.savefig(f'{output}.{target_name}.{target}.{variable}.position-plot{scaletag}.png')
plt.close()
74 changes: 74 additions & 0 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,80 @@ def qiita_to_parquet(qiita_coverages, lengths, output, samples_to_keep,
compression_level=3) # default afaik


@cli.command()
@click.option('--pattern', type=str,
              required=True, help='Glob pattern for BED3-like files. Must end '
                                  'in .cov or .cov.gz')
@click.option('--output', type=click.Path(exists=False), required=True,
              help='Prefix for the output parquet files')
@click.option('--lengths', type=click.Path(exists=True), required=True,
              help="Genome lengths")
@click.option('--memory', type=str, default='16gb', required=False,
              help='DuckDB memory limit')
@click.option('--threads', type=int, default=4, required=False,
              help='DuckDB thread count')
def nonqiita_to_parquet(pattern, lengths, output, memory, threads):
    """Aggregate BED3 files to parquet.

    Streams every file matched by ``pattern`` into
    ``<output>.covered_positions.parquet`` (one row per covered interval,
    tagged with the sample_id derived from the filename), then computes a
    per-sample, per-genome coverage summary into
    ``<output>.coverage.parquet``.
    """
    lengths = parse_genome_lengths(lengths)

    columns = "{'genome_id': 'VARCHAR', 'start': 'UINTEGER', 'stop': 'UINTEGER'}"
    duckdb.sql(f"SET memory_limit TO '{memory}'")
    duckdb.sql(f"SET threads TO {threads}")
    # expose the lengths frame to SQL below via DuckDB's replacement scan
    duckdb.sql("CREATE TABLE genome_lengths AS FROM lengths")

    # Stream the .cov or .cov.gz files into parquet. Extract the name of the
    # file, without the extension, and store as the sample_id. The extension
    # dots are escaped (\.cov, \.gz) so an unescaped "." wildcard cannot
    # mistake e.g. "fooXcov.gz" for a ".cov" file or mangle the sample_id.
    duckdb.sql(f"""
        COPY (SELECT genome_id,
                     start,
                     stop,
                     regexp_extract(filename,
                                    '^(.*/)?(.+)\\.cov(\\.gz)?$', 2) AS sample_id
              FROM read_csv('{pattern}',
                            delim='\t',
                            filename=true,
                            header=true,
                            columns={columns}))
        TO '{output}.covered_positions.parquet'
            (FORMAT PARQUET, PARQUET_VERSION V2,
             COMPRESSION zstd)"""
    )

    # Scan the aggregated position information, compute the amount covered
    # and the percent coverage per sample per genome, stream to parquet.
    # NOTE(review): SUM(stop - start) assumes the input intervals for a
    # sample/genome do not overlap — TODO confirm upstream guarantees this.
    duckdb.sql(f"""
        COPY (WITH covered_amount AS (
                  SELECT sample_id,
                         genome_id,
                         SUM(stop - start)::UINTEGER AS covered
                  FROM read_parquet('{output}.covered_positions.parquet')
                  GROUP BY sample_id, genome_id)
              SELECT sample_id,
                     genome_id,
                     covered,
                     length,
                     (covered / length) * 100 AS percent_covered
              FROM covered_amount JOIN genome_lengths using (genome_id))
        TO '{output}.coverage.parquet'
            (FORMAT PARQUET, PARQUET_VERSION V2,
             COMPRESSION zstd)"""
    )

    # n.b. a comparable action can be taken with polars. however, polars does
    # not currently allow limiting memory, and in testing, the use exceeded
    # 16gb.
    #(pl.scan_csv(pattern,
    #             separator='\t',
    #             has_header=True,
    #             schema=pl.Schema({'genome_id': str,
    #                               'start': pl.UInt32,
    #                               'stop': pl.UInt32}),
    #             include_file_paths='filename')
    #   .with_columns(pl.col('filename')
    #                   .str.extract(r"(.+).cov.gz$")
    #                   .alias('sample_id'))
    #   .drop('filename')
    #   .sink_parquet(f"{output}.covered_positions_pl.parquet",
    #                 compression='zstd'))


@cli.command()
@click.option('--parquet-coverage', type=click.Path(exists=False),
required=True, help=('Pre-computed coverage data as parquet. '
Expand Down
Loading