Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved position plots #29

Merged
merged 4 commits into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 48 additions & 13 deletions micov/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def coverage_curve(metadata_full, coverage_full, positions, target, variable, ou
ax.set_ylim(0, 100)
ax.set_title((f'{tag}: {target_name}({target}) '
f'({length}bp)'), fontsize=16)
ax.legend(labels, fontsize=14)
ax.legend(labels, fontsize=14, loc='center left')

plt.tight_layout()

Expand Down Expand Up @@ -410,18 +410,44 @@ def position_plot(metadata, coverage, positions, target, variable, output,

target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy()

value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
samples_with_positions = (target_positions.select(pl.col(COLUMN_SAMPLE_ID))
.unique()
.collect())[COLUMN_SAMPLE_ID]
metadata = metadata.filter(pl.col(COLUMN_SAMPLE_ID)
.is_in(samples_with_positions))

# TODO: expose to allow ordering by a variable rather than coverage
custom_yorder = None

group_order = metadata.group_by(variable).len().sort(by='len')
max_x = group_order['len'].sum()

color_order = (metadata.select(pl.col(variable)
.unique()
.sort())
.with_row_index(name='color')
.select([pl.col(variable), pl.col('color')]))
order = group_order.join(color_order, on=variable).sort(by='len')


x_offset = 0
boundaries = []
tsv_x = []
tsv_y = []
tsv_group = []

for name, color in zip(value_order, range(0, 10)):
for name, count, color in order[variable, 'len', 'color'].iter_rows():
grp = metadata.filter(pl.col(variable) == name)
color = f'C{color}'
grp_coverage = ordered_coverage(coverage, grp, target, length)

if custom_yorder is not None:
grp_coverage = (grp.filter(pl.col(custom_yorder).is_not_null())
.sort(by=custom_yorder)
.with_row_index(name='x_unscaled', offset=x_offset)
.with_columns(pl.lit(target).alias(COLUMN_GENOME_ID)))
else:
grp_coverage = ordered_coverage(coverage, grp, target, length)
grp_coverage = grp_coverage.with_columns(pl.col('x_unscaled') + x_offset)

if len(grp_coverage) == 0:
continue
Expand All @@ -432,12 +458,12 @@ def position_plot(metadata, coverage, positions, target, variable, output,
hist_x = []
hist_y = []

col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, 'x']
col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, 'x_unscaled']
for sid, gid, x in grp_coverage[col_selection].rows():
cur_positions = (target_positions
.filter(pl.col(COLUMN_SAMPLE_ID) == sid)
.join(grp_coverage.lazy(), on=COLUMN_SAMPLE_ID)
.select(pl.col('x'),
.select(pl.col('x_unscaled'),
pl.col(COLUMN_START) / length,
pl.col(COLUMN_STOP) / length))

Expand Down Expand Up @@ -465,9 +491,15 @@ def position_plot(metadata, coverage, positions, target, variable, output,
tsv_y += hist_y
tsv_group += [name] * len(hist_x)

ax.set_xlim(-0.01, 1.0)
x_offset += count
boundaries.append(x_offset)

ax.set_xlim(0, max_x)
ax.set_ylim(0, 1.0)

for x in boundaries[:-1]:
ax.plot([x, x], [0, 1], color='k', ls='--', alpha=0.6)

if scale is None:
ax.set_title(f'Position plot: {target} ({length}bp)', fontsize=20)
ax.set_ylabel('Unit normalized genome position', fontsize=20)
Expand All @@ -487,16 +519,19 @@ def position_plot(metadata, coverage, positions, target, variable, output,
ax.set_ylabel(f'Coverage (1/{scale})th scale', fontsize=20)
scaletag = f"-1_{scale}th-scale"

ax.set_xlabel('Proportion of samples', fontsize=20)
ax.set_xlabel('Within group sample rank by coverage', fontsize=16)

plt.legend(labels, fontsize=20)
plt.legend(labels, fontsize=20, loc='center left')
leg = ax.get_legend()
for i, lh in enumerate(leg.legend_handles):
lh.set_color(colors[i])
lh._sizes = [5.0, ]

ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.tick_params(axis='y', which='major', labelsize=16)
ax.tick_params(axis='y', which='minor', labelsize=16)
ax.set_xticks([])
ax.grid(axis='y', ls='--', alpha=1, color='k')

plt.tight_layout()
plt.savefig(f'{output}.{target_name}.{target}.{variable}.position-plot{scaletag}.png')
plt.close()
74 changes: 74 additions & 0 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,80 @@ def qiita_to_parquet(qiita_coverages, lengths, output, samples_to_keep,
compression_level=3) # default afaik


@cli.command()
@click.option('--pattern', type=str,
              required=True, help='Glob pattern for BED3-like files. Must end '
                                  'in .cov or .cov.gz')
@click.option('--output', type=click.Path(exists=False), required=True,
              help='Prefix for the output parquet files')
@click.option('--lengths', type=click.Path(exists=True), required=True,
              help="Genome lengths")
@click.option('--memory', type=str, default='16gb', required=False,
              help='DuckDB memory limit')
@click.option('--threads', type=int, default=4, required=False,
              help='DuckDB thread count')
def nonqiita_to_parquet(pattern, lengths, output, memory, threads):
    """Aggregate BED3 files to parquet.

    Streams every file matched by ``pattern`` into
    ``<output>.covered_positions.parquet`` (one row per covered interval,
    tagged with the sample_id derived from the filename), then computes a
    per-sample, per-genome coverage summary into
    ``<output>.coverage.parquet``.
    """
    lengths = parse_genome_lengths(lengths)

    columns = "{'genome_id': 'VARCHAR', 'start': 'UINTEGER', 'stop': 'UINTEGER'}"
    duckdb.sql(f"SET memory_limit TO '{memory}'")
    duckdb.sql(f"SET threads TO {threads}")
    # expose the lengths frame to SQL below via DuckDB's replacement scan
    duckdb.sql("CREATE TABLE genome_lengths AS FROM lengths")

    # Stream the .cov or .cov.gz files into parquet. Extract the name of the
    # file, without the extension, and store as the sample_id. The extension
    # dots are escaped (\.cov, \.gz) so an unescaped "." wildcard cannot
    # mistake e.g. "fooXcov.gz" for a ".cov" file or mangle the sample_id.
    duckdb.sql(f"""
        COPY (SELECT genome_id,
                     start,
                     stop,
                     regexp_extract(filename,
                                    '^(.*/)?(.+)\\.cov(\\.gz)?$', 2) AS sample_id
              FROM read_csv('{pattern}',
                            delim='\t',
                            filename=true,
                            header=true,
                            columns={columns}))
        TO '{output}.covered_positions.parquet'
            (FORMAT PARQUET, PARQUET_VERSION V2,
             COMPRESSION zstd)"""
    )

    # Scan the aggregated position information, compute the amount covered
    # and the percent coverage per sample per genome, stream to parquet.
    # NOTE(review): SUM(stop - start) assumes the input intervals for a
    # sample/genome do not overlap — TODO confirm upstream guarantees this.
    duckdb.sql(f"""
        COPY (WITH covered_amount AS (
                  SELECT sample_id,
                         genome_id,
                         SUM(stop - start)::UINTEGER AS covered
                  FROM read_parquet('{output}.covered_positions.parquet')
                  GROUP BY sample_id, genome_id)
              SELECT sample_id,
                     genome_id,
                     covered,
                     length,
                     (covered / length) * 100 AS percent_covered
              FROM covered_amount JOIN genome_lengths using (genome_id))
        TO '{output}.coverage.parquet'
            (FORMAT PARQUET, PARQUET_VERSION V2,
             COMPRESSION zstd)"""
    )

    # n.b. a comparable action can be taken with polars. however, polars does
    # not currently allow limiting memory, and in testing, the use exceeded
    # 16gb.
    #(pl.scan_csv(pattern,
    #             separator='\t',
    #             has_header=True,
    #             schema=pl.Schema({'genome_id': str,
    #                               'start': pl.UInt32,
    #                               'stop': pl.UInt32}),
    #             include_file_paths='filename')
    #   .with_columns(pl.col('filename')
    #                   .str.extract(r"(.+).cov.gz$")
    #                   .alias('sample_id'))
    #   .drop('filename')
    #   .sink_parquet(f"{output}.covered_positions_pl.parquet",
    #                 compression='zstd'))


@cli.command()
@click.option('--parquet-coverage', type=click.Path(exists=False),
required=True, help=('Pre-computed coverage data as parquet. '
Expand Down
Loading