Skip to content

Commit

Permalink
export v2: Automatically minify output
Browse files Browse the repository at this point in the history
Automatically format the output JSON file based on file size, unless
either --minify-json or --no-minify-json (new option) is specified.

Functional tests added to cover all code paths. Code to generate a
Newick file was generated by ChatGPT.¹

¹ <https://chat.openai.com/share/5f54c5fe-ee27-4691-bbb4-f71b63970770>
  • Loading branch information
victorlin committed Dec 22, 2023
1 parent e742291 commit ba65ef2
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 4 deletions.
36 changes: 32 additions & 4 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Export version 2 JSON schema for visualization with Auspice
"""
import os
from pathlib import Path
import sys
import time
Expand All @@ -13,9 +14,13 @@
from .errors import AugurError
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .types import ValidationMode
from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
from .utils import read_node_data, write_json, json_size, read_config, read_lat_longs, read_colors
from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError


MINIFY_THRESHOLD_MB = 5


# Set up warnings & exceptions
warn = warnings.warn
deprecationWarningsEmitted = False
Expand Down Expand Up @@ -875,10 +880,20 @@ def register_parser(parent_subparsers):
optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`")
optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)")

minify_group = parser.add_argument_group(
title="OPTIONAL MINIFY SETTINGS",
description=f"""
By default, output JSON files (both main and sidecar) are automatically minimized if
the size of the un-minified main JSON file exceeds {MINIFY_THRESHOLD_MB} MB. Use
these options to override that behavior.
"""
).add_mutually_exclusive_group()
minify_group.add_argument('--minify-json', action="store_true", help="always export JSONs without indentation or line returns.")
minify_group.add_argument('--no-minify-json', action="store_true", help="always export JSONs to be human readable.")

optional_settings = parser.add_argument_group(
title="OPTIONAL SETTINGS"
title="OTHER OPTIONAL SETTINGS"
)
optional_settings.add_argument('--minify-json', action="store_true", help="export JSONs without indentation or line returns")
root_sequence = optional_settings.add_mutually_exclusive_group()
root_sequence.add_argument('--include-root-sequence', action="store_true", help="Export an additional JSON containing the root sequence (reference sequence for vcf) used to identify mutations. The filename will follow the pattern of <OUTPUT>_root-sequence.json for a main auspice JSON of <OUTPUT>.json")
root_sequence.add_argument('--include-root-sequence-inline', action="store_true", help="Export the root sequence (reference sequence for vcf) used to identify mutations as part of the main dataset JSON. This should only be used for small genomes for file size reasons.")
Expand Down Expand Up @@ -1165,8 +1180,21 @@ def run(args):
if config.get("extensions"):
data_json["meta"]["extensions"] = config["extensions"]

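# Should output be minified?
# An explicit user choice (--minify-json, the AUGUR_MINIFY_JSON environment
# variable, or --no-minify-json) takes precedence; otherwise the decision is
# based on the size of the un-minified dataset JSON.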
if args.minify_json or os.environ.get("AUGUR_MINIFY_JSON"):
minify = True
elif args.no_minify_json:
minify = False
else:
if json_size(data_json) > MINIFY_THRESHOLD_MB * 10**6:
minify = True
else:
minify = False

# Write outputs - the (unified) dataset JSON intended for auspice & perhaps the ref root-sequence JSON
indent = {"indent": None} if args.minify_json else {}
indent = {"indent": None} if minify else {}
if args.include_root_sequence or args.include_root_sequence_inline:
# Note - argparse enforces that only one of these args will be true
if 'reference' in node_data:
Expand Down
5 changes: 5 additions & 0 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def write_json(data, file_name, indent=(None if os.environ.get("AUGUR_MINIFY_JSO
json.dump(data, handle, indent=indent, sort_keys=sort_keys, cls=AugurJSONEncoder)


def json_size(data, indent=2):
    """Return the size of ``data`` once serialized as a JSON string.

    The object is serialized in memory with :class:`AugurJSONEncoder` using
    the given ``indent`` (2 by default), and the length of the resulting
    string is returned.

    ``json.dumps`` escapes non-ASCII characters by default, so the character
    count reported here is also the number of bytes of the encoded text.
    """
    serialized = json.dumps(data, indent=indent, cls=AugurJSONEncoder)
    return len(serialized)


class AugurJSONEncoder(json.JSONEncoder):
"""
A custom JSONEncoder subclass to serialize data types used for various data
Expand Down
90 changes: 90 additions & 0 deletions tests/functional/export_v2/cram/minify-output.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
Setup

$ source "$TESTDIR"/_setup.sh

Define a function to generate a Newick tree of a given size. Use lengthy node
names to create larger file sizes with less recursion.

$ generate_newick() {
> # Print a nested Newick tree with $1 tips (without the trailing ';').
> local n=$1
> # Build a 1000-character run of 'N': '%.0s' consumes each expanded
> # argument while printing none of it, so the format emits one 'N' per argument.
> local prefix=$(printf 'N%.0s' {1..1000})
> if [ $n -eq 1 ]; then
> echo "$prefix$n"
> else
> # Recurse for the first n-1 tips and attach tip n as the sibling.
> echo "($(generate_newick $((n-1))),$prefix$n)"
> fi
> }

A small tree is not automatically minified.
The unminified output is 16K which is considered acceptable.

$ echo "$(generate_newick 10);" > small_tree.nwk

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null

$ head -c 20 output.json
{
"version": "v2", (no-eol)

$ du -h output.json
16K.* (re)

It can be forcefully minified with an argument.

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --minify-json \
> --output output.json &>/dev/null

$ head -c 20 output.json
{"version": "v2", "m (no-eol)
It can also be forcefully minified by setting AUGUR_MINIFY_JSON to any value,
even if it may seem "falsey".
$ AUGUR_MINIFY_JSON=0 ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null
$ head -c 20 output.json
{"version": "v2", "m (no-eol)
$ du -h output.json
12K.* (re)
A large tree, when forcefully not minified, has an output size of 7.0M which is
considered large.
$ echo "$(generate_newick 500);" > big_tree.nwk
$ ${AUGUR} export v2 \
> --tree big_tree.nwk \
> --skip-validation \
> --no-minify-json \
> --output output.json &>/dev/null
$ head -c 20 output.json
{
"version": "v2", (no-eol)
$ du -h output.json
7.0M.* (re)
This means it is automatically minified.
$ ${AUGUR} export v2 \
> --tree big_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null
$ head -c 20 output.json
{"version": "v2", "m (no-eol)
$ du -h output.json
584K.* (re)

0 comments on commit ba65ef2

Please sign in to comment.