Define a constant for shared pandas.read_csv() options
Start with just an explicit encoding, applied to all invocations of
read_csv().

The immediate motivation for this change is to enforce UTF-8 encoding.
Another way to address that would be to pass Augur's internal
open_file() as the first argument to read_csv(), letting it handle
encoding options as well as the supported compression formats, but
that is not a trivial change.
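For illustration, a minimal sketch of that alternative, not part of this
commit: open_file()'s signature is as defined in augur/io/file.py, while
the wrapper name and surrounding code are hypothetical.

import pandas as pd

from augur.io.file import open_file


def read_csv_via_open_file(path, **kwargs):
    # Hypothetical wrapper: open_file() yields a handle opened with the
    # enforced encoding (and, in principle, transparent handling of the
    # supported compression formats), so read_csv() no longer needs an
    # explicit encoding argument at each call site.
    with open_file(path) as handle:
        return pd.read_csv(handle, **kwargs)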
victorlin committed Feb 12, 2024
1 parent 51a77a9 commit a4601a2
Showing 7 changed files with 19 additions and 9 deletions.
2 changes: 2 additions & 0 deletions augur/clades.py
@@ -19,6 +19,7 @@
 import networkx as nx
 from itertools import islice
 from .errors import AugurError
+from .io.file import PANDAS_READ_CSV_OPTIONS
 from argparse import SUPPRESS
 from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name
 
@@ -64,6 +65,7 @@ def read_in_clade_definitions(clade_file):
         sep='\t' if clade_file.endswith('.tsv') else ',',
         comment='#',
         na_filter=False,
+        **PANDAS_READ_CSV_OPTIONS,
     )
 
     clade_inheritance_rows = df[df['gene'] == 'clade']
3 changes: 2 additions & 1 deletion augur/filter/_run.py
@@ -14,7 +14,7 @@
     ID_COLUMN as SEQUENCE_INDEX_ID_COLUMN,
     DELIMITER as SEQUENCE_INDEX_DELIMITER,
 )
-from augur.io.file import open_file
+from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file
 from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata
 from augur.io.sequences import read_sequences, write_sequences
 from augur.io.print import print_err
@@ -70,6 +70,7 @@ def run(args):
             sep=SEQUENCE_INDEX_DELIMITER,
             index_col=SEQUENCE_INDEX_ID_COLUMN,
             dtype={SEQUENCE_INDEX_ID_COLUMN: "string"},
+            **PANDAS_READ_CSV_OPTIONS,
         )
 
         # Remove temporary index file, if it exists.
4 changes: 4 additions & 0 deletions augur/io/file.py
@@ -8,6 +8,10 @@
 
 ENCODING = "utf-8"
 
+PANDAS_READ_CSV_OPTIONS = {
+    'encoding': ENCODING,
+}
+
 
 @contextmanager
 def open_file(path_or_buffer, mode="r", **kwargs):
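As a usage sketch (the file name below is a placeholder, not from this
commit), every call site unpacks the shared options into its keyword
arguments; because read_csv() rejects duplicate keyword arguments, a
call site that also passed encoding= explicitly would raise a TypeError
rather than silently diverge from the shared setting.

import pandas as pd

from augur.io.file import PANDAS_READ_CSV_OPTIONS

# "example.tsv" is a placeholder path; the pattern is the same at every
# call site touched by this commit: unpack the shared options last.
df = pd.read_csv("example.tsv", sep="\t", **PANDAS_READ_CSV_OPTIONS)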
6 changes: 4 additions & 2 deletions augur/io/metadata.py
@@ -10,7 +10,7 @@
 from augur.errors import AugurError
 from augur.io.print import print_err
 from augur.types import DataErrorMethod
-from .file import open_file
+from .file import PANDAS_READ_CSV_OPTIONS, open_file
 
 
 DEFAULT_DELIMITERS = (',', '\t')
@@ -95,6 +95,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
         metadata_file,
         iterator=True,
         **kwargs,
+        **PANDAS_READ_CSV_OPTIONS,
     )
     chunk = metadata.read(nrows=1)
     metadata.close()
@@ -153,7 +154,8 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
 
     return pd.read_csv(
         metadata_file,
-        **kwargs
+        **kwargs,
+        **PANDAS_READ_CSV_OPTIONS,
     )
 
 
3 changes: 2 additions & 1 deletion augur/measurements/export.py
@@ -6,6 +6,7 @@
 import sys
 
 from augur.argparse_ import HideAsFalseAction
+from augur.io.file import PANDAS_READ_CSV_OPTIONS
 from augur.utils import first_line, write_json
 from augur.validate import (
     measurements as read_measurements_json,
@@ -106,7 +107,7 @@
 
     # Load input collection TSV file
     try:
-        collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include)
+        collection_df = pd.read_csv(args.collection, sep="\t", usecols=columns_to_include, **PANDAS_READ_CSV_OPTIONS)
     except FileNotFoundError:
         print(
             f"ERROR: collection TSV file {args.collection!r} does not exist",
4 changes: 2 additions & 2 deletions augur/sequence_traits.py
@@ -6,7 +6,7 @@
 import numpy as np
 from treetime.vcf_utils import read_vcf
 from collections import defaultdict
-from .io.file import open_file
+from .io.file import PANDAS_READ_CSV_OPTIONS, open_file
 from .utils import write_json, get_json_name
 
 def read_in_translate_vcf(vcf_file, ref_file):
@@ -166,7 +166,7 @@ def read_in_features(drm_file):
 
     mutPositions = defaultdict(list)
 
-    df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',')
+    df = pd.read_csv(drm_file, sep='\t' if drm_file.endswith('.tsv') else ',', **PANDAS_READ_CSV_OPTIONS)
     for mi, m in df.iterrows():
         pos = m.SITE-1 #put in python numbering
         gene = m.GENE if hasattr(m, 'GENE') else 'nuc'
6 changes: 3 additions & 3 deletions augur/utils.py
@@ -10,7 +10,7 @@
 from .__version__ import __version__
 
 from augur.data import as_file
-from augur.io.file import open_file
+from augur.io.file import PANDAS_READ_CSV_OPTIONS, open_file
 from augur.io.print import print_err
 
 from augur.types import ValidationMode
@@ -699,11 +699,11 @@ def read_bed_file(bed_file):
     mask_sites = []
     try:
         bed = pd.read_csv(bed_file, sep='\t', header=None, usecols=[1,2],
-                          dtype={1:int,2:int})
+                          dtype={1:int,2:int}, **PANDAS_READ_CSV_OPTIONS)
     except ValueError:
         # Check if we have a header row. Otherwise, just fail.
         bed = pd.read_csv(bed_file, sep='\t', header=None, usecols=[1,2],
-                          dtype={1:int,2:int}, skiprows=1)
+                          dtype={1:int,2:int}, skiprows=1, **PANDAS_READ_CSV_OPTIONS)
         print("Skipped row 1 of %s, assuming it is a header." % bed_file)
     for _, row in bed.iterrows():
         mask_sites.extend(range(row[1], row[2]))
