Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ComBGC: Add functionality to screen whole sample directory (antismash) #258

Merged
merged 7 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#251](https://github.com/nf-core/funcscan/pull/251) Added annotation tool: Pyrodigal. (by @jasmezz)
- [#252](https://github.com/nf-core/funcscan/pull/252) Added a new parameter `-arg_rgi_savejson` that saves the file `<samplename>.json` in the RGI directory. The default output for RGI is now only `<samplename>.txt`. (by @darcy220606)
- [#253](https://github.com/nf-core/funcscan/pull/253) Updated Prodigal to have compressed output files. (by @jasmezz)
- [#258](https://github.com/nf-core/funcscan/pull/258) Added comBGC function to screen whole directory of antiSMASH output (one subfolder per sample). (by @jasmezz)

### `Fixed`

Expand Down
64 changes: 56 additions & 8 deletions bin/comBGC.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
SOFTWARE.
"""

tool_version = "0.5"
tool_version = "0.6.0"
welcome = """\
........................
* comBGC v.{version} *
Expand Down Expand Up @@ -61,7 +61,9 @@
these can be:
- antiSMASH: <sample name>.gbk and (optional) knownclusterblast/ directory
- DeepBGC: <sample name>.bgc.tsv
- GECCO: <sample name>.clusters.tsv""",
- GECCO: <sample name>.clusters.tsv
Note: Please provide files from a single sample only. If you would like to
summarize multiple samples, please see the --antismash_dir flag.""",
)
parser.add_argument(
"-o",
Expand All @@ -73,6 +75,16 @@
type=str,
default=".",
)
parser.add_argument(
"-a",
"--antismash_dir",
jasmezz marked this conversation as resolved.
Show resolved Hide resolved
metavar="PATH",
dest="antismash_dir",
nargs="?",
help="""directory of antiSMASH output. Should contain subfolders (one per
sample). Can only be used if --input is not specified.""",
louperelo marked this conversation as resolved.
Show resolved Hide resolved
type=str,
)
parser.add_argument("-vv", "--verbose", help="increase output verbosity", action="store_true")
parser.add_argument("-v", "--version", help="show version number and exit", action="store_true")

Expand All @@ -81,6 +93,7 @@

# Assign input arguments to variables
input = args.input
dir_antismash = args.antismash_dir
outdir = args.outdir
verbose = args.verbose
version = args.version
Expand Down Expand Up @@ -111,15 +124,38 @@
elif path.endswith("knownclusterblast/"):
input_antismash.append(path)

if input and dir_antismash:
exit(
"The flags --input and --antismash_dir are mutually exclusive.\nPlease use only one of them (or see --help for how to use)."
)

# Make sure that at least one input argument is given
if not (input_antismash or input_gecco or input_deepbgc):
if not (input_antismash or input_gecco or input_deepbgc or dir_antismash):
exit("Please specify at least one input file (i.e. output from antismash, deepbgc, or gecco) or see --help")

louperelo marked this conversation as resolved.
Show resolved Hide resolved
########################
# ANTISMASH FUNCTIONS
########################


def prepare_multisample_input_antismash(antismash_dir):
    """
    Collect the antiSMASH input paths for every sample found below a directory.

    A directory counts as one sample's antiSMASH output when it contains an
    "index.html" file. The sample name is taken from that directory's name and
    the GenBank file is expected at "<directory>/<sample name>.gbk" (the file
    itself is not checked for existence here).

    Parameters:
        antismash_dir: path of the directory to search recursively
            (typically containing one subfolder per sample).

    Returns:
        A list with one entry per sample. Each entry is a list holding the
        GBK path and, if a "knownclusterblast" entry exists next to it, that
        path as second element. Empty list if no sample was found.
    """
    sample_paths = []
    for root, subdirs, _files in os.walk(antismash_dir):
        # os.walk honours in-place changes to subdirs (top-down mode); sorting
        # makes the order of samples reproducible across file systems.
        subdirs.sort()

        # Paths are joined with "/" on purpose to keep the same separator
        # as the original implementation. Strip a trailing slash first
        # so that "dir/" does not yield "dir//index.html".
        base = root.rstrip("/")
        if not os.path.exists("/".join([base, "index.html"])):
            continue

        # Resolve the directory name via abspath so that roots such as "dir/"
        # or "." still give the real folder name instead of "" or ".".
        sample = os.path.basename(os.path.abspath(root))
        gbk_path = "/".join([base, sample]) + ".gbk"
        kcb_path = "/".join([base, "knownclusterblast"])

        entry = [gbk_path]
        if os.path.exists(kcb_path):
            entry.append(kcb_path)
        sample_paths.append(entry)
    return sample_paths


def parse_knownclusterblast(kcb_file_path):
"""
Extract MIBiG IDs from knownclusterblast TXT file.
Expand Down Expand Up @@ -148,9 +184,6 @@ def antismash_workflow(antismash_paths):
- Return data frame with aggregated info.
"""

if verbose:
print("\nParsing antiSMASH files\n... ", end="")

antismash_sum_cols = [
"Sample_ID",
"Prediction_tool",
Expand Down Expand Up @@ -186,6 +219,9 @@ def antismash_workflow(antismash_paths):

# Aggregate information
Sample_ID = gbk_path.split("/")[-1].split(".gbk")[-2] # Assuming file name equals sample name
if verbose:
print("\nParsing antiSMASH file(s): " + Sample_ID + "\n... ", end="")

with open(gbk_path) as gbk:
for record in SeqIO.parse(gbk, "genbank"): # GBK records are contigs in this case
# Initiate variables per contig
Expand Down Expand Up @@ -514,7 +550,13 @@ def gecco_workflow(gecco_paths):
########################

if __name__ == "__main__":
tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco}
if input_antismash:
tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco}
elif dir_antismash:
tools = {"antiSMASH": dir_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco}
jasmezz marked this conversation as resolved.
Show resolved Hide resolved
else:
tools = {"antiSMASH": [], "deepBGC": input_deepbgc, "GECCO": input_gecco}

tools_provided = {}

for tool in tools.keys():
Expand All @@ -532,7 +574,13 @@ def gecco_workflow(gecco_paths):

for tool in tools_provided.keys():
if tool == "antiSMASH":
summary_antismash = antismash_workflow(input_antismash)
if dir_antismash:
antismash_paths = prepare_multisample_input_antismash(dir_antismash)
for input_antismash in antismash_paths:
summary_antismash_temp = antismash_workflow(input_antismash)
summary_antismash = pd.concat([summary_antismash, summary_antismash_temp])
else:
summary_antismash = antismash_workflow(input_antismash)
elif tool == "deepBGC":
summary_deepbgc = deepbgc_workflow(input_deepbgc)
elif tool == "GECCO":
Expand Down