From 41a509398fc4a9580e084dd0f61d406891c13f05 Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Tue, 30 Jul 2024 22:25:35 -0400 Subject: [PATCH] Fixed batch mode error where one file failing the FASTA validation test prevented the other files in the pool from being analyzed. --- ectyper/commandLineOptions.py | 5 ++--- ectyper/ectyper.py | 3 +-- ectyper/genomeFunctions.py | 2 +- ectyper/predictionFunctions.py | 4 ++-- ectyper/speciesIdentification.py | 14 +++++++------- ectyper/subprocess_util.py | 2 -- 6 files changed, 13 insertions(+), 17 deletions(-) diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py index d12012d..97e5cee 100644 --- a/ectyper/commandLineOptions.py +++ b/ectyper/commandLineOptions.py @@ -62,10 +62,9 @@ def checkdbversion(): ) parser.add_argument( - "-d", - "--maxdepth", + "--maxdirdepth", help="Maximum number of directories to descend when searching an input directory of files", - default=1e6, + default=0, type=int, required=False ) diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py index bb82252..b4b3b66 100644 --- a/ectyper/ectyper.py +++ b/ectyper/ectyper.py @@ -121,7 +121,7 @@ def run_program(): os.makedirs(temp_dir, exist_ok=True) LOG.info("Gathering genome files list ...") - input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdepth) + input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdirdepth) raw_genome_files = decompress_gunzip_files(input_files_list, temp_dir) LOG.info(f"Identifying genome file types on {len(raw_genome_files)} inputs ...") @@ -157,7 +157,6 @@ def run_program(): raw_files_dict['filesnotfound'], args) - LOG.info("Standardizing the E.coli genome headers based on file names") diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py index f9c6c7a..122abc3 100644 --- a/ectyper/genomeFunctions.py +++ b/ectyper/genomeFunctions.py @@ -47,7 +47,7 @@ def get_files_as_list(files_or_directories, max_depth_level): LOG.info(f"Directory level exceeded ({dir_level_current} 
> {max_depth_level}), skipping {file_or_directory} ...") continue - # if single directory is specified + # if directory is specified if os.path.isdir(file_or_directory): LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...") # Create a list containing the file names diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py index ea16019..272bc5b 100644 --- a/ectyper/predictionFunctions.py +++ b/ectyper/predictionFunctions.py @@ -868,7 +868,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo :param predictions_data_frame: the Dict containing the ectyper predictions :return: modified prediction file """ - + # genome names are given without the filename extension for g in all_genomes_list: gname = os.path.splitext(os.path.split(g)[1])[0] @@ -887,7 +887,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo } else: predictions_dict[gname] = { - 'error': "No O and H antigen determinant E.coli genes were found. Try running with --verify parameter", + 'error': f"No O and H antigen determinant E.coli genes were found in {gname}", 'species': ecoli_dict[gname]["species"] } diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py index ac19473..b02360c 100644 --- a/ectyper/speciesIdentification.py +++ b/ectyper/speciesIdentification.py @@ -266,21 +266,21 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args) filesnotfound_dict = {} fasta_files = fasta_fastq_files_dict.keys() + for fasta in fasta_files: sampleName = getSampleName(fasta) speciesname = "-" - if is_valid_fasta_file(fasta, sampleName) == False: - failverifyerrormessage = f"Sample {sampleName} FASTA file ({fasta}) is empty. This could happen when FASTA file generated from FASTQ input lacks raw reads mapping to O- and H- antigens database or input FASTA is empty/corrupted. 
Please check sequence input file of {sampleName}" - other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":failverifyerrormessage} - return ecoli_files_dict, other_files_dict, filesnotfound_dict - if sampleName in ecoli_files_dict or sampleName in other_files_dict: error_msg = "Duplicated parsed filenames found ('{}'). Offending file paths {}. Only unique file names are supported in batch mode".format( sampleName, [file for file in fasta_files if sampleName in file] ) LOG.error(error_msg) raise ValueError(error_msg) + + if is_valid_fasta_file(fasta, sampleName) == False: + failverifyerrormessage = f"Sample {sampleName} FASTA file ({fasta}) is invalid/empty. This could happen when FASTA file generated from FASTQ input lacks raw reads mapping to O- and H- antigens database or input FASTA is empty/corrupted. Please check sequence input file of {sampleName}" + #do species always regardless of --verify param. Do prediction on fastq files if available for better accuracy if fasta_fastq_files_dict[fasta]: @@ -288,7 +288,7 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args) speciesname = get_species(fastq_file, args, args.cores) else: speciesname = get_species(fasta, args, args.cores) - + if args.verify: failverifyerrormessage = "Sample identified as " + speciesname + ": serotyping results are only available for E.coli samples." \ "If sure that sample is E.coli run without --verify parameter." 
@@ -311,5 +311,5 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args) for file in filesnotfound: sampleName = getSampleName(file) filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file)} - + return ecoli_files_dict, other_files_dict,filesnotfound_dict \ No newline at end of file diff --git a/ectyper/subprocess_util.py b/ectyper/subprocess_util.py index 7cd78a8..fb130f2 100644 --- a/ectyper/subprocess_util.py +++ b/ectyper/subprocess_util.py @@ -37,7 +37,5 @@ def run_subprocess(cmd, input_data=None, un=False, ignorereturncode=False): else: LOG.error("Error in subprocess. The following command failed: {}".format(cmd)) LOG.error("Subprocess failed with error: \"{}\"".format(comp_proc.stderr.decode("utf-8"))) - #LOG.critical("ectyper has stopped") return comp_proc - #raise Exception(f"subprocess failure while running {cmd} command")