From b23f359b0e9dda22d1da833bf604c781b05775a1 Mon Sep 17 00:00:00 2001 From: charlie Date: Fri, 23 Feb 2024 04:18:24 +0000 Subject: [PATCH 1/2] add KatharoSeq samples to SIFs. --- sequence_processing_pipeline/Pipeline.py | 11 ++++++++++- sequence_processing_pipeline/tests/test_Job.py | 11 +++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index c8881f2d..38af4073 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -573,7 +573,9 @@ def generate_sample_info_files(self, addl_info=None): df = pd.concat([df, addl_info], ignore_index=True).drop_duplicates() - df = df[df["sample_name"].str.startswith("BLANK") == True] # noqa + blanks = df[df["sample_name"].str.startswith("BLANK")] + katharo = df[df["sample_name"].str.startswith("KATHARO")] + df = pd.concat([blanks, katharo]) samples = list(df.to_records(index=False)) projects = df.project_name.unique() @@ -609,6 +611,13 @@ def generate_sample_info_files(self, addl_info=None): row['description'] = sample.replace('_', '.') row['collection_timestamp'] = self.get_date_from_run_id() + # Although KATHARO samples may contain valuable metadata + # in their columns, none of them appear to match the + # expected values for SIF columns. Hence, the new columns + # for KATHARO samples will be populated with the same + # defaults as BLANKs. 
+ # (possible exception would be well_description and + # experiment_design_description -> description) row = [row[x] for x in Pipeline.sif_header] f.write('\t'.join(row) + '\n') diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index b01e651c..ca84e7d8 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -45,10 +45,13 @@ def my_callback(jid=None, status=None): obs = job._system_call('ls ' + join(package_root, 'tests', 'bin'), callback=my_callback) - exp = {'stdout': 'bcl-convert\nbcl2fastq\nfastqc\n', - 'stderr': '', - 'return_code': 0} - self.assertDictEqual(obs, exp) + + exp = ['bcl-convert\nbcl2fastq\nfastqc\n', + 'bcl2fastq\nbcl-convert\nfastqc\n'] + + self.assertIn(obs['stdout'], exp) + self.assertEqual(obs['stderr'], '') + self.assertEqual(obs['return_code'], 0) for item in callback_results: self.assertTrue(isinstance(item[0], int)) From c5a221d1a5adc526478dfaee2a5c5d9c01ff4504 Mon Sep 17 00:00:00 2001 From: charlie Date: Fri, 23 Feb 2024 04:45:07 +0000 Subject: [PATCH 2/2] changed SIF file-naming convention from '..._blanks.tsv' to '..._wetlab.tsv', per request. 
--- sequence_processing_pipeline/Pipeline.py | 2 +- .../tests/test_Pipeline.py | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 38af4073..e56dd8c2 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -583,7 +583,7 @@ def generate_sample_info_files(self, addl_info=None): for project in projects: samples_in_proj = [x for x, y in samples if y == project] some_path = join(self.output_path, - f'{self.run_id}_{project}_blanks.tsv') + f'{self.run_id}_{project}_wetlab.tsv') paths.append(some_path) with open(some_path, 'w') as f: # write out header to disk diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 5887dfe4..2a58c6ed 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -365,11 +365,11 @@ def test_generate_sample_information_files(self): # filenames. obs = [x.split('sequence_processing_pipeline/')[1] for x in paths] exp = [(f'tests/data/output_dir/{self.good_run_id}' - '_NYU_BMS_Melanoma_13059_blanks.tsv'), + '_NYU_BMS_Melanoma_13059_wetlab.tsv'), (f'tests/data/output_dir/{self.good_run_id}' - '_Feist_11661_blanks.tsv'), + '_Feist_11661_wetlab.tsv'), (f'tests/data/output_dir/{self.good_run_id}' - '_Gerwick_6123_blanks.tsv')] + '_Gerwick_6123_wetlab.tsv')] # sort the lists to ensure both are in a fixed order. obs.sort() @@ -380,13 +380,13 @@ def test_generate_sample_information_files(self): # confirm files contain the expected number of lines. # This is going to be based on the number of samples named 'BLANK*' # in good-sample-sheet.csv. 
- exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv': + exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv': 33, - f'{self.good_run_id}_Feist_11661_blanks.tsv': 8, - f'{self.good_run_id}_Gerwick_6123_blanks.tsv': 2} + f'{self.good_run_id}_Feist_11661_wetlab.tsv': 8, + f'{self.good_run_id}_Gerwick_6123_wetlab.tsv': 2} exp_first_lines = { - f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv': + f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv': 'BLANK1.1A\t2021-10-21\t193\t' 'Control\tNegative\tSterile w' 'ater blank\turban biome\tres' @@ -397,7 +397,7 @@ def test_generate_sample_information_files(self): 'genome\t256318\tBLANK1.1A\tN' 'YU_BMS_Melanoma\tTRUE\t' 'UCSD\tFALSE', - f'{self.good_run_id}_Feist_11661_blanks.tsv': + f'{self.good_run_id}_Feist_11661_wetlab.tsv': 'BLANK.40.12G\t2021-10-21\t193\tControl' '\tNegative\tSterile water blank\turban ' 'biome\tresearch facility\tsterile water' @@ -405,7 +405,7 @@ def test_generate_sample_information_files(self): 'LANK.40.12G\t32.5\t-117.25\tcontrol bla' 'nk\tmetagenome\t256318\tBLANK.40.12G\t' 'Feist\tTRUE\tUCSD\tFALSE', - f'{self.good_run_id}_Gerwick_6123_blanks.tsv': + f'{self.good_run_id}_Gerwick_6123_wetlab.tsv': 'BLANK.41.12G\t2021-10-21\t193\tControl' '\tNegative\tSterile water blank\turban' ' biome\tresearch facility\tsterile wat' @@ -416,7 +416,7 @@ def test_generate_sample_information_files(self): } exp_last_lines = { - f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv': + f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv': 'BLANK4.4H\t2021-10-21\t193\t' 'Control\tNegative\tSterile w' 'ater blank\turban biome\tres' @@ -427,7 +427,7 @@ def test_generate_sample_information_files(self): 'genome\t256318\tBLANK4.4H\tN' 'YU_BMS_Melanoma\tTRUE\t' 'UCSD\tFALSE', - f'{self.good_run_id}_Feist_11661_blanks.tsv': + f'{self.good_run_id}_Feist_11661_wetlab.tsv': 'BLANK.43.12H\t2021-10-21\t193\tControl' '\tNegative\tSterile water blank\turban' ' biome\tresearch 
facility\tsterile wat' @@ -435,7 +435,7 @@ def test_generate_sample_information_files(self): '\tBLANK.43.12H\t32.5\t-117.25\tcontrol' ' blank\tmetagenome\t256318\tBLANK.43.1' '2H\tFeist\tTRUE\tUCSD\tFALSE', - f'{self.good_run_id}_Gerwick_6123_blanks.tsv': + f'{self.good_run_id}_Gerwick_6123_wetlab.tsv': 'BLANK.41.12G\t2021-10-21\t193\tContro' 'l\tNegative\tSterile water blank\turb' 'an biome\tresearch facility\tsterile ' @@ -1822,7 +1822,7 @@ def test_generate_sample_information_files(self): obs = [x.split('sequence_processing_pipeline/')[1] for x in paths] exp = [(f'tests/data/output_dir/{self.good_run_id}' - '_ABTX_20230208_ABTX_11052_blanks.tsv')] + '_ABTX_20230208_ABTX_11052_wetlab.tsv')] # sort the lists to ensure both are in a fixed order. obs.sort()